This commit is contained in:
expdsn 2025-02-25 19:04:30 +08:00
parent e8e249de35
commit 141bf45123
4 changed files with 134 additions and 75 deletions

View File

@ -17,6 +17,7 @@
"g": "^2.0.1", "g": "^2.0.1",
"mongodb": "^6.13.0", "mongodb": "^6.13.0",
"puppeteer": "^24.2.1", "puppeteer": "^24.2.1",
"turndown": "^7.2.0",
"uuid": "^11.0.5" "uuid": "^11.0.5"
}, },
"pnpm": { "pnpm": {
@ -26,6 +27,7 @@
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^22.13.4", "@types/node": "^22.13.4",
"@types/turndown": "^5.0.5",
"@types/uuid": "^10.0.0", "@types/uuid": "^10.0.0",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
"typescript": "^5.7.3" "typescript": "^5.7.3"

View File

@ -23,6 +23,9 @@ importers:
puppeteer: puppeteer:
specifier: ^24.2.1 specifier: ^24.2.1
version: 24.2.1(typescript@5.7.3) version: 24.2.1(typescript@5.7.3)
turndown:
specifier: ^7.2.0
version: 7.2.0
uuid: uuid:
specifier: ^11.0.5 specifier: ^11.0.5
version: 11.0.5 version: 11.0.5
@ -30,6 +33,9 @@ importers:
'@types/node': '@types/node':
specifier: ^22.13.4 specifier: ^22.13.4
version: 22.13.4 version: 22.13.4
'@types/turndown':
specifier: ^5.0.5
version: 5.0.5
'@types/uuid': '@types/uuid':
specifier: ^10.0.0 specifier: ^10.0.0
version: 10.0.0 version: 10.0.0
@ -64,6 +70,9 @@ packages:
'@jridgewell/trace-mapping@0.3.9': '@jridgewell/trace-mapping@0.3.9':
resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==}
'@mixmark-io/domino@2.2.0':
resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==}
'@mongodb-js/saslprep@1.2.0': '@mongodb-js/saslprep@1.2.0':
resolution: {integrity: sha512-+ywrb0AqkfaYuhHs6LxKWgqbh3I72EpEgESCw37o+9qPx9WTCkgDm2B+eMrwehGtHBWHFU4GXvnSCNiFhhausg==} resolution: {integrity: sha512-+ywrb0AqkfaYuhHs6LxKWgqbh3I72EpEgESCw37o+9qPx9WTCkgDm2B+eMrwehGtHBWHFU4GXvnSCNiFhhausg==}
@ -90,6 +99,9 @@ packages:
'@types/node@22.13.4': '@types/node@22.13.4':
resolution: {integrity: sha512-ywP2X0DYtX3y08eFVx5fNIw7/uIv8hYUKgXoK8oayJlLnKcRfEYCxWMVE1XagUdVtCJlZT1AU4LXEABW+L1Peg==} resolution: {integrity: sha512-ywP2X0DYtX3y08eFVx5fNIw7/uIv8hYUKgXoK8oayJlLnKcRfEYCxWMVE1XagUdVtCJlZT1AU4LXEABW+L1Peg==}
'@types/turndown@5.0.5':
resolution: {integrity: sha512-TL2IgGgc7B5j78rIccBtlYAnkuv8nUQqhQc+DSYV5j9Be9XOcm/SKOVRuA47xAVI3680Tk9B1d8flK2GWT2+4w==}
'@types/uuid@10.0.0': '@types/uuid@10.0.0':
resolution: {integrity: sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==} resolution: {integrity: sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==}
@ -655,6 +667,9 @@ packages:
tslib@2.8.1: tslib@2.8.1:
resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==}
turndown@7.2.0:
resolution: {integrity: sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==}
typed-query-selector@2.12.0: typed-query-selector@2.12.0:
resolution: {integrity: sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==} resolution: {integrity: sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==}
@ -757,6 +772,8 @@ snapshots:
'@jridgewell/resolve-uri': 3.1.2 '@jridgewell/resolve-uri': 3.1.2
'@jridgewell/sourcemap-codec': 1.5.0 '@jridgewell/sourcemap-codec': 1.5.0
'@mixmark-io/domino@2.2.0': {}
'@mongodb-js/saslprep@1.2.0': '@mongodb-js/saslprep@1.2.0':
dependencies: dependencies:
sparse-bitfield: 3.0.3 sparse-bitfield: 3.0.3
@ -788,6 +805,8 @@ snapshots:
dependencies: dependencies:
undici-types: 6.20.0 undici-types: 6.20.0
'@types/turndown@5.0.5': {}
'@types/uuid@10.0.0': {} '@types/uuid@10.0.0': {}
'@types/webidl-conversions@7.0.3': {} '@types/webidl-conversions@7.0.3': {}
@ -1392,6 +1411,10 @@ snapshots:
tslib@2.8.1: {} tslib@2.8.1: {}
turndown@7.2.0:
dependencies:
'@mixmark-io/domino': 2.2.0
typed-query-selector@2.12.0: {} typed-query-selector@2.12.0: {}
typescript@5.7.3: {} typescript@5.7.3: {}

View File

@ -12,78 +12,78 @@ const fetchList = [
typeId: '6790aae23de33b392c0330b2', typeId: '6790aae23de33b392c0330b2',
url: 'https://ai-bot.cn/favorites/ai-writing-tools/' url: 'https://ai-bot.cn/favorites/ai-writing-tools/'
}, },
{ // {
name: 'AI图像网站', // name: 'AI图像网站',
typeId: '67908fc33de33b392c0330af', // typeId: '67908fc33de33b392c0330af',
url: 'https://ai-bot.cn/favorites/best-ai-image-tools/' // url: 'https://ai-bot.cn/favorites/best-ai-image-tools/'
}, // },
{ // {
name: 'AI视频网站', // name: 'AI视频网站',
typeId: '67b6f0b7b139d1d6aa14cd06', // typeId: '67b6f0b7b139d1d6aa14cd06',
url: 'https://ai-bot.cn/favorites/ai-video-tools/' // url: 'https://ai-bot.cn/favorites/ai-video-tools/'
}, // },
{ // {
name: 'AI音频网站', // name: 'AI音频网站',
typeId: '6791a98fc058e55ed0a094ca', // typeId: '6791a98fc058e55ed0a094ca',
url: 'https://ai-bot.cn/favorites/ai-audio-tools/' // url: 'https://ai-bot.cn/favorites/ai-audio-tools/'
}, // },
{ // {
name: 'AI办公网站', // name: 'AI办公网站',
typeId: '6790ab4f3de33b392c0330b3', // typeId: '6790ab4f3de33b392c0330b3',
url: 'https://ai-bot.cn/favorites/ai-office-tools/', // url: 'https://ai-bot.cn/favorites/ai-office-tools/',
hasSubType: true // hasSubType: true
}, // },
{ // {
name: 'AI搜索工具', // name: 'AI搜索工具',
typeId: '6790dc6b3de33b392c0330bb', // typeId: '6790dc6b3de33b392c0330bb',
url: 'https://ai-bot.cn/favorites/ai-search-engines/' // url: 'https://ai-bot.cn/favorites/ai-search-engines/'
}, // },
{ // {
name: 'AI对话网站', // name: 'AI对话网站',
typeId: '6790c2f93de33b392c0330b6', // typeId: '6790c2f93de33b392c0330b6',
url: 'https://ai-bot.cn/favorites/ai-chatbots/' // url: 'https://ai-bot.cn/favorites/ai-chatbots/'
}, // },
{ // {
name: 'AI内容检测', // name: 'AI内容检测',
typeId: '67b707c9b139d1d6aa14cd07', // typeId: '67b707c9b139d1d6aa14cd07',
url: 'https://ai-bot.cn/favorites/ai-content-detection-tools/' // url: 'https://ai-bot.cn/favorites/ai-content-detection-tools/'
}, // },
{ // {
name: 'AI学习工具', // name: 'AI学习工具',
typeId: '67b7080fb139d1d6aa14cd08', // typeId: '67b7080fb139d1d6aa14cd08',
url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/' // url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/'
}, // },
{ // {
name: 'AI开发平台', // name: 'AI开发平台',
typeId: '67b7eb3de0cf2993700b1186', // typeId: '67b7eb3de0cf2993700b1186',
url: 'https://ai-bot.cn/favorites/ai-frameworks/' // url: 'https://ai-bot.cn/favorites/ai-frameworks/'
}, // },
{ // {
name: 'AI提示工具', // name: 'AI提示工具',
typeId: '67b7e9bce0cf2993700b1184', // typeId: '67b7e9bce0cf2993700b1184',
url: 'https://ai-bot.cn/favorites/ai-prompt-tools/' // url: 'https://ai-bot.cn/favorites/ai-prompt-tools/'
}, // },
{ // {
name: 'AI法律助手', // name: 'AI法律助手',
typeId: '67b7eae0e0cf2993700b1185', // typeId: '67b7eae0e0cf2993700b1185',
url: 'https://ai-bot.cn/favorites/ai-legal-assistants/' // url: 'https://ai-bot.cn/favorites/ai-legal-assistants/'
}, // },
{ // {
name: 'AI训练模型', // name: 'AI训练模型',
typeId: '67b7eb84e0cf2993700b1187', // typeId: '67b7eb84e0cf2993700b1187',
url: 'https://ai-bot.cn/favorites/ai-models/' // url: 'https://ai-bot.cn/favorites/ai-models/'
}, // },
{ // {
name: 'AI设计工具', // name: 'AI设计工具',
typeId: '6790ab9d3de33b392c0330b4', // typeId: '6790ab9d3de33b392c0330b4',
url: 'https://ai-bot.cn/favorites/ai-design-tools/' // url: 'https://ai-bot.cn/favorites/ai-design-tools/'
}, // },
{ // {
name: 'AI编程工具', // name: 'AI编程工具',
typeId: '6790dc2c3de33b392c0330ba', // typeId: '6790dc2c3de33b392c0330ba',
url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/' // url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/'
} // }
] ]
function main() { function main() {

View File

@ -2,6 +2,7 @@ import axios from 'axios';
const cheerio = require('cheerio') const cheerio = require('cheerio')
import { downloadImage } from "../share/tools" import { downloadImage } from "../share/tools"
import { getCollection } from '../lib/mongodb'; import { getCollection } from '../lib/mongodb';
import Turndown from 'turndown';
import { FetchType } from '..'; import { FetchType } from '..';
// 要抓取的网页 URL // 要抓取的网页 URL
@ -11,8 +12,32 @@ async function getPageData(url: string, name: string) {
const { data } = await axios.get(url); const { data } = await axios.get(url);
const $ = cheerio.load(data); const $ = cheerio.load(data);
const element = $(`a[title="${name}"]`) const element = $(`a[title="${name}"]`)
return element.attr('href') const href = element.attr('href') as string
const panelBodyHtml = $('.panel-body').html();
// 2. 使用Turndown将HTML转换为Markdown
const turndown = new Turndown({
codeBlockStyle: 'fenced', // 代码块用```包裹
headingStyle: 'atx' // 标题用#符号
});
// 添加自定义规则(可选)
turndown.addRule('preCodeBlock', {
filter: ['pre'],
replacement: (content) => {
// 保留pre标签内的原始格式如代码块
return '\n```\n' + content + '\n```\n';
}
});
// 执行转换
const markdown = turndown.turndown(panelBodyHtml);
const title = $('.site-name').text().trim()
return {
href,
markdown,
title,
}
} catch (error) { } catch (error) {
console.error('Error fetching data:', error); console.error('Error fetching data:', error);
@ -47,11 +72,20 @@ export async function fetchData({ typeId, name: typeName, url, hasSubType = fals
const _originLink = $(element).find('img').attr('data-src'); const _originLink = $(element).find('img').attr('data-src');
let link = tempLink let link = tempLink
const articleData = {} as any
console.log(subTitle); console.log(subTitle);
if (tempLink.startsWith('https://ai-bot')) { if (tempLink.startsWith('https://ai-bot')) {
link = await getPageData(tempLink, name) || '' const pageData = await getPageData(tempLink, name)
if (pageData) {
link = pageData.href
articleData.markdown = pageData.markdown
articleData.title = pageData.title
}
console.log(pageData);
} }
// 假设工具的类别是固定的,比如 "AI写作工具" // 假设工具的类别是固定的,比如 "AI写作工具"
@ -87,7 +121,7 @@ export async function fetchData({ typeId, name: typeName, url, hasSubType = fals
} }
i++ i++
console.clear() // console.clear()
console.log(`正在爬取${typeName + ',' + subTitle || ''}类别的数据,共${length}条数据`); console.log(`正在爬取${typeName + ',' + subTitle || ''}类别的数据,共${length}条数据`);
console.log(`${typeName}:进度:${i}/${length}`); console.log(`${typeName}:进度:${i}/${length}`);