This commit is contained in:
expdsn 2025-02-25 19:04:30 +08:00
parent e8e249de35
commit 141bf45123
4 changed files with 134 additions and 75 deletions

View File

@ -17,6 +17,7 @@
"g": "^2.0.1",
"mongodb": "^6.13.0",
"puppeteer": "^24.2.1",
"turndown": "^7.2.0",
"uuid": "^11.0.5"
},
"pnpm": {
@ -26,6 +27,7 @@
},
"devDependencies": {
"@types/node": "^22.13.4",
"@types/turndown": "^5.0.5",
"@types/uuid": "^10.0.0",
"ts-node": "^10.9.2",
"typescript": "^5.7.3"

View File

@ -23,6 +23,9 @@ importers:
puppeteer:
specifier: ^24.2.1
version: 24.2.1(typescript@5.7.3)
turndown:
specifier: ^7.2.0
version: 7.2.0
uuid:
specifier: ^11.0.5
version: 11.0.5
@ -30,6 +33,9 @@ importers:
'@types/node':
specifier: ^22.13.4
version: 22.13.4
'@types/turndown':
specifier: ^5.0.5
version: 5.0.5
'@types/uuid':
specifier: ^10.0.0
version: 10.0.0
@ -64,6 +70,9 @@ packages:
'@jridgewell/trace-mapping@0.3.9':
resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==}
'@mixmark-io/domino@2.2.0':
resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==}
'@mongodb-js/saslprep@1.2.0':
resolution: {integrity: sha512-+ywrb0AqkfaYuhHs6LxKWgqbh3I72EpEgESCw37o+9qPx9WTCkgDm2B+eMrwehGtHBWHFU4GXvnSCNiFhhausg==}
@ -90,6 +99,9 @@ packages:
'@types/node@22.13.4':
resolution: {integrity: sha512-ywP2X0DYtX3y08eFVx5fNIw7/uIv8hYUKgXoK8oayJlLnKcRfEYCxWMVE1XagUdVtCJlZT1AU4LXEABW+L1Peg==}
'@types/turndown@5.0.5':
resolution: {integrity: sha512-TL2IgGgc7B5j78rIccBtlYAnkuv8nUQqhQc+DSYV5j9Be9XOcm/SKOVRuA47xAVI3680Tk9B1d8flK2GWT2+4w==}
'@types/uuid@10.0.0':
resolution: {integrity: sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==}
@ -655,6 +667,9 @@ packages:
tslib@2.8.1:
resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==}
turndown@7.2.0:
resolution: {integrity: sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==}
typed-query-selector@2.12.0:
resolution: {integrity: sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==}
@ -757,6 +772,8 @@ snapshots:
'@jridgewell/resolve-uri': 3.1.2
'@jridgewell/sourcemap-codec': 1.5.0
'@mixmark-io/domino@2.2.0': {}
'@mongodb-js/saslprep@1.2.0':
dependencies:
sparse-bitfield: 3.0.3
@ -788,6 +805,8 @@ snapshots:
dependencies:
undici-types: 6.20.0
'@types/turndown@5.0.5': {}
'@types/uuid@10.0.0': {}
'@types/webidl-conversions@7.0.3': {}
@ -1392,6 +1411,10 @@ snapshots:
tslib@2.8.1: {}
turndown@7.2.0:
dependencies:
'@mixmark-io/domino': 2.2.0
typed-query-selector@2.12.0: {}
typescript@5.7.3: {}

View File

@ -12,78 +12,78 @@ const fetchList = [
typeId: '6790aae23de33b392c0330b2',
url: 'https://ai-bot.cn/favorites/ai-writing-tools/'
},
{
name: 'AI图像网站',
typeId: '67908fc33de33b392c0330af',
url: 'https://ai-bot.cn/favorites/best-ai-image-tools/'
},
{
name: 'AI视频网站',
typeId: '67b6f0b7b139d1d6aa14cd06',
url: 'https://ai-bot.cn/favorites/ai-video-tools/'
},
{
name: 'AI音频网站',
typeId: '6791a98fc058e55ed0a094ca',
url: 'https://ai-bot.cn/favorites/ai-audio-tools/'
},
{
name: 'AI办公网站',
typeId: '6790ab4f3de33b392c0330b3',
url: 'https://ai-bot.cn/favorites/ai-office-tools/',
hasSubType: true
},
{
name: 'AI搜索工具',
typeId: '6790dc6b3de33b392c0330bb',
url: 'https://ai-bot.cn/favorites/ai-search-engines/'
},
{
name: 'AI对话网站',
typeId: '6790c2f93de33b392c0330b6',
url: 'https://ai-bot.cn/favorites/ai-chatbots/'
},
{
name: 'AI内容检测',
typeId: '67b707c9b139d1d6aa14cd07',
url: 'https://ai-bot.cn/favorites/ai-content-detection-tools/'
},
{
name: 'AI学习工具',
typeId: '67b7080fb139d1d6aa14cd08',
url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/'
},
{
name: 'AI开发平台',
typeId: '67b7eb3de0cf2993700b1186',
url: 'https://ai-bot.cn/favorites/ai-frameworks/'
},
{
name: 'AI提示工具',
typeId: '67b7e9bce0cf2993700b1184',
url: 'https://ai-bot.cn/favorites/ai-prompt-tools/'
},
{
name: 'AI法律助手',
typeId: '67b7eae0e0cf2993700b1185',
url: 'https://ai-bot.cn/favorites/ai-legal-assistants/'
},
{
name: 'AI训练模型',
typeId: '67b7eb84e0cf2993700b1187',
url: 'https://ai-bot.cn/favorites/ai-models/'
},
{
name: 'AI设计工具',
typeId: '6790ab9d3de33b392c0330b4',
url: 'https://ai-bot.cn/favorites/ai-design-tools/'
},
// {
// name: 'AI图像网站',
// typeId: '67908fc33de33b392c0330af',
// url: 'https://ai-bot.cn/favorites/best-ai-image-tools/'
// },
// {
// name: 'AI视频网站',
// typeId: '67b6f0b7b139d1d6aa14cd06',
// url: 'https://ai-bot.cn/favorites/ai-video-tools/'
// },
// {
// name: 'AI音频网站',
// typeId: '6791a98fc058e55ed0a094ca',
// url: 'https://ai-bot.cn/favorites/ai-audio-tools/'
// },
// {
// name: 'AI办公网站',
// typeId: '6790ab4f3de33b392c0330b3',
// url: 'https://ai-bot.cn/favorites/ai-office-tools/',
// hasSubType: true
// },
// {
// name: 'AI搜索工具',
// typeId: '6790dc6b3de33b392c0330bb',
// url: 'https://ai-bot.cn/favorites/ai-search-engines/'
// },
// {
// name: 'AI对话网站',
// typeId: '6790c2f93de33b392c0330b6',
// url: 'https://ai-bot.cn/favorites/ai-chatbots/'
// },
// {
// name: 'AI内容检测',
// typeId: '67b707c9b139d1d6aa14cd07',
// url: 'https://ai-bot.cn/favorites/ai-content-detection-tools/'
// },
// {
// name: 'AI学习工具',
// typeId: '67b7080fb139d1d6aa14cd08',
// url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/'
// },
// {
// name: 'AI开发平台',
// typeId: '67b7eb3de0cf2993700b1186',
// url: 'https://ai-bot.cn/favorites/ai-frameworks/'
// },
// {
// name: 'AI提示工具',
// typeId: '67b7e9bce0cf2993700b1184',
// url: 'https://ai-bot.cn/favorites/ai-prompt-tools/'
// },
// {
// name: 'AI法律助手',
// typeId: '67b7eae0e0cf2993700b1185',
// url: 'https://ai-bot.cn/favorites/ai-legal-assistants/'
// },
// {
// name: 'AI训练模型',
// typeId: '67b7eb84e0cf2993700b1187',
// url: 'https://ai-bot.cn/favorites/ai-models/'
// },
// {
// name: 'AI设计工具',
// typeId: '6790ab9d3de33b392c0330b4',
// url: 'https://ai-bot.cn/favorites/ai-design-tools/'
// },
{
name: 'AI编程工具',
typeId: '6790dc2c3de33b392c0330ba',
url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/'
}
// {
// name: 'AI编程工具',
// typeId: '6790dc2c3de33b392c0330ba',
// url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/'
// }
]
function main() {

View File

@ -2,6 +2,7 @@ import axios from 'axios';
const cheerio = require('cheerio')
import { downloadImage } from "../share/tools"
import { getCollection } from '../lib/mongodb';
import Turndown from 'turndown';
import { FetchType } from '..';
// URL of the web page to scrape
@ -11,8 +12,32 @@ async function getPageData(url: string, name: string) {
const { data } = await axios.get(url);
const $ = cheerio.load(data);
const element = $(`a[title="${name}"]`)
return element.attr('href')
const href = element.attr('href') as string
const panelBodyHtml = $('.panel-body').html();
// 2. 使用Turndown将HTML转换为Markdown
const turndown = new Turndown({
codeBlockStyle: 'fenced', // 代码块用```包裹
headingStyle: 'atx' // 标题用#符号
});
// 添加自定义规则(可选)
turndown.addRule('preCodeBlock', {
filter: ['pre'],
replacement: (content) => {
// 保留pre标签内的原始格式如代码块
return '\n```\n' + content + '\n```\n';
}
});
// 执行转换
const markdown = turndown.turndown(panelBodyHtml);
const title = $('.site-name').text().trim()
return {
href,
markdown,
title,
}
} catch (error) {
console.error('Error fetching data:', error);
@ -47,11 +72,20 @@ export async function fetchData({ typeId, name: typeName, url, hasSubType = fals
const _originLink = $(element).find('img').attr('data-src');
let link = tempLink
const articleData = {} as any
console.log(subTitle);
if (tempLink.startsWith('https://ai-bot')) {
link = await getPageData(tempLink, name) || ''
const pageData = await getPageData(tempLink, name)
if (pageData) {
link = pageData.href
articleData.markdown = pageData.markdown
articleData.title = pageData.title
}
console.log(pageData);
}
// 假设工具的类别是固定的,比如 "AI写作工具"
@ -87,7 +121,7 @@ export async function fetchData({ typeId, name: typeName, url, hasSubType = fals
}
i++
console.clear()
// console.clear()
console.log(`正在爬取${typeName + ',' + subTitle || ''}类别的数据,共${length}条数据`);
console.log(`${typeName}:进度:${i}/${length}`);