From 141bf451237d4d19ae799ec77a719d8693edc080 Mon Sep 17 00:00:00 2001 From: expdsn <18111002318@163.com> Date: Tue, 25 Feb 2025 19:04:30 +0800 Subject: [PATCH] save --- package.json | 2 + pnpm-lock.yaml | 23 ++++++++ src/index.ts | 142 +++++++++++++++++++++++----------------------- src/link/index.ts | 42 ++++++++++++-- 4 files changed, 134 insertions(+), 75 deletions(-) diff --git a/package.json b/package.json index cbb9c75..4e1f5c7 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ "g": "^2.0.1", "mongodb": "^6.13.0", "puppeteer": "^24.2.1", + "turndown": "^7.2.0", "uuid": "^11.0.5" }, "pnpm": { @@ -26,6 +27,7 @@ }, "devDependencies": { "@types/node": "^22.13.4", + "@types/turndown": "^5.0.5", "@types/uuid": "^10.0.0", "ts-node": "^10.9.2", "typescript": "^5.7.3" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5c994c5..6315ed5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -23,6 +23,9 @@ importers: puppeteer: specifier: ^24.2.1 version: 24.2.1(typescript@5.7.3) + turndown: + specifier: ^7.2.0 + version: 7.2.0 uuid: specifier: ^11.0.5 version: 11.0.5 @@ -30,6 +33,9 @@ importers: '@types/node': specifier: ^22.13.4 version: 22.13.4 + '@types/turndown': + specifier: ^5.0.5 + version: 5.0.5 '@types/uuid': specifier: ^10.0.0 version: 10.0.0 @@ -64,6 +70,9 @@ packages: '@jridgewell/trace-mapping@0.3.9': resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} + '@mixmark-io/domino@2.2.0': + resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} + '@mongodb-js/saslprep@1.2.0': resolution: {integrity: sha512-+ywrb0AqkfaYuhHs6LxKWgqbh3I72EpEgESCw37o+9qPx9WTCkgDm2B+eMrwehGtHBWHFU4GXvnSCNiFhhausg==} @@ -90,6 +99,9 @@ packages: '@types/node@22.13.4': resolution: {integrity: sha512-ywP2X0DYtX3y08eFVx5fNIw7/uIv8hYUKgXoK8oayJlLnKcRfEYCxWMVE1XagUdVtCJlZT1AU4LXEABW+L1Peg==} + '@types/turndown@5.0.5': + resolution: {integrity: sha512-TL2IgGgc7B5j78rIccBtlYAnkuv8nUQqhQc+DSYV5j9Be9XOcm/SKOVRuA47xAVI3680Tk9B1d8flK2GWT2+4w==} + '@types/uuid@10.0.0': resolution: {integrity: sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==} @@ -655,6 +667,9 @@ packages: tslib@2.8.1: resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} + turndown@7.2.0: + resolution: {integrity: sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==} + typed-query-selector@2.12.0: resolution: {integrity: sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==} @@ -757,6 +772,8 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.0 + '@mixmark-io/domino@2.2.0': {} + '@mongodb-js/saslprep@1.2.0': dependencies: sparse-bitfield: 3.0.3 @@ -788,6 +805,8 @@ snapshots: dependencies: undici-types: 6.20.0 + '@types/turndown@5.0.5': {} + '@types/uuid@10.0.0': {} '@types/webidl-conversions@7.0.3': {} @@ -1392,6 +1411,10 @@ snapshots: tslib@2.8.1: {} + turndown@7.2.0: + dependencies: + '@mixmark-io/domino': 2.2.0 + typed-query-selector@2.12.0: {} typescript@5.7.3: {} diff --git a/src/index.ts b/src/index.ts index a249e6f..454939e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -12,78 +12,78 @@ const fetchList = [ typeId: '6790aae23de33b392c0330b2', url: 'https://ai-bot.cn/favorites/ai-writing-tools/' }, - { - name: 'AI图像网站', - typeId: '67908fc33de33b392c0330af', - url: 'https://ai-bot.cn/favorites/best-ai-image-tools/' - }, - { - name: 'AI视频网站', - typeId: '67b6f0b7b139d1d6aa14cd06', - url: 'https://ai-bot.cn/favorites/ai-video-tools/' - }, - { - name: 'AI音频网站', - typeId: '6791a98fc058e55ed0a094ca', - url: 'https://ai-bot.cn/favorites/ai-audio-tools/' - }, - { - name: 'AI办公网站', - typeId: '6790ab4f3de33b392c0330b3', - url: 'https://ai-bot.cn/favorites/ai-office-tools/', - hasSubType: true - }, - { - name: 'AI搜索工具', - typeId: '6790dc6b3de33b392c0330bb', - url: 'https://ai-bot.cn/favorites/ai-search-engines/' - }, - { - name: 'AI对话网站', - typeId: '6790c2f93de33b392c0330b6', - url: 'https://ai-bot.cn/favorites/ai-chatbots/' - }, - { - name: 'AI内容检测', - typeId: '67b707c9b139d1d6aa14cd07', - url: 'https://ai-bot.cn/favorites/ai-content-detection-tools/' - }, - { - name: 'AI学习工具', - typeId: '67b7080fb139d1d6aa14cd08', - url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/' - }, - { - name: 'AI开发平台', - typeId: '67b7eb3de0cf2993700b1186', - url: 'https://ai-bot.cn/favorites/ai-frameworks/' - }, - { - name: 'AI提示工具', - typeId: '67b7e9bce0cf2993700b1184', - url: 'https://ai-bot.cn/favorites/ai-prompt-tools/' - }, - { - name: 'AI法律助手', - typeId: '67b7eae0e0cf2993700b1185', - url: 'https://ai-bot.cn/favorites/ai-legal-assistants/' - }, - { - name: 'AI训练模型', - typeId: '67b7eb84e0cf2993700b1187', - url: 'https://ai-bot.cn/favorites/ai-models/' - }, - { - name: 'AI设计工具', - typeId: '6790ab9d3de33b392c0330b4', - url: 'https://ai-bot.cn/favorites/ai-design-tools/' - }, + // { + // name: 'AI图像网站', + // typeId: '67908fc33de33b392c0330af', + // url: 'https://ai-bot.cn/favorites/best-ai-image-tools/' + // }, + // { + // name: 'AI视频网站', + // typeId: '67b6f0b7b139d1d6aa14cd06', + // url: 'https://ai-bot.cn/favorites/ai-video-tools/' + // }, + // { + // name: 'AI音频网站', + // typeId: '6791a98fc058e55ed0a094ca', + // url: 'https://ai-bot.cn/favorites/ai-audio-tools/' + // }, + // { + // name: 'AI办公网站', + // typeId: '6790ab4f3de33b392c0330b3', + // url: 'https://ai-bot.cn/favorites/ai-office-tools/', + // hasSubType: true + // }, + // { + // name: 'AI搜索工具', + // typeId: '6790dc6b3de33b392c0330bb', + // url: 'https://ai-bot.cn/favorites/ai-search-engines/' + // }, + // { + // name: 'AI对话网站', + // typeId: '6790c2f93de33b392c0330b6', + // url: 'https://ai-bot.cn/favorites/ai-chatbots/' + // }, + // { + // name: 'AI内容检测', + // typeId: '67b707c9b139d1d6aa14cd07', + // url: 'https://ai-bot.cn/favorites/ai-content-detection-tools/' + // }, + // { + // name: 'AI学习工具', + // typeId: '67b7080fb139d1d6aa14cd08', + // url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/' + // }, + // { + // name: 'AI开发平台', + // typeId: '67b7eb3de0cf2993700b1186', + // url: 'https://ai-bot.cn/favorites/ai-frameworks/' + // }, + // { + // name: 'AI提示工具', + // typeId: '67b7e9bce0cf2993700b1184', + // url: 'https://ai-bot.cn/favorites/ai-prompt-tools/' + // }, + // { + // name: 'AI法律助手', + // typeId: '67b7eae0e0cf2993700b1185', + // url: 'https://ai-bot.cn/favorites/ai-legal-assistants/' + // }, + // { + // name: 'AI训练模型', + // typeId: '67b7eb84e0cf2993700b1187', + // url: 'https://ai-bot.cn/favorites/ai-models/' + // }, + // { + // name: 'AI设计工具', + // typeId: '6790ab9d3de33b392c0330b4', + // url: 'https://ai-bot.cn/favorites/ai-design-tools/' + // }, - { - name: 'AI编程工具', - typeId: '6790dc2c3de33b392c0330ba', - url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/' - } + // { + // name: 'AI编程工具', + // typeId: '6790dc2c3de33b392c0330ba', + // url: 'https://ai-bot.cn/favorites/websites-to-learn-ai/' + // } ] function main() { diff --git a/src/link/index.ts b/src/link/index.ts index 169c545..43a2ecb 100644 --- a/src/link/index.ts +++ b/src/link/index.ts @@ -2,6 +2,7 @@ import axios from 'axios'; const cheerio = require('cheerio') import { downloadImage } from "../share/tools" import { getCollection } from '../lib/mongodb'; +import Turndown from 'turndown'; import { FetchType } from '..'; // 要抓取的网页 URL @@ -11,8 +12,32 @@ async function getPageData(url: string, name: string) { const { data } = await axios.get(url); const $ = cheerio.load(data); const element = $(`a[title="${name}"]`) - return element.attr('href') + const href = element.attr('href') as string + const panelBodyHtml = $('.panel-body').html(); + // 2. 使用Turndown将HTML转换为Markdown + const turndown = new Turndown({ + codeBlockStyle: 'fenced', // 代码块用```包裹 + headingStyle: 'atx' // 标题用#符号 + }); + + // 添加自定义规则(可选) + turndown.addRule('preCodeBlock', { + filter: ['pre'], + replacement: (content) => { + // 保留pre标签内的原始格式(如代码块) + return '\n```\n' + content + '\n```\n'; + } + }); + // 执行转换 + const markdown = turndown.turndown(panelBodyHtml); + const title = $('.site-name').text().trim() + return { + href, + markdown, + title, + + } } catch (error) { console.error('Error fetching data:', error); @@ -47,11 +72,20 @@ export async function fetchData({ typeId, name: typeName, url, hasSubType = fals const _originLink = $(element).find('img').attr('data-src'); let link = tempLink - + const articleData = {} as any console.log(subTitle); if (tempLink.startsWith('https://ai-bot')) { - link = await getPageData(tempLink, name) || '' + const pageData = await getPageData(tempLink, name) + if (pageData) { + link = pageData.href + articleData.markdown = pageData.markdown + articleData.title = pageData.title + + + } + console.log(pageData); + } // 假设工具的类别是固定的,比如 "AI写作工具" @@ -87,7 +121,7 @@ export async function fetchData({ typeId, name: typeName, url, hasSubType = fals } i++ - console.clear() + // console.clear() console.log(`正在爬取${typeName + ',' + subTitle || ''}类别的数据,共${length}条数据`); console.log(`${typeName}:进度:${i}/${length}`);