diff --git a/.env.local b/.env.local new file mode 100644 index 0000000..2d8d9ac --- /dev/null +++ b/.env.local @@ -0,0 +1,5 @@ +MONGODB_URI=mongodb://expdsn:58662@expdsn.cloud:27017 +# MONGODB_URI=mongodb://expdsn:58662@localhost:27017 +SESSION_SECRET=lREDRcaFwZIzM7Rjw63XGj8trTyMqhVUsVwwhuTQnFs= +ALIYUN_RAM_ACCESS_KEY_ID=LTAI5tNzopZHJFa2Q9vqr1u5 +ALIYUN_RAM_ACCESS_KEY_SECRET=qPu7fyft0KJ1l6SGqbS71IW0vDbRlr \ No newline at end of file diff --git a/package.json b/package.json index 39e7635..f4d5683 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,7 @@ "axios": "^1.7.9", "cheerio": "^1.0.0", "g": "^2.0.1", + "mongodb": "^6.13.0", "puppeteer": "^24.2.1", "uuid": "^11.0.5" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5b60493..5c994c5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: g: specifier: ^2.0.1 version: 2.0.1 + mongodb: + specifier: ^6.13.0 + version: 6.13.0(socks@2.8.4) puppeteer: specifier: ^24.2.1 version: 24.2.1(typescript@5.7.3) @@ -61,6 +64,9 @@ packages: '@jridgewell/trace-mapping@0.3.9': resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} + '@mongodb-js/saslprep@1.2.0': + resolution: {integrity: sha512-+ywrb0AqkfaYuhHs6LxKWgqbh3I72EpEgESCw37o+9qPx9WTCkgDm2B+eMrwehGtHBWHFU4GXvnSCNiFhhausg==} + '@puppeteer/browsers@2.7.1': resolution: {integrity: sha512-MK7rtm8JjaxPN7Mf1JdZIZKPD2Z+W7osvrC1vjpvfOX1K0awDIHYbNi89f7eotp7eMUn2shWnt03HwVbriXtKQ==} engines: {node: '>=18'} @@ -87,6 +93,12 @@ packages: '@types/uuid@10.0.0': resolution: {integrity: sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==} + '@types/webidl-conversions@7.0.3': + resolution: {integrity: sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==} + + '@types/whatwg-url@11.0.5': + resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==} + '@types/yauzl@2.10.3': resolution: {integrity: sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==} @@ -162,6 +174,10 @@ packages: boolbase@1.0.0: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} + bson@6.10.3: + resolution: {integrity: sha512-MTxGsqgYTwfshYWTRdmZRC+M7FnG1b4y7RO7p2k3X24Wq0yv1m77Wsj0BzlPzd/IowgESfsruQCUToa7vbOpPQ==} + engines: {node: '>=16.20.1'} + buffer-crc32@0.2.13: resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==} @@ -445,6 +461,9 @@ packages: resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} engines: {node: '>= 0.4'} + memory-pager@1.5.0: + resolution: {integrity: sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==} + mime-db@1.52.0: resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==} engines: {node: '>= 0.6'} @@ -456,6 +475,36 @@ packages: mitt@3.0.1: resolution: {integrity: sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==} + mongodb-connection-string-url@3.0.2: + resolution: {integrity: sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA==} + + mongodb@6.13.0: + resolution: {integrity: sha512-KeESYR5TEaFxOuwRqkOm3XOsMqCSkdeDMjaW5u2nuKfX7rqaofp7JQGoi7sVqQcNJTKuveNbzZtWMstb8ABP6Q==} + engines: {node: '>=16.20.1'} + peerDependencies: + '@aws-sdk/credential-providers': ^3.188.0 + '@mongodb-js/zstd': ^1.1.0 || ^2.0.0 + gcp-metadata: ^5.2.0 + kerberos: ^2.0.1 + mongodb-client-encryption: '>=6.0.0 <7' + snappy: ^7.2.2 + socks: ^2.7.1 + peerDependenciesMeta: + '@aws-sdk/credential-providers': + optional: true + '@mongodb-js/zstd': + optional: true + gcp-metadata: + optional: true + kerberos: + optional: true + mongodb-client-encryption: + optional: true + snappy: + optional: true + socks: + optional: true + ms@2.1.3: resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} @@ -514,6 +563,10 @@ packages: pump@3.0.2: resolution: {integrity: sha512-tUPXtzlGM8FE3P0ZL6DVs/3P58k9nk8/jZeQCurTJylQA8qFYzHFfhBJkuqyE0FifOsQ0uKWekiZ5g8wtr28cw==} + punycode@2.3.1: + resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} + engines: {node: '>=6'} + puppeteer-core@24.2.1: resolution: {integrity: sha512-bCypUh3WXzETafv1TCFAjIUnI8BiQ/d+XvEfEXDLcIMm9CAvROqnBmbt79yBjwasoDZsgfXnUmIJU7Y27AalVQ==} engines: {node: '>=18'} @@ -555,6 +608,9 @@ packages: resolution: {integrity: sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==} engines: {node: '>=0.10.0'} + sparse-bitfield@3.0.3: + resolution: {integrity: sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==} + sprintf-js@1.1.3: resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==} @@ -578,6 +634,10 @@ packages: text-decoder@1.2.3: resolution: {integrity: sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==} + tr46@5.0.0: + resolution: {integrity: sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==} + engines: {node: '>=18'} + ts-node@10.9.2: resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==} hasBin: true @@ -617,6 +677,10 @@ packages: v8-compile-cache-lib@3.0.1: resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==} + webidl-conversions@7.0.0: + resolution: {integrity: sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==} + engines: {node: '>=12'} + whatwg-encoding@3.1.1: resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} engines: {node: '>=18'} @@ -625,6 +689,10 @@ packages: resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==} engines: {node: '>=18'} + whatwg-url@14.1.1: + resolution: {integrity: sha512-mDGf9diDad/giZ/Sm9Xi2YcyzaFpbdLpJPr+E9fSkyQ7KpQD4SdFcugkRQYzhmfI4KeV4Qpnn2sKPdo+kmsgRQ==} + engines: {node: '>=18'} + wrap-ansi@7.0.0: resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==} engines: {node: '>=10'} @@ -689,6 +757,10 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.0 + '@mongodb-js/saslprep@1.2.0': + dependencies: + sparse-bitfield: 3.0.3 + '@puppeteer/browsers@2.7.1': dependencies: debug: 4.4.0 @@ -718,6 +790,12 @@ snapshots: '@types/uuid@10.0.0': {} + '@types/webidl-conversions@7.0.3': {} + + '@types/whatwg-url@11.0.5': + dependencies: + '@types/webidl-conversions': 7.0.3 + '@types/yauzl@2.10.3': dependencies: '@types/node': 22.13.4 @@ -788,6 +866,8 @@ snapshots: boolbase@1.0.0: {} + bson@6.10.3: {} + buffer-crc32@0.2.13: {} call-bind-apply-helpers@1.0.2: @@ -1085,6 +1165,8 @@ snapshots: math-intrinsics@1.1.0: {} + memory-pager@1.5.0: {} + mime-db@1.52.0: {} mime-types@2.1.35: @@ -1093,6 +1175,19 @@ snapshots: mitt@3.0.1: {} + mongodb-connection-string-url@3.0.2: + dependencies: + '@types/whatwg-url': 11.0.5 + whatwg-url: 14.1.1 + + mongodb@6.13.0(socks@2.8.4): + dependencies: + '@mongodb-js/saslprep': 1.2.0 + bson: 6.10.3 + mongodb-connection-string-url: 3.0.2 + optionalDependencies: + socks: 2.8.4 + ms@2.1.3: {} netmask@2.0.2: {} @@ -1173,6 +1268,8 @@ snapshots: end-of-stream: 1.4.4 once: 1.4.0 + punycode@2.3.1: {} + puppeteer-core@24.2.1: dependencies: '@puppeteer/browsers': 2.7.1 @@ -1228,6 +1325,10 @@ snapshots: source-map@0.6.1: optional: true + sparse-bitfield@3.0.3: + dependencies: + memory-pager: 1.5.0 + sprintf-js@1.1.3: {} streamx@2.22.0: @@ -1267,6 +1368,10 @@ snapshots: dependencies: b4a: 1.6.7 + tr46@5.0.0: + dependencies: + punycode: 2.3.1 + ts-node@10.9.2(@types/node@22.13.4)(typescript@5.7.3): dependencies: '@cspotcode/source-map-support': 0.8.1 @@ -1299,12 +1404,19 @@ snapshots: v8-compile-cache-lib@3.0.1: {} + webidl-conversions@7.0.0: {} + whatwg-encoding@3.1.1: dependencies: iconv-lite: 0.6.3 whatwg-mimetype@4.0.0: {} + whatwg-url@14.1.1: + dependencies: + tr46: 5.0.0 + webidl-conversions: 7.0.0 + wrap-ansi@7.0.0: dependencies: ansi-styles: 4.3.0 diff --git a/src/index.ts b/src/index.ts index 8c5cab3..96591f2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,10 +1,24 @@ -import { fetchData } from "./link"; - -function main() { - console.log("Hello, this is the main function!"); - fetchData("AI写作工具") - // test() +import { fetchData, queryListData } from "./link"; +export type FetchType = { + typeId: string; + url: string; + name?: string; +} +const fetchList = [ + { + name: 'AI写作工具', + typeId: '67908fc33de33b392c0330af', + url: 'https://ai-bot.cn/favorites/ai-writing-tools/' + }, + { + name: '', + typeId: '' } - - main(); - \ No newline at end of file +] +function main() { + console.log("Hello, this is the main function!"); + queryListData(fetchList) + // test() +} + +main(); diff --git a/src/lib/mongodb.ts b/src/lib/mongodb.ts new file mode 100644 index 0000000..54b8495 --- /dev/null +++ b/src/lib/mongodb.ts @@ -0,0 +1,32 @@ +// lib/mongodb.ts +import { MongoClient, Db } from 'mongodb'; + + +const uri = "mongodb://expdsn:58662@expdsn.cloud:27017"; +const options = {}; +let client: MongoClient; +let clientPromise: Promise; + +if (!uri) { + throw new Error('Please add your Mongo URI to.env.local'); +} + + +if (process.env.NODE_ENV === 'development') { + if (!(global as any)._mongoClientPromise) { + client = new MongoClient(uri, options); + (global as any)._mongoClientPromise = client.connect(); + } + clientPromise = (global as any)._mongoClientPromise; +} else { + client = new MongoClient(uri, options); + clientPromise = client.connect(); +} + +export const getDb = async () => { + return (await clientPromise).db('crawler'); +}; +export const getCollection = async (collection: string) => { + const ins = await getDb(); + return ins.collection(collection); +}; \ No newline at end of file diff --git a/src/link/index.ts b/src/link/index.ts index 21ebf4d..536e11f 100644 --- a/src/link/index.ts +++ b/src/link/index.ts @@ -1,6 +1,8 @@ import axios from 'axios'; const cheerio = require('cheerio') import { downloadImage } from "../share/tools" +import { getCollection } from '../lib/mongodb'; +import { FetchType } from '..'; // 要抓取的网页 URL const url = 'https://ai-bot.cn/favorites/ai-writing-tools/'; @@ -17,6 +19,10 @@ async function getPageData(url: string, name: string) { } } +export async function queryListData(list: FetchType[]) { + const promiseList = list.map(item => fetchData(item.typeId)) + await Promise.all(promiseList) +} export async function fetchData(typeName: string) { try { // 请求目标页面 @@ -28,6 +34,9 @@ export async function fetchData(typeName: string) { // 提取工具卡片数据 const toolsData = [] as any; + const length = $('.url-card').length + console.log(`正在爬取${typeName}类别的数据,共${length}条数据`); + let i = 0 for (const element of $('.url-card')) { const name = $(element).find('.url-info strong').text().trim(); const tempLink = $(element).find('a').attr('href') as string; @@ -59,12 +68,15 @@ export async function fetchData(typeName: string) { }; toolsData.push(toolData); - console.log('完成' ); + i++ + console.log(`进度:${i}/${length}`); } console.log(toolsData); - + const col = await getCollection('link'); + col.insertMany(toolsData); + console.log('数据插入成功'); } catch (error) { console.error('Error fetching data:', error); }