This commit is contained in:
expdsn 2025-02-19 18:36:25 +08:00
commit 35944a46d7
7 changed files with 1505 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
node_modules
downloads

30
package.json Normal file
View File

@ -0,0 +1,30 @@
{
"name": "my-crawler",
"version": "1.0.0",
"main": "index.js",
"scripts": {
"dev": "npx ts-node src/index.ts",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"description": "",
"dependencies": {
"axios": "^1.7.9",
"cheerio": "^1.0.0",
"puppeteer": "^24.2.1",
"uuid": "^11.0.5"
},
"pnpm": {
"onlyBuiltDependencies": [
"puppeteer"
]
},
"devDependencies": {
"@types/node": "^22.13.4",
"@types/uuid": "^10.0.0",
"ts-node": "^10.9.2",
"typescript": "^5.7.3"
}
}

1331
pnpm-lock.yaml Normal file

File diff suppressed because it is too large Load Diff

9
src/index.ts Normal file
View File

@ -0,0 +1,9 @@
import { fetchData } from "./link";
/**
 * Program entry point: logs a greeting, then runs the crawl for the
 * "AI写作工具" category and waits for it to finish.
 *
 * @returns A promise that settles once `fetchData` has settled.
 */
async function main(): Promise<void> {
  console.log("Hello, this is the main function!");
  // Await the crawl so its promise is not silently dropped.
  await fetchData("AI写作工具");
}

// Surface anything that escapes `main` instead of leaving a floating promise.
main().catch((error: unknown) => {
  console.error("Unhandled error in main:", error);
  process.exitCode = 1;
});

71
src/link/index.ts Normal file
View File

@ -0,0 +1,71 @@
import axios from 'axios';
const cheerio = require('cheerio')
import { downloadImage } from "../share/tools"
// URL of the web page to scrape.
const url = 'https://ai-bot.cn/favorites/ai-writing-tools/';
/**
 * Fetches the page at `url` and returns the `href` of the first `<a>` element
 * whose `title` attribute is exactly `name`.
 *
 * The title is compared as a plain string inside a filter callback rather than
 * being interpolated into a CSS selector, so names containing quotes or
 * backslashes cannot break or alter the selector.
 *
 * @param url  Address of the page to load.
 * @param name Exact `title` attribute value to look for.
 * @returns The matching anchor's `href`, or `undefined` when no anchor matches
 *          or the request/parsing fails (the failure is logged, not thrown).
 */
async function getPageData(url: string, name: string): Promise<string | undefined> {
  try {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    const matches = $('a[title]').filter(
      (_index: number, anchor: unknown) => $(anchor).attr('title') === name,
    );
    // `.attr` reads from the first element of the matched set.
    return matches.attr('href');
  } catch (error) {
    console.error('Error fetching data:', error);
    return undefined;
  }
}
/** One scraped tool card, in the shape printed by `fetchData`. */
interface ToolData {
  name: string;
  link: string;
  description: string;
  _id: string | undefined;
  type: string;
  priority: number;
  logoLink: string;
  addTime: number;
}

/**
 * Scrapes every `.url-card` element from the module-level `url` page, resolves
 * each card's link, downloads its logo, and prints the collected records.
 *
 * All cards are processed concurrently and awaited with `Promise.all`, so the
 * records are complete (and in page order) before they are logged. Cheerio's
 * `.each` does not wait for async callbacks, which is why it is not used here.
 *
 * @param typeName Category label stored in the `type` field of every record.
 * @returns A promise that resolves once the records have been logged. Errors
 *          are logged to the console and never thrown to the caller.
 */
export async function fetchData(typeName: string): Promise<void> {
  try {
    // Request the target page.
    const { data } = await axios.get(url);
    // Load the HTML into cheerio.
    const $ = cheerio.load(data);
    // Snapshot the card elements so they can be mapped to promises.
    const cards: unknown[] = $('.url-card').toArray();

    const toolsData: ToolData[] = await Promise.all(
      cards.map(async (element, index): Promise<ToolData> => {
        const card = $(element);
        const name: string = card.find('.url-info strong').text().trim();
        const tempLink: string | undefined = card.find('a').attr('href');
        const description: string = card.find('.url-info p').text().trim();
        const _id: string | undefined = card.attr('data-id');
        const originLogoUrl: string | undefined = card.find('img').attr('data-src');

        // Links that point back at the directory site go to an intermediate
        // page; follow it to find the tool's real address.
        let link = tempLink ?? '';
        if (link.startsWith('https://ai-bot.cn')) {
          link = (await getPageData(link, name)) ?? '';
        }

        // A missing or failed logo must not discard the card (or the whole
        // batch), so the download is best-effort.
        let logoLink = '';
        if (originLogoUrl) {
          try {
            logoLink = await downloadImage(originLogoUrl);
          } catch (error) {
            console.error(`Logo download failed for "${name}":`, error);
          }
        }

        return {
          name,
          link,
          description,
          _id,
          // The category is the same for every card on the page.
          type: typeName,
          // Priority follows the card's position on the page (1-based).
          priority: index + 1,
          logoLink,
          addTime: Date.now(),
        };
      }),
    );

    console.log(toolsData);
  } catch (error) {
    console.error('Error fetching data:', error);
  }
}

47
src/share/tools.ts Normal file
View File

@ -0,0 +1,47 @@
import { promises as fs } from 'fs';
import axios from "axios";
import readline from "readline"
import path from 'path';
import { v4 as uuid } from 'uuid'; // 或使用 crypto 模块
// Directory that downloaded files are written to: `downloads`, resolved two levels above this module's directory.
const downloadDir = path.join(__dirname, '../../downloads');
/**
 * Prints `query` on stdout and resolves with the line the user types on stdin.
 *
 * A readline interface is created for the single question and closed again
 * before the promise resolves.
 *
 * @param query Prompt text shown to the user.
 * @returns A promise resolving with the user's answer.
 */
export function askQuestion(query: string) {
  return new Promise((resolve) => {
    const prompt = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    });

    const onAnswer = (reply: string) => {
      // Release stdin before handing the answer back.
      prompt.close();
      resolve(reply);
    };

    prompt.question(query, onAnswer);
  });
}
/**
 * Derives a safe file extension from a Content-Type header value.
 *
 * Strips parameters (`; charset=...`) and structured-syntax suffixes (`+xml`),
 * and falls back to `jpg` when the header is missing, not a string, or the
 * subtype contains characters that do not belong in a file extension.
 */
function extensionFromContentType(contentType: unknown): string {
  if (typeof contentType !== 'string') {
    return 'jpg';
  }
  const mediaType = contentType.split(';')[0].trim().toLowerCase();
  const subtype = (mediaType.split('/')[1] ?? '').split('+')[0];
  return /^[a-z0-9.-]+$/.test(subtype) ? subtype : 'jpg';
}

/**
 * Downloads the resource at `url` and stores it in the download directory
 * under a freshly generated UUID file name.
 *
 * @param url Address of the image to download.
 * @returns The generated UUID (the file name without its extension).
 * @throws Rethrows any request or file-system error after logging it.
 */
export async function downloadImage(url: string) {
  try {
    // Fetch the image as raw bytes.
    const response = await axios.get(url, {
      responseType: 'arraybuffer',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
      }
    });
    // Work out the file extension from the response's Content-Type.
    const ext = extensionFromContentType(response.headers['content-type']);
    const id = uuid();
    // Build the destination path.
    const filename = `${id}.${ext}`;
    const filePath = path.join(downloadDir, filename);
    // The download directory is not guaranteed to exist; create it on demand.
    await fs.mkdir(downloadDir, { recursive: true });
    // Write the file.
    await fs.writeFile(filePath, response.data);
    return id;
  } catch (error) {
    console.error('下载失败:', error);
    throw error;
  }
}

14
tsconfig.json Normal file
View File

@ -0,0 +1,14 @@
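{
"compilerOptions": {
"target": "ES6", // Compile to ES6
"module": "commonjs", // Use CommonJS, the module system used by Node.js
"outDir": "./dist", // Output directory for compiled files
"rootDir": "./src", // Root directory of the TypeScript sources
"strict": true, // Enable strict type checking
"esModuleInterop": true, // Allow interoperability with ES6 modules
"skipLibCheck": true, // Skip type checking of library declaration files
"forceConsistentCasingInFileNames": true // Enforce consistent file-name casing
},
"include": ["src/**/*.ts"], // Include all TypeScript files under src
"exclude": ["node_modules"] // Exclude node_modules
}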