import axios from "axios";
import fs from "fs";
import path from "path";
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
import config from "./config.js";
import { SQLiteMessageQueue } from "./sqlite.js";

/** Endpoint that serves every announcement category crawled by this spider. */
const CONTENT_API_URL =
  "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage";

/** Prefix prepended to the relative `url` of a row to build its detail link. */
const DETAIL_URL_PREFIX = `https://ebd.mychery.com/cms`;

/** Site identifier sent with every list request. */
const SITE_ID = "747";

/**
 * Category id of the "sourcing preview" list. Rows of this category are read
 * differently: publish time comes from `publishDate` and the deadline is
 * extracted from the row's `text` field.
 */
const SOURCING_PREVIEW_CATEGORY_ID = "965901485789413376";

/**
 * Page count reported to the crawl loop. The code never reads a total from the
 * response, so this fixed value is what `stopWhen` compares against.
 */
const ASSUMED_TOTAL_PAGES = 30;

/** Captures the "YYYY-MM-DD HH:MM:SS" value following the deadline label. */
const DEADLINE_PATTERN =
  /预告报名截止时间:(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/;

/**
 * Converts an ISO-like timestamp ("2024-01-01T10:00:00.000") into
 * "2024-01-01 10:00:00" by replacing the first "T" with a space and dropping
 * everything from the first "." onwards.
 *
 * @param {unknown} value - Raw timestamp field taken from an API row.
 * @returns {string|null} The normalized string, or null when `value` is not a
 *   string (so a missing field does not throw).
 */
function normalizeTimestamp(value) {
  if (typeof value !== "string") {
    return null;
  }
  return value.replace("T", " ").split(".")[0];
}

/**
 * Crawler for Chery procurement announcements.
 *
 * For each configured category it either runs a full crawl (nothing stored for
 * that spider yet) or goes straight to incremental polling. Stored
 * announcements and outgoing messages both go through `SQLiteMessageQueue`.
 *
 * NOTE(review): `loopCall` is a project helper whose contract is not visible
 * here; the callbacks below assume it invokes the fetcher with
 * `(pagenumber, additional)` and uses the value returned by `readyForNext` as
 * the next page number — confirm against utils.js.
 */
class Chery {
  /**
   * Builds the category table, opens the queue and immediately starts
   * crawling. The promise returned by `start()` is intentionally not awaited:
   * `start()` catches and logs its own failures.
   */
  constructor() {
    this.jsonMap = [
      {
        name: "奇瑞采购公告",
        info: [],
        options: {
          name: "采购公告",
          url: CONTENT_API_URL,
          categoryId: "5035",
          siteId: SITE_ID,
        },
      },
      {
        name: "奇瑞寻源预告",
        info: [],
        options: {
          name: "寻源预告",
          url: CONTENT_API_URL,
          categoryId: SOURCING_PREVIEW_CATEGORY_ID,
          siteId: SITE_ID,
        },
      },
      {
        name: "奇瑞变更公告",
        info: [],
        options: {
          name: "变更公告",
          url: CONTENT_API_URL,
          categoryId: "5032",
          siteId: SITE_ID,
        },
      },
    ];
    console.log("奇瑞 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /**
   * Entry point: runs `init()` and logs any startup failure instead of
   * letting it surface as an unhandled rejection.
   *
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode per category: incremental polling when the queue
   * already holds announcements for that spider name, a full crawl otherwise.
   *
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: walks the pages from 1 until the reported page count or
   * `config.pageNumberLimit` is reached, accumulating hits in `props.info`,
   * then stores them, queues a message and switches to incremental polling.
   *
   * @param {{name: string, info: object[], options: object}} props - One
   *   entry of `this.jsonMap`; `props.info` is appended to in place.
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            this.queue.saveAnnouncements(props.name, props.info);
            this.queue.addMessage(props.name, props.info);
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          // Polling starts even if persisting failed, so monitoring continues.
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling: fetches page 1 repeatedly and stores/queues only the
   * announcements the queue reports as new. It advances to the next page only
   * while an entire page is new; otherwise it restarts from page 1.
   *
   * @param {{name: string, info: object[], options: object}} props - One
   *   entry of `this.jsonMap`.
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        // Interval comes from config (the original note says 5 minutes).
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);

            // A fully new page means older pages may hold unseen items too;
            // a partially known page means we have caught up.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number: previously this branch
            // returned undefined, leaving the loop without a next page.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetches one page and keeps the rows that have an end time, match the
   * keyword filter and have not expired yet.
   *
   * Note: the `config` parameter here is the category options object and
   * shadows the imported `config` module inside this method.
   *
   * @param {number} [pagenumber=1] - 1-based page index.
   * @param {{name: string, url: string, categoryId: string, siteId: string}} config
   *   - Category options (the `options` of a `jsonMap` entry).
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is always the
   *   fixed fallback count; `info` is empty when the request fails or the
   *   payload has no row list.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: ASSUMED_TOTAL_PAGES, info: [] };
    }

    const rows = payload && payload.res ? payload.res.rows : undefined;
    if (!Array.isArray(rows)) {
      // A malformed payload is reported and treated as an empty page instead
      // of throwing out of the crawl loop.
      console.error("获取页面数据失败:", new Error("响应缺少 res.rows"));
      return { pages: ASSUMED_TOTAL_PAGES, info: [] };
    }

    const isSourcingPreview =
      config.categoryId === SOURCING_PREVIEW_CATEGORY_ID;
    const info = [];
    for (const item of rows) {
      if (!item) {
        continue;
      }

      let endTime;
      let publishTime;
      if (isSourcingPreview) {
        publishTime = normalizeTimestamp(item.publishDate);
        endTime = this.extractDeadlineTime(item.text);
      } else {
        endTime = normalizeTimestamp(item.signUpEndTime);
        publishTime = normalizeTimestamp(item.signUpBeginTime);
      }

      // Keep only keyword hits whose deadline is still in the future. An
      // unparseable end time yields NaN, which fails the comparison.
      if (
        endTime &&
        keywordsInclude(item.title) &&
        new Date(endTime).getTime() >= Date.now()
      ) {
        info.push({
          id: item.url,
          name: item.title,
          publishTime,
          endTime,
          urls: DETAIL_URL_PREFIX + item.url,
        });
      }
    }
    return { pages: ASSUMED_TOTAL_PAGES, info };
  }

  /**
   * Requests one page of the category list.
   *
   * @param {number} pagenumber - 1-based page index sent as `pageNo`.
   * @param {{url: string, categoryId: string, siteId: string}} config -
   *   Category options.
   * @returns {Promise<[unknown, null] | [null, object]>} An `[error, body]`
   *   pair that never rejects: `[null, body]` when the body's `code` is 0,
   *   otherwise `[error, null]`.
   */
  async getList(pagenumber, config) {
    try {
      const res = await axios({
        url: config.url,
        data: {
          dto: {
            bidType: "",
            categoryId: config.categoryId,
            city: "",
            county: "",
            province: "",
            purchaseMode: "",
            secondCompanyId: "",
            siteId: config.siteId,
          },
          pageNo: pagenumber,
          pageSize: "10",
        },
        method: "post",
      });
      const result = res.data;
      if (result.code === 0) {
        return [null, result];
      }
      // Carry the API code so the log shows why the page was rejected.
      return [new Error(`接口返回非成功状态码: ${result.code}`), null];
    } catch (err) {
      return [err, null];
    }
  }

  /**
   * Extracts the sign-up deadline that follows the deadline label in a row's
   * text.
   *
   * @param {unknown} html - Text of the announcement row.
   * @returns {string|null} The "YYYY-MM-DD HH:MM:SS" deadline, or null when
   *   `html` is not a string or contains no match.
   */
  extractDeadlineTime(html) {
    if (typeof html !== "string") {
      return null;
    }
    const match = html.match(DEADLINE_PATTERN);
    return match ? match[1] : null;
  }
}

new Chery();