import axios from "axios";
// NOTE(review): fs and path are not referenced by the active code in this
// file; the imports are kept untouched in case other tooling relies on them.
import fs from "fs";
import path from "path";
import { timestampToDate, loopCall } from "./utils.js";
import config from "./config.js";
import { SQLiteMessageQueue } from "./sqlite.js";

/** Key under which this spider's announcements and messages are stored. */
const SPIDER_NAME = "吉利";

/** Origin shared by every endpoint this spider calls. */
const BASE_URL = "https://glzb.geely.com";

/**
 * Page size requested from the list endpoint. The same value is used to turn
 * the reported total into a page count, so the two must never diverge.
 */
const PAGE_SIZE = 20;

/** Publish-time look-back window (30 days), in milliseconds. */
const PUBLISH_WINDOW_MS = 30 * 24 * 60 * 60 * 1000;

/**
 * Crawler for the Geely tender-notice site.
 *
 * On construction it decides between a full crawl (no announcements stored
 * for this spider yet) and incremental polling (some already stored), and
 * pushes whatever it finds into the SQLite-backed queue.
 */
class GEELY {
  constructor() {
    this.url = `${BASE_URL}/gpmp/notice/listnotice`;
    // Accumulates the announcements collected during a full crawl.
    this.info = [];
    console.log("GEELY 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Deliberately not awaited: a constructor cannot be async, and start()
    // catches every error itself, so this promise never rejects.
    this.start();
  }

  /**
   * Entry point; runs init() and logs any failure instead of propagating it.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode: incremental when the queue already holds
   * announcements for this spider, full crawl otherwise.
   * @returns {Promise<void>}
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walks the list pages via loopCall, stores everything that was
   * collected, then switches to incremental polling.
   * @returns {Promise<void>}
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        // Stop on the last page reported by the server, or at the configured
        // page cap (the original note says the cap was set to 2 for testing).
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        // NOTE(review): both readyForNext and complete append result.info.
        // Whether the final page can be appended twice depends on how
        // loopCall sequences these callbacks - confirm against utils.js.
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            this.queue.saveAnnouncements(SPIDER_NAME, this.info);
            this.queue.addMessage(SPIDER_NAME, this.info);
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }

    console.log("开始增量爬取...");
    // Awaited (as init() does) so the promise is not left floating.
    await this.increment();
  }

  /**
   * Incremental crawl: repeatedly fetches pages and forwards only the
   * announcements the queue has not seen before. The polling interval comes
   * from config.incrementFetchTime.
   * @returns {Promise<void>}
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              SPIDER_NAME,
              result.info
            );

            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: go back to the first page
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(SPIDER_NAME, newInfo);
            this.queue.addMessage(SPIDER_NAME, newInfo);

            // A page made up entirely of new items suggests the next page may
            // hold more; a partially known page means we have caught up.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number; previously this path
            // returned undefined.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetches one list page and resolves attachment preview URLs for every
   * notice that is still open and was published within the look-back window.
   *
   * @param {number} [pagenumber=1] Page to fetch.
   * @returns {Promise<{pages: number, info: Array<object>}>} Page count derived
   *   from the reported total, plus the notices kept from this page. When the
   *   list request fails the result is `{ pages: 0, info: [] }`.
   */
  async getInfo(pagenumber = 1) {
    // Date#setHours returns the resulting timestamp: local midnight today.
    const today = new Date().setHours(0, 0, 0, 0);
    const earliestPublish = today - PUBLISH_WINDOW_MS;

    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [listError, list] = await this.getList(pagenumber);
    if (listError) {
      console.error("获取页面数据失败:", listError);
      return { pages: 0, info: [] };
    }

    const { total, items } = list.data;
    const pages = Math.ceil(total / PAGE_SIZE);
    const info = [];

    // Detail requests are issued one at a time, in list order.
    for (const item of items) {
      // presumably endtime/publishtime are epoch milliseconds - TODO confirm
      if (item.endtime >= today && item.publishtime >= earliestPublish) {
        console.log("处理项目:", item.pjtnoticeid, item.pjtnoticename);
        const [noticeError, urls] = await this.getNoticeUrl(item.pjtnoticeid);
        if (noticeError) {
          console.error("获取公告详情失败:", noticeError);
        } else {
          info.push({
            id: item.pjtnoticeid,
            name: item.pjtnoticename,
            publishTime: timestampToDate(item.publishtime),
            endTime: timestampToDate(item.endtime),
            urls,
          });
        }
      }
    }

    return { pages, info };
  }

  /**
   * Requests one page of the notice list.
   *
   * @param {number} pagenumber Page to request.
   * @returns {Promise<[*, (object|null)]>} `[null, body]` when the body's code
   *   is "success"; otherwise `[error, null]`. Never rejects.
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: this.url,
        params: {
          pagesize: PAGE_SIZE,
          pagenumber,
          publishstatus: 2,
          bidcategoryid: 1442,
          iflongpro: 0,
          _: Date.now(),
        },
        method: "get",
      });
      const result = res.data;
      return result.code === "success" ? [null, result] : ["err", null];
    } catch (err) {
      return [err, null];
    }
  }

  /**
   * Loads a notice's detail and resolves a preview URL for each attachment.
   *
   * @param {string|number} id Notice id (`pjtnoticeid`).
   * @returns {Promise<[*, (Array|null)]>} `[null, urls]` on success, where
   *   `urls` holds the payload of every preview request that succeeded;
   *   `[error, null]` otherwise. Never rejects.
   */
  async getNoticeUrl(id) {
    try {
      const res = await axios({
        url: `${BASE_URL}/gpmp/notice/query`,
        method: "get",
        // Passed as params (like getList) so axios encodes the values.
        params: { _: Date.now(), pjtnoticeid: id },
      });
      const result = res.data;
      if (result.code !== "success") {
        return ["err", null];
      }

      // A notice without an attachment array yields an empty URL list rather
      // than failing, matching the case where every preview request fails.
      const attachs = Array.isArray(result.data.attachs)
        ? result.data.attachs
        : [];
      const previewRequests = attachs.map((item) =>
        axios({
          url: `${BASE_URL}/pub/file/info/preview`,
          method: "get",
          params: {
            name: item.attachname,
            downloadUrl: item.downloadUrl,
            previewUrl: item.previewUrl,
            attachname: item.attachname,
            _: Date.now(),
          },
        })
      );

      // allSettled: one failed preview must not discard the others.
      const outcomes = await Promise.allSettled(previewRequests);
      const urls = [];
      for (const outcome of outcomes) {
        if (
          outcome.status === "fulfilled" &&
          outcome.value.data.code === "success"
        ) {
          urls.push(outcome.value.data.data);
        }
      }
      return [null, urls];
    } catch (err) {
      console.log("err:", err);
      return [err, null];
    }
  }
}

new GEELY();