import axios from "axios";
import fs from "fs";
import path from "path";
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
import config from "./config.js";
import { SQLiteMessageQueue } from "./sqlite.js";

/** Page size requested from both list endpoints. */
const PAGE_SIZE = 8;

/**
 * Map one record of the "recruit hall" response to the stored announcement shape.
 * @param {object} item - Raw record taken from `data.records`.
 * @returns {{id: *, name: *, publishTime: *, endTime: *, urls: string}}
 */
function toRecruitEntry(item) {
  return {
    id: item.recruitId,
    name: item.title,
    publishTime: item.publishTime,
    endTime: item.deadlineTime,
    urls: `https://srm.gwm.cn/#/portalBidding/vendorBiddingDetail?id=${item.recruitId}`,
  };
}

/**
 * Map one record of the "public sourcing" response to the stored announcement shape.
 * @param {object} item - Raw record taken from `data.list`.
 * @returns {{id: *, name: *, publishTime: *, endTime: *, urls: string}}
 */
function toSourcingEntry(item) {
  return {
    id: item.reqHeadId,
    name: item.projectName,
    publishTime: item.releaseDate,
    endTime: item.publicEndTime,
    urls: `https://srm.gwm.cn/#/portal?id=${item.reqHeadId}`,
  };
}

/**
 * Crawler for the two Great Wall Motor SRM announcement lists.
 *
 * Each entry of `jsonMap` describes one list: `name` is the spider key used
 * with the message queue, `info` accumulates the entries collected during the
 * full crawl, and `options` is handed to `getInfo` / `getList` for every page.
 * An entry whose `options` carries a `data` template is fetched and parsed as
 * the "recruit hall" list; an entry without one as the "public sourcing" list.
 */
class GreatWall {
  /**
   * Build the source table, open the queue and start crawling immediately.
   */
  constructor() {
    this.jsonMap = [
      {
        name: "长城公开寻源",
        info: [],
        options: {
          name: "长城公开寻源",
          url: "https://srm.gwm.cn/cloud-srm/api-sou/sou-firstPage/souReqlistPage",
        },
      },
      {
        name: "长城招募公示大厅",
        info: [],
        options: {
          name: "长城招募公示大厅",
          url: "https://srm.gwm.cn/cloud-srm/api-sou/api-ql/Recruit/visitList",
          // Request body template; getList copies it and fills in the page number.
          data: {
            type: "Recruit",
            lang: "zh-cn",
            query: { "*": {} },
            payload: {
              filter: {},
              page: {
                sort: "lastUpdateDate desc",
                pageNum: 1,
                pageSize: PAGE_SIZE,
              },
            },
            action: "visitList",
            tree: true,
          },
        },
      },
    ];
    console.log("长城 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose: start() catches its own errors, so the promise
    // cannot reject.
    this.start();
  }

  /**
   * Run `init` and log (rather than propagate) any startup failure.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Choose the crawl mode per source: incremental when the queue already holds
   * announcements for that spider, otherwise a full crawl first.
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: walk the pages from 1 until the reported page count or
   * `config.pageNumberLimit` is reached, persist everything collected, then
   * switch to the incremental loop.
   * @param {{name: string, info: object[], options: object}} props - One `jsonMap` entry.
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          // The result handed to `complete` is appended here as well
          // (presumably the page that triggered stopWhen — confirm in loopCall).
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental crawl: repeatedly fetch pages and store only announcements the
   * queue has not seen yet. There is no stop condition; the loop is meant to
   * keep monitoring.
   * @param {{name: string, info: object[], options: object}} props - One `jsonMap` entry.
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        // Interval comes from config.js (the original note said 5 minutes).
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: start over from the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // A page made up entirely of new entries means older pages may
            // hold more, so continue; a partially known page means we have
            // caught up, so go back to page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a concrete page number instead of falling
            // through with undefined; restart from page 1 like the
            // "no new data" branch.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetch one page and reduce it to the keyword-matching announcements.
   *
   * A failed request or a response without the expected record array is
   * logged and reported as `{ pages: 0, info: [] }`.
   *
   * @param {number} [pagenumber=1] - 1-based page to fetch.
   * @param {object} config - The source's `options` object (shadows the
   *   imported `config` module inside this method).
   * @returns {Promise<{pages: number, info: object[]}>} Page count reported by
   *   the API and the filtered entries of this page.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, result] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    // The two endpoints use different field names for the same concepts.
    const isRecruit = Boolean(config.data);
    const body = result.data || {};
    const records = isRecruit ? body.records : body.list;
    const pages = isRecruit ? body.pageCount : body.pages;
    if (!Array.isArray(records)) {
      // Treat a malformed success response like any other failed page instead
      // of throwing on `records.length`.
      console.error(
        "获取页面数据失败:",
        new Error("response does not contain a record list")
      );
      return { pages: 0, info: [] };
    }

    const titleOf = isRecruit
      ? (item) => item.title
      : (item) => item.projectName;
    const toEntry = isRecruit ? toRecruitEntry : toSourcingEntry;
    // Keep only records whose title hits a keyword.
    const info = records
      .filter((item) => keywordsInclude(titleOf(item)))
      .map(toEntry);
    return { pages, info };
  }

  /**
   * POST the list request for a single page.
   *
   * @param {number} pagenumber - 1-based page to request.
   * @param {object} config - The source's `options` object (`url`, optional
   *   `data` body template).
   * @returns {Promise<[*, object|null]>} `[null, responseBody]` when the body's
   *   `code` equals "0", otherwise `[error, null]`.
   */
  async getList(pagenumber, config) {
    // Build a fresh body for every call so the shared template in jsonMap is
    // never mutated; key order (and thus the serialized JSON) is unchanged.
    const data = config.data
      ? {
          ...config.data,
          payload: {
            ...config.data.payload,
            page: { ...config.data.payload.page, pageNum: pagenumber },
          },
        }
      : { pageNum: pagenumber, pageSize: PAGE_SIZE };

    try {
      const res = await axios({ url: config.url, data, method: "post" });
      const result = res.data;
      // String() keeps accepting both 0 and "0" without loose equality.
      if (String(result.code) === "0") {
        return [null, result];
      }
      return [new Error(`unexpected response code: ${result.code}`), null];
    } catch (err) {
      return [err, null];
    }
  }
}

new GreatWall();