From 12ee63b814fdd0f6c9d77fbb320592b5908f8cff Mon Sep 17 00:00:00 2001 From: huzhengrong Date: Thu, 23 Oct 2025 10:39:32 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 109 ++++++++++++ byd.js | 173 ++++++++++++++++++ changan.js | 188 ++++++++++++++++++++ chery.js | 251 ++++++++++++++++++++++++++ config.js | 6 + df.js | 187 ++++++++++++++++++++ ecosystem.config.cjs | 37 ++++ geely.js | 237 +++++++++++++++++++++++++ greatWall.js | 234 +++++++++++++++++++++++++ jianghuai.js | 385 ++++++++++++++++++++++++++++++++++++++++ leapMotor.js | 193 ++++++++++++++++++++ mailer.js | 100 +++++++++++ msgManager.js | 212 ++++++++++++++++++++++ nio.js | 170 ++++++++++++++++++ package.json | 23 +++ picc.js | 214 +++++++++++++++++++++++ readme.md | 47 +++++ sqlite.js | 320 ++++++++++++++++++++++++++++++++++ stats.js | 80 +++++++++ third.js | 309 ++++++++++++++++++++++++++++++++ utils.js | 271 +++++++++++++++++++++++++++++ yiqi.js | 199 +++++++++++++++++++++ youzhicai.js | 406 +++++++++++++++++++++++++++++++++++++++++++ 23 files changed, 4351 insertions(+) create mode 100644 .gitignore create mode 100644 byd.js create mode 100644 changan.js create mode 100644 chery.js create mode 100644 config.js create mode 100644 df.js create mode 100644 ecosystem.config.cjs create mode 100644 geely.js create mode 100644 greatWall.js create mode 100644 jianghuai.js create mode 100644 leapMotor.js create mode 100644 mailer.js create mode 100644 msgManager.js create mode 100644 nio.js create mode 100644 package.json create mode 100644 picc.js create mode 100644 readme.md create mode 100644 sqlite.js create mode 100644 stats.js create mode 100644 third.js create mode 100644 utils.js create mode 100644 yiqi.js create mode 100644 youzhicai.js diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c53d921 --- /dev/null +++ b/.gitignore @@ -0,0 +1,109 @@ +# Node.js 
+node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +package-lock.json +yarn.lock +pnpm-lock.yaml + +# Logs +logs +*.log +*.log.* +log/ +pids +*.pid +*.seed +*.pid.lock + +# OS +.DS_Store +Thumbs.db +*.db + +# dotenv environment variables +.env +.env.* +!.env.example + +# Editor directories and files +.idea/ +.vscode/ +*.sublime-workspace +*.sublime-project + +# Build output +dist/ +build/ +out/ +coverage/ +.nyc_output/ + +# Optional npm cache directory +.npm/ + +# Optional eslint cache +.eslintcache + +# Optional REPL history +.node_repl_history + +# Mac system files +.AppleDouble +.LSOverride + +# Test coverage +coverage/ + +# TypeScript cache +*.tsbuildinfo + +# Optional: local data +*.local + +# Optional: debug +debug.log + +# Optional: next.js +.next/ + +# Optional: Nuxt.js +.nuxt/ + +# Optional: SvelteKit +.svelte-kit/ + +# Optional: vuepress +.vuepress/dist + +# Optional: Storybook +.storybook-out/ + +# Optional: Parcel +.cache/ + +# Optional: output of 'npm pack' +*.tgz + +# Optional: PM2 logs and pids +pids/ +*.pid +*.seed +*.pid.lock +pm2.log + +# Optional: dotenv +.env.local +.env.development.local +.env.test.local +.env.production.local + +# Optional: jest +jest.config.js +jest.config.ts + +# Optional: cypress +cypress/videos/ +cypress/screenshots/ diff --git a/byd.js b/byd.js new file mode 100644 index 0000000..3ef15ee --- /dev/null +++ b/byd.js @@ -0,0 +1,173 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class BYD { + constructor() { + this.info = []; + console.log("比亚迪 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("比亚迪"); + 
if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + if (this.info.length > 0) { + this.queue.saveAnnouncements("比亚迪", this.info); + // this.writeFile(this.info); + this.queue.addMessage("比亚迪", this.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + "比亚迪", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // this.info.push(...newInfo); + this.queue.saveAnnouncements("比亚迪", newInfo); + // this.writeFile(this.info); + this.queue.addMessage("比亚迪", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await 
this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + let total = result[1].data.total; + let pages = Math.ceil(total / 10); + let arr = result[1].data.records; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime = timestampToDate( + new Date(item.signUpEndTime).getTime(), + true + ); + // 命中关键词 + if ( + keywordsInclude(item.title) && + endTime && + +new Date(endTime) >= Date.now() + ) { + // console.log("处理项目:", item.sourcingId, item.title); + info.push({ + id: item.sourcingId, + name: item.title, + publishTime: timestampToDate( + new Date(item.tenderNoticePublishTime).getTime(), + true + ), + endTime: endTime, + urls: `https://spcn.byd.com/#/tender-detail?sourcingId=${item.sourcingId}`, + }); + } + } + return { pages, info }; + } + } + // 分页获取数据 + getList(pagenumber) { + return axios({ + url: "https://spcn.byd.com/api/srm-sou-sp/supplier/supplier/getTenderAnnouncementInfo", + data: { + pageNo: pagenumber, + pageSize: 10, + }, + method: "post", + }) + .then((res) => { + let result = res.data; + if (result.msg === "成功" && result.code === "000000") { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } +} + +new BYD(); diff --git a/changan.js b/changan.js new file mode 100644 index 0000000..c47b911 --- /dev/null +++ b/changan.js @@ -0,0 +1,188 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { + timestampToDate, + loopCall, + keywordsInclude, + // addToMessageQueue, +} from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +// import { messageQueue } from "./msgManager.js"; +// import cheerio from "cheerio"; + +class ChangAn { + constructor() { + // this.filepath = path.resolve("changan.json"); + this.info = []; + console.log("长安 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + 
this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("长安"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + + // if (fs.existsSync(this.filepath)) { + // let data = fs.readFileSync(this.filepath, "utf-8"); + // this.info = data ? JSON.parse(data) : []; + // if (this.info.length > 0) { + // await this.increment(); + // } else { + // await this.fullFetch(); + // } + // } else { + // console.log("历史文件不存在,开始全量爬取"); + // await this.fullFetch(); + // } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + this.queue.saveAnnouncements("长安", this.info); + // this.writeFile(this.info); + this.queue.addMessage("长安", this.info); + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + "长安", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // this.info.push(...newInfo); + this.queue.saveAnnouncements("长安", newInfo); + // this.writeFile(this.info); + 
this.queue.addMessage("长安", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + // let total = result[1].result.total; + let pages = result[1].result.pages; + let arr = result[1].result.records; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + // 命中关键词 + if (keywordsInclude(item.projectName)) { + console.log("处理项目:", item.id, item.projectName); + info.push({ + id: item.id, + name: item.projectName, + publishTime: item.startTime, + endTime: item.endTime, + urls: `https://portal.changan.com.cn/noProdNoticeInfo?_t=${Date.now()}&id=${ + item.id + }`, + }); + } + } + return { pages, info }; + } + } + // 分页获取数据 + getList(pagenumber) { + return axios({ + url: "https://portal.changan.com.cn/backend_8086/changan_platform/api/nonPdcSourceNoticeCt/listSourceNoticePageBySupplier", + params: { + _t: Date.now(), + pageNo: pagenumber, + pageSize: 20, + }, + method: "get", + }) + .then((res) => { + let result = res.data; + if (result.success) { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + // writeFile(info) { + // fs.writeFileSync(this.filepath, JSON.stringify(info), "utf-8"); + // } +} + +new ChangAn(); diff --git a/chery.js b/chery.js new file mode 100644 index 0000000..bad96ed --- /dev/null +++ b/chery.js @@ -0,0 +1,251 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { + timestampToDate, 
+ loopCall, + keywordsInclude, + // addToMessageQueue, +} from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +// import { messageQueue } from "./msgManager.js"; +// import cheerio from "cheerio"; + +class Chery { + constructor() { + this.jsonMap = [ + { + name: "奇瑞采购公告", + // filepath: path.resolve("chery_cg.json"), + info: [], + options: { + name: "采购公告", + url: "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage", + categoryId: "5035", + siteId: "747", + }, + }, + { + name: "奇瑞寻源预告", + // filepath: path.resolve("chery_xy.json"), + info: [], + options: { + name: "寻源预告", + url: "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage", + categoryId: "965901485789413376", + siteId: "747", + }, + }, + { + name: "奇瑞变更公告", + // filepath: path.resolve("chery_bg.json"), + info: [], + options: { + name: "变更公告", + url: "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage", + categoryId: "5032", + siteId: "747", + }, + }, + ]; + console.log("奇瑞 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + // if (fs.existsSync(item.filepath)) { + // let data = fs.readFileSync(item.filepath, "utf-8"); + // item.info = data ? 
JSON.parse(data) : []; + // if (item.info.length > 0) { + // // await this.increment(item); + // console.log(`${item.name} 历史文件存在,开始增量爬取`); + // this.loopFetchIncrement(item); + // } else { + // this.loopFetchFull(item); + // } + // } else { + // console.log(`${item.name}历史文件不存在,开始全量爬取`); + // this.loopFetchFull(item); + // } + } + } + // 全量爬取 + loopFetchFull(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + additional: props.options, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + this.queue.saveAnnouncements(props.name, props.info); + // this.writeFile(props); + this.queue.addMessage(props.name, props.info); + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`奇瑞${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + 
}); + } catch (error) { + console.error(`奇瑞${props.options.name}增量爬取失败:`, error); + } + } + async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 30, info: [] }; + } else { + let pages = 30; + let arr = result[1].res.rows; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime, publishTime; + if (config.categoryId === "965901485789413376") { + publishTime = item.publishDate.replace("T", " ").split(".")[0]; + endTime = this.extractDeadlineTime(item.text); + } else { + endTime = item.signUpEndTime.replace("T", " ").split(".")[0]; + publishTime = item.signUpBeginTime.replace("T", " ").split(".")[0]; + } + // 命中关键词 + if ( + endTime && + keywordsInclude(item.title) && + +new Date(endTime) >= Date.now() + ) { + // console.log("处理项目:", item.id, item.projectName); + info.push({ + id: item.url, + name: item.title, + publishTime: publishTime, + endTime: endTime, + urls: `https://ebd.mychery.com/cms` + item.url, + }); + } + } + return { pages, info }; + } + } + // 分页获取数据 + getList(pagenumber, config) { + return axios({ + url: config.url, + data: { + dto: { + bidType: "", + categoryId: config.categoryId, + city: "", + county: "", + province: "", + purchaseMode: "", + secondCompanyId: "", + siteId: config.siteId, + }, + pageNo: pagenumber, + pageSize: "10", + }, + method: "post", + }) + .then((res) => { + let result = res.data; + if (result.code === 0) { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + // writeFile(props) { + // fs.writeFileSync(props.filepath, JSON.stringify(props.info), "utf-8"); + // } + + extractDeadlineTime(html) { + // 匹配"预告报名截止时间:"后面的时间格式 + const regex = /预告报名截止时间:(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/; + const match = html.match(regex); + + if 
(match) { + return match[1]; + } + + return null; + } +} + +new Chery(); diff --git a/config.js b/config.js new file mode 100644 index 0000000..7e4a58b --- /dev/null +++ b/config.js @@ -0,0 +1,6 @@ +export default { + // 分页获取限制 + pageNumberLimit: 3, + fullFetchTime: 2000, + incrementFetchTime: 5 * 60 * 1000, +}; diff --git a/df.js b/df.js new file mode 100644 index 0000000..7104a44 --- /dev/null +++ b/df.js @@ -0,0 +1,187 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +import * as cheerio from "cheerio"; + +class DF { + constructor() { + this.jsonMap = [ + { + name: "东风【招标采购】", + info: [], + options: { + name: "东风【招标采购】", + url: "https://etp.dfmc.com.cn/jyxx/004001/", + homeIndex: "trade_info_new.html", + }, + }, + { + name: "东风【非招标采购】", + info: [], + options: { + name: "东风【非招标采购】", + url: "https://etp.dfmc.com.cn/jyxx/004002/", + homeIndex: "trade_info_newf.html", + }, + }, + ]; + console.log("东风 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + // 全量爬取 + loopFetchFull(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + additional: props.options, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 
${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); + // this.writeFile(props); + this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error(`${props.options.name}增量爬取失败:`, error); + } + } + async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0].status); + return { pages: 0, info: [] }; + } else { + // 第六页开始就要验证码了 + let pages = 5; + let html = result[1]; + const $ = cheerio.load(html); + $(".public-table tbody tr").each((index, element) => { + let id = $(element).find("td:nth-child(3)").text(); + let name = $(element).find("a").text(); + let publishTime = $(element).find("td:nth-child(6)").text(); + let endTime = $(element).find("td:nth-child(5)").text(); + let urls = + 
"https://etp.dfmc.com.cn" + $(element).find("a").attr("href"); + if ( + endTime && + +new Date(endTime) >= Date.now() && + keywordsInclude(name) + ) { + console.log("处理项目:", id, name); + info.push({ + id: id, + name: name, + publishTime: publishTime, + endTime: endTime, + urls: urls, + }); + } + }); + return { pages, info }; + } + } + // 分页获取数据 + getList(pagenumber, config) { + let url = config.url; + if (pagenumber === 1) { + url += config.homeIndex; + } else { + url += `${pagenumber}.html`; + } + return axios({ + url: url, + method: "get", + }) + .then((res) => { + let result = res.data; + return [null, result]; + }) + .catch((err) => { + return [err, null]; + }); + } +} + +new DF(); diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs new file mode 100644 index 0000000..68cfb53 --- /dev/null +++ b/ecosystem.config.cjs @@ -0,0 +1,37 @@ +module.exports = { + apps: [ + // 消息队列管理器(优先启动) + { + name: "msg-manager", + script: "msgManager.js", + instances: 1, + autorestart: true, + watch: false, + max_memory_restart: "200M", + env: { + NODE_ENV: "production", + SERVICE_NAME: "msg-manager", + }, + error_file: "./logs/msg-manager-error.log", + out_file: "./logs/msg-manager-out.log", + log_file: "./logs/msg-manager-combined.log", + time: true, + }, + { + name: "picc-spider", + script: "picc.js", + instances: 1, + autorestart: true, + watch: false, + max_memory_restart: "300M", + env: { + NODE_ENV: "production", + SPIDER_NAME: "picc", + }, + error_file: "./logs/picc-error.log", + out_file: "./logs/picc-out.log", + log_file: "./logs/picc-combined.log", + time: true, + }, + ], +}; diff --git a/geely.js b/geely.js new file mode 100644 index 0000000..f71479e --- /dev/null +++ b/geely.js @@ -0,0 +1,237 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +// import cheerio from "cheerio"; +// import { 
messageQueue } from "./msgManager.js"; + +class GEELY { + constructor() { + this.url = "https://glzb.geely.com/gpmp/notice/listnotice"; + // this.filepath = path.resolve("geely.json"); + this.info = []; + console.log("GEELY 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("吉利"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + // if (fs.existsSync(this.filepath)) { + // let data = fs.readFileSync(this.filepath, "utf-8"); + // this.info = data ? JSON.parse(data) : []; + // if (this.info.length > 0) { + // await this.increment(); + // } else { + // await this.fullFetch(); + // } + // } else { + // console.log("历史文件不存在,开始全量爬取"); + // await this.fullFetch(); + // } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); // 限制最多2页用于测试 + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + this.queue.saveAnnouncements("吉利", this.info); + // this.writeFile(this.info); + this.queue.addMessage("吉利", this.info); + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + 
try { + let newInfo = this.queue.filterNewAnnouncements( + "吉利", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + this.queue.saveAnnouncements("吉利", newInfo); + this.queue.addMessage("吉利", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + // 传入页码获取数据 + async getInfo(pagenumber = 1) { + let today = new Date().setHours(0, 0, 0, 0); + let beforeOneMonth = today - 30 * 24 * 60 * 60 * 1000; + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + let total = result[1].data.total; + let pages = Math.ceil(total / 20); + let arr = result[1].data.items; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + if (item.endtime >= today && item.publishtime >= beforeOneMonth) { + console.log("处理项目:", item.pjtnoticeid, item.pjtnoticename); + let noticeRes = await this.getNoticeUrl(item.pjtnoticeid); + if (noticeRes[0]) { + // 获取招标公告内容报错 + console.error("获取公告详情失败:", noticeRes[0]); + } else { + info.push({ + id: item.pjtnoticeid, + name: item.pjtnoticename, + publishTime: timestampToDate(item.publishtime), + endTime: timestampToDate(item.endtime), + urls: noticeRes[1], + }); + } + } + } + return { pages, info }; + } + } + getList(pagenumber) { + return axios({ + url: this.url, + params: { + pagesize: 20, + pagenumber: pagenumber, + publishstatus: 2, + bidcategoryid: 1442, + iflongpro: 0, + _: Date.now(), + }, + method: "get", + }) + .then((res) => { + let result = res.data; + if (result.code === "success") { + return [null, result]; + } else { + 
return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + getNoticeUrl(id) { + let timestamp = Date.now(); + return axios({ + url: `https://glzb.geely.com/gpmp/notice/query?_=${timestamp}&pjtnoticeid=${id}`, + method: "get", + }) + .then((res) => { + let result = res.data; + if (result.code === "success") { + let promises = []; + for (let item of result.data.attachs) { + let params = { + name: item.attachname, + downloadUrl: item.downloadUrl, + previewUrl: item.previewUrl, + attachname: item.attachname, + _: Date.now(), + }; + promises.push( + axios({ + url: `https://glzb.geely.com/pub/file/info/preview`, + method: "get", + params, + }) + ); + } + return Promise.allSettled(promises).then((results) => { + let urls = []; + results.forEach((result) => { + if ( + result.status === "fulfilled" && + result.value.data.code === "success" + ) { + urls.push(result.value.data.data); + } + }); + return [null, urls]; + }); + } else { + return ["err", null]; + } + }) + .catch((err) => { + console.log("err:", err); + return [err, null]; + }); + } + + // writeFile(info) { + // fs.writeFileSync(this.filepath, JSON.stringify(info), "utf-8"); + // } +} + +new GEELY(); diff --git a/greatWall.js b/greatWall.js new file mode 100644 index 0000000..eb86488 --- /dev/null +++ b/greatWall.js @@ -0,0 +1,234 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class GreatWall { + constructor() { + this.jsonMap = [ + { + name: "长城公开寻源", + info: [], + options: { + name: "长城公开寻源", + url: "https://srm.gwm.cn/cloud-srm/api-sou/sou-firstPage/souReqlistPage", + }, + }, + { + name: "长城招募公示大厅", + info: [], + options: { + name: "长城招募公示大厅", + url: "https://srm.gwm.cn/cloud-srm/api-sou/api-ql/Recruit/visitList", + data: { + type: "Recruit", + lang: "zh-cn", + query: { "*": {} }, + payload: { 
+ filter: {}, + page: { sort: "lastUpdateDate desc", pageNum: 1, pageSize: 8 }, + }, + action: "visitList", + tree: true, + }, + }, + }, + ]; + console.log("长城 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + // 全量爬取 + loopFetchFull(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + additional: props.options, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); + // this.writeFile(props); + this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 
全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error(`${props.options.name}增量爬取失败:`, error); + } + } + async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + if (config.data) { + // 招募公示大厅 + let arr = result[1].data.records; + let pages = result[1].data.pageCount; + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime, publishTime; + endTime = item.deadlineTime; + publishTime = item.publishTime; + // 命中关键词 + if (keywordsInclude(item.title)) { + info.push({ + id: item.recruitId, + name: item.title, + publishTime: publishTime, + endTime: endTime, + urls: `https://srm.gwm.cn/#/portalBidding/vendorBiddingDetail?id=${item.recruitId}`, + }); + } + } + return { pages, info }; + } else { + // 公开寻源 + let arr = result[1].data.list; + let pages = result[1].data.pages; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime, publishTime; + endTime = item.publicEndTime; + publishTime = item.releaseDate; + // 命中关键词 + if (keywordsInclude(item.projectName)) { + info.push({ + id: item.reqHeadId, + name: item.projectName, + publishTime: publishTime, + endTime: endTime, + urls: `https://srm.gwm.cn/#/portal?id=${item.reqHeadId}`, + }); + } + } + return { pages, info }; + } + } + } + // 分页获取数据 + getList(pagenumber, config) { + let data = {}; + if (config.data) { + data = config.data; + data.payload.page.pageNum = pagenumber; + } else { + data = { pageNum: pagenumber, pageSize: 8 }; + } + return axios({ + url: config.url, + data: data, + method: "post", + 
}) + .then((res) => { + let result = res.data; + if (result.code == "0") { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + // writeFile(props) { + // fs.writeFileSync(props.filepath, JSON.stringify(props.info), "utf-8"); + // } + + // extractDeadlineTime(html) { + // // 匹配"预告报名截止时间:"后面的时间格式 + // const regex = /预告报名截止时间:(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/; + // const match = html.match(regex); + + // if (match) { + // return match[1]; + // } + + // return null; + // } +} + +new GreatWall(); diff --git a/jianghuai.js b/jianghuai.js new file mode 100644 index 0000000..5963124 --- /dev/null +++ b/jianghuai.js @@ -0,0 +1,385 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import JSON5 from "json5"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class JiangHuai { + constructor(jsonMap) { + this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 }); + this.axiosInstance.interceptors.request.use((config) => { + // 添加cookie到请求头 + const cookieString = Array.from(this.cookiePair.entries()) + .map(([name, value]) => `${name}=${value}`) + .join("; "); + config.headers.Cookie = cookieString; + return config; + }); + this.axiosInstance.interceptors.response.use( + (response) => { + // 更新cookie到请求头 + let cookieArr = response.headers["set-cookie"]; + this.extractCookie(cookieArr); + return response; + }, + (error) => { + return Promise.reject(error); + } + ); + this.cookiePair = new Map(); + this.csrfToken = ""; + this.jsonMap = jsonMap; + // [ + // { + // name: "江淮【招标公告】", + // info: [], + // options: { + // name: "江淮【招标公告】", + // url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + // data: { + // pageIndex: 1, + // type: 1, + // companyId: "", + // title: "", + // ntype: 1, + // start_time: "", + // end_time: "", + // child: "", + // 
tenderType: 3, + // }, + // }, + // }, + // { + // name: "江淮【变更/澄清公告】", + // info: [], + // options: { + // name: "江淮【变更/澄清公告】", + // url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + // data: { + // pageIndex: 1, + // type: 1, + // companyId: "", + // title: "", + // ntype: "4,6", + // start_time: "", + // end_time: "", + // child: "", + // tenderType: 3, + // }, + // }, + // }, + // ]; + console.log("江淮 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + async initializeCookie() { + try { + let headers = { + headers: { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + Accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + Pragma: "no-cache", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Upgrade-Insecure-Requests": "1", + }, + }; + const homeResponse = await this.axiosInstance.get( + "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1", + headers + ); + // 提取csrf-token + let tokenMatch = homeResponse.data.match( + / { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); 
+ this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + console.log("开始增量爬取"); + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error(`${props.options.name}增量爬取失败:`, error); + } + } + async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败: ", result[0]); + return { pages: 0, info: [] }; + } else { + // 公开寻源 + let arr = result[1].list; + let total = result[1].total; + let pages = Math.ceil(total / 10); + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime, publishTime; + publishTime = new Date(item.startTime).toLocaleDateString(); + endTime = new Date(item.endTime).toLocaleDateString(); + // 命中关键词 + if ( + keywordsInclude(item.noticeTitle) && + item.endTime && + +new Date(item.endTime) >= Date.now() + ) { + console.log("处理项目:", item.noticeTitle); + info.push({ + id: item.bulletinSID, + name: 
item.noticeTitle, + publishTime: publishTime, + endTime: endTime, + urls: `https://ahjhqc.youzhicai.com/${item.Url}`, + }); + } + } + return { pages, info }; + } + } + async getList(pagenumber, config) { + let data = config.data; + data.pageIndex = pagenumber; + let headers = { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://ahjhqc.youzhicai.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: + "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + "X-Csrf-Token": this.csrfToken, + }; + try { + const response = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + let result = JSON5.parse(response.data); + if (result.list && result.list.length > 0) { + return [null, result]; + } else { + return ["err", null]; + } + } catch (err) { + console.log("cookie不对"); + try { + await this.initializeCookie(); + headers["X-Csrf-Token"] = this.csrfToken; + const retryResponse = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + // console.log(retryResponse.data); + let result = JSON5.parse(retryResponse.data); + if (result.list && result.list.length > 0) { + return [null, result]; + } else { + return ["err", null]; + } + } catch (retryErr) { + return [retryErr, null]; + } + } + } + // 分页获取数据 + // getList(pagenumber, config) { + // let data = config.data; + // data.pageIndex = pagenumber; + // return axios({ + // url: 
config.url, + // data: data, + // method: "post", + // headers: { + // "Content-Type": "application/x-www-form-urlencoded", + // }, + // }) + // .then((res) => { + // let result = res.data; + // if (result.list && result.list.length > 0) { + // return [null, result]; + // } else { + // return ["err", null]; + // } + // }) + // .catch((err) => { + // return [err, null]; + // }); + // } +} + +new JiangHuai([ + { + name: "江淮【招标公告】", + info: [], + options: { + name: "江淮【招标公告】", + url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + data: { + pageIndex: 1, + type: 1, + companyId: "", + title: "", + ntype: 1, + start_time: "", + end_time: "", + child: "", + tenderType: 3, + }, + }, + }, +]); +new JiangHuai([ + { + name: "江淮【变更/澄清公告】", + info: [], + options: { + name: "江淮【变更/澄清公告】", + url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + data: { + pageIndex: 1, + type: 1, + companyId: "", + title: "", + ntype: "4,6", + start_time: "", + end_time: "", + child: "", + tenderType: 3, + }, + }, + }, +]); diff --git a/leapMotor.js b/leapMotor.js new file mode 100644 index 0000000..d99e73e --- /dev/null +++ b/leapMotor.js @@ -0,0 +1,193 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +// import cheerio from "cheerio"; + +class LeapMotor { + constructor() { + this.url = + "https://lpsrm.leapmotor.com/cloud-srm/api-inq/inq-anon/reqhead/listPage"; + this.info = []; + console.log("零跑 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("零跑"); + if (announcements.length > 0) { + // console.log(announcements); + await this.increment(); + } else { + await this.fullFetch(); + } + } + // 
全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + this.queue.saveAnnouncements("零跑", this.info); + this.queue.addMessage("零跑", this.info); + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + // 判断数据是否存在 + try { + let newInfo = this.queue.filterNewAnnouncements( + "零跑", + result.info + ); + // 有新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + + this.queue.saveAnnouncements("零跑", newInfo); + this.queue.addMessage("零跑", newInfo); + + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + // 传入页码获取数据 + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + // let total = result[1].data.total; + let pages = result[1].data.pages; + let arr = 
result[1].data.list; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + // 命中关键词 + if (keywordsInclude(item.souReqTitile)) { + console.log("处理项目:", item.reqHeadId, item.souReqTitile); + let noticeRes = await this.getNoticeUrl(item.reqHeadId); + if (noticeRes[0]) { + // 获取招标公告内容报错 + console.error("获取公告链接失败:", noticeRes[0]); + } else { + info.push({ + id: item.reqHeadId, + name: item.souReqTitile, + publishTime: item.publishTime, + endTime: item.expirationTime, + urls: noticeRes[1], + }); + } + } + } + return { pages, info }; + } + } + getList(pagenumber) { + return axios({ + url: this.url, + data: { + pageNum: pagenumber, + pageSize: 8, + }, + method: "post", + }) + .then((res) => { + let result = res.data; + if (result.code === "0") { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + getNoticeUrl(id) { + return axios({ + url: `https://lpsrm.leapmotor.com/cloud-srm/api-inq/inq-anon/pj/reqhead/get?id=${id}`, + method: "get", + }) + .then((res) => { + let result = res.data; + if (result.code === "0") { + return [null, result.data.extNoticeLink]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + console.log("err:", err); + return [err, null]; + }); + } + + // writeFile(info) { + // fs.writeFileSync(this.filepath, JSON.stringify(info), "utf-8"); + // } +} + +new LeapMotor(); diff --git a/mailer.js b/mailer.js new file mode 100644 index 0000000..b9c2ebb --- /dev/null +++ b/mailer.js @@ -0,0 +1,100 @@ +import nodemailer from "nodemailer"; +import path from "path"; + +class EmailSender { + constructor(config) { + this.transporter = nodemailer.createTransport(config); + this.defaultFrom = config.auth.user; + } + async sendEmail(options) { + try { + const mailOptions = { + from: options.from || this.defaultFrom, + to: options.to, + cc: options.cc, + bcc: options.bcc, + subject: options.subject, + text: options.text, + html: options.html, + attachments: 
options.attachments || [], + }; + + const info = await this.transporter.sendMail(mailOptions); + console.log(`邮件发送成功: ${options.to} - ${info.messageId}`); + return { success: true, messageId: info.messageId }; + } catch (error) { + console.error(`邮件发送失败: ${options.to} -`, error.message); + throw error; + } + } + async sendBasicEmail(to, subject, content) { + return await this.sendEmail({ to, subject, html: content }); + } + + async sendEmailWithAttachments(to, subject, content, attachmentPath) { + const attachments = []; + if (attachmentPath) { + attachments.push({ + filename: path.basename(attachmentPath), + path: attachmentPath, + }); + } + return await this.sendEmail({ to, subject, html: content, attachments }); + } + + async sendBulkEmail(recipients, subject, content) { + const results = []; + for (const recipient of recipients) { + try { + const result = await this.sendEmail({ + to: recipient, + subject, + html: content, + }); + results.push({ recipient, success: true, result }); + } catch (error) { + results.push({ recipient, success: false, error: error.message }); + } + await new Promise((resolve) => setTimeout(resolve, 1000)); + } + return results; + } + + async testConnection() { + try { + await this.transporter.verify(); + console.log("邮件服务器连接成功"); + return true; + } catch (error) { + console.error("邮件服务器连接失败:", error); + return false; + } + } +} + +// async function example() { +// let emailSender = new EmailSender({ +// host: "smtp.exmail.qq.com", +// port: 465, +// secure: true, +// auth: { +// user: "jiqiren@axbbaoxian.com", +// pass: "Am13579q", +// }, +// }); +// const isConnected = await emailSender.testConnection(); +// if (!isConnected) { +// console.log("邮件服务器连接失败"); +// return; +// } +// emailSender.sendBasicEmail( +// "cpw@axbbaoxian.com", +// "测试邮件", +// "这是测试邮件内容" +// ); +// } + +// example().catch((err) => { +// console.error("程序错误:", err); +// }); +export { EmailSender }; diff --git a/msgManager.js b/msgManager.js new file mode 100644 
index 0000000..26b57dd --- /dev/null +++ b/msgManager.js @@ -0,0 +1,212 @@ +// msgQueue.js - 基于事件的消息队列 +import { EventEmitter } from "events"; +import fs from "fs"; +import path from "path"; +import { EmailSender } from "./mailer.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +import { md5 } from "./utils.js"; +import axios from "axios"; + +class MessageQueue extends EventEmitter { + constructor() { + super(); + this.queue = new SQLiteMessageQueue(); + this.processing = false; + // this.queueFile = path.resolve("message_queue.json");K + this.emailSender = new EmailSender({ + host: "smtp.exmail.qq.com", + port: 465, + secure: true, + auth: { + user: "jiqiren@axbbaoxian.com", + pass: "Am13579q", + }, + }); + this.recipients = [ + "huzhengrong@axbbaoxian.com", + ]; + + // 启动处理器 + this.startProcessor(); + } + + // 添加消息到队列 + + // 处理队列 + async startProcessor() { + setInterval(async () => { + // 清除状态 不等于 pending的数据 + console.log("开始处理队列"); + try { + const pendingMessages = this.queue.getPendingMessages(); + if (!this.processing && pendingMessages.length > 0) { + await this.processQueue(pendingMessages); + } + } catch (error) { + console.error(`❌ 获取待处理消息失败:`, error); + } + }, 60 * 60 * 1000); // 1h处理一次 + } + + async processQueue(pendingMessages) { + this.processing = true; + + let msgMap = {}; + for (const message of pendingMessages) { + try { + console.log(`📧 处理消息: ${message.spider_name}`); + // console.log(typeof message.data); + // let formdata = JSON.parse(message.data); + if (!msgMap[message.spider_name]) { + msgMap[message.spider_name] = message.data; + } else { + msgMap[message.spider_name].push(...message.data); + } + + message.status = "sent"; + message.sent_at = new Date().toISOString(); + this.queue.updateMessageStatus( + message.id, + message.status, + message.sent_at + ); + } catch (error) { + console.error(`❌ 消息处理失败: ${message.id}`, error); + message.status = "failed"; + message.error_message = error.message; + this.queue.updateMessageStatus( + 
message.id, + message.status, + null, + // message.sent_at, + message.error_message + ); + } + } + let html = ""; + for (const spiderName in msgMap) { + html += this.generateTable(spiderName, msgMap[spiderName]); + } + try { + this.emailSender.sendBulkEmail(this.recipients, "招标项目最新公告", html); + } catch (error) { + console.error(`❌ 通知发送失败: ${error}`); + } + + this.processing = false; + } + + generateTable(spiderName, data) { + let tableHtml = ` +
+

+ 🕷️ ${spiderName} (${data.length} 条新增) +

+ +
+ + + + + + + + + + + + `; + data.forEach((item, index) => { + const rowColor = index % 2 === 0 ? "#f8f9fa" : "white"; + // const publishTime = this.formatDateTime(item.publishTime); + // const endTime = this.formatDateTime(item.endTime); + const urls = this.formatUrls(item.urls); + + tableHtml += ` + + + + + + + + `; + }); + + tableHtml += ` + +
序号项目名称发布时间截止时间查看详情
+ ${index + 1} + +
+ ${item.name} +
+ +
+ ${item.publishTime} + +
${item.endTime}
+
+ ${urls} +
+
+
+ `; + + return tableHtml; + } + + getSign(timestamp) { + let secret = "cpwyyds"; + let uri = "/common/message/push"; + const url = uri + timestamp + secret; + const myCalc = md5(url); + let sign = + myCalc.substring(5, 13) + + myCalc.substring(29, 31) + + myCalc.substring(18, 27); + //sign 转大写 + sign = sign.toUpperCase(); + return sign; + } + + formatUrls(urls) { + if (!urls) { + return '无链接'; + } + + // 处理数组形式的URLs + if (Array.isArray(urls)) { + if (urls.length === 0) { + return '无链接'; + } + + if (urls.length === 1) { + return `📄 查看`; + } + + // 多个链接的情况 + let linksHtml = '
'; + urls.forEach((url, index) => { + linksHtml += `📄 链接${ + index + 1 + }`; + }); + linksHtml += "
"; + return linksHtml; + } + + // 处理字符串形式的URL + if (typeof urls === "string") { + return `📄 查看`; + } + + return '链接格式错误'; + } +} + +const messageQueue = new MessageQueue(); + +export { messageQueue }; + +// export default MessageQueue; diff --git a/nio.js b/nio.js new file mode 100644 index 0000000..c6d43f7 --- /dev/null +++ b/nio.js @@ -0,0 +1,170 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { + timestampToDate, + loopCall, + keywordsInclude, + getYiqiNoticeUrl, + parseToGgDetailsParams, +} from "./utils.js"; +import config from "./config.js"; +import * as cheerio from "cheerio"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class NIO { + constructor() { + // this.filepath = path.resolve("yiqi.json"); + this.info = []; + console.log("蔚来 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("蔚来"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + if (this.info.length > 0) { + this.queue.saveAnnouncements("蔚来", this.info); + // this.writeFile(this.info); + this.queue.addMessage("蔚来", this.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + 
this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + "蔚来", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // this.info.push(...newInfo); + this.queue.saveAnnouncements("蔚来", newInfo); + // this.writeFile(this.info); + this.queue.addMessage("蔚来", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getHtml(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + let pages = 1; + let html = result[1]; + const $ = cheerio.load(html); + let jsonStr = $("#__NEXT_DATA__").text(); + let data = JSON.parse(jsonStr).props.pageProps.tenderNotices; + // console.log(data); + data.forEach((item) => { + let id = item.id; + let name = item.title; + let publishTime = item.publishDate; + let endTime = item.dueTime; + let urls = item.documents[0].url; + if ( + endTime && + +new Date(endTime) >= Date.now() && + keywordsInclude(name) + ) { + info.push({ + id, + name, + publishTime, + endTime, + urls, + }); + } + }); + return { pages, info }; + } + } + // 分页获取数据 + getHtml(pagenumber) { + return axios({ + url: "https://www.nio.cn/partnership/tender-notices", + method: "get", + }) + .then((res) => { + let result = res.data; + return [null, result]; + }) + .catch((err) => { + return 
[err, null]; + }); + } +} + +new NIO(); diff --git a/package.json b/package.json new file mode 100644 index 0000000..6ebac6a --- /dev/null +++ b/package.json @@ -0,0 +1,23 @@ +{ + "name": "net-spider", + "version": "1.0.0", + "description": "", + "main": "index.js", + "type": "module", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1", + "start": "pm2 start ecosystem.config.cjs", + "stop": "pm2 stop all", + "stats": "node stats.js", + "restart": "pm2 restart all" + }, + "author": "", + "license": "ISC", + "dependencies": { + "axios": "^1.12.2", + "better-sqlite3": "^12.4.1", + "cheerio": "^1.1.2", + "json5": "^2.2.3", + "nodemailer": "^7.0.6" + } +} diff --git a/picc.js b/picc.js new file mode 100644 index 0000000..804d1f8 --- /dev/null +++ b/picc.js @@ -0,0 +1,214 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class PICC { + constructor() { + this.info = []; + console.log("中国人民保险 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("中国人民保险"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + if (this.info.length 
> 0) { + this.queue.saveAnnouncements("中国人民保险", this.info); + // this.writeFile(this.info); + this.queue.addMessage("中国人民保险", this.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + "中国人民保险", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // this.info.push(...newInfo); + this.queue.saveAnnouncements("中国人民保险", newInfo); + // this.writeFile(this.info); + this.queue.addMessage("中国人民保险", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + let total = result[1].res.total; + let pages = Math.ceil(total / 10); + let arr = result[1].res.rows; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime = timestampToDate( + new Date(item.tenderFileSaleEndTime).getTime(), + true + ); + // 命中关键词 + if ( + this.keywordsInclude(item.title) && + endTime && + +new Date(endTime) >= Date.now() + ) { + // console.log("处理项目:", item.sourcingId, item.title); + info.push({ + id: item.sourcingId, + name: item.title, + publishTime: 
timestampToDate( + new Date(item.tenderFileSaleBeginTime).getTime(), + true + ), + endTime: endTime, + urls: `https://ec.picc.com/cms/default/webfile${item.url}`, + }); + } + } + return { pages, info }; + } + } + // 分页获取数据 + getList(pagenumber) { + return axios({ + url: "https://ec.picc.com/cms/api/dynamicData/queryContentPage", + data: { + dto:{ + categoryId:"211,213,214,215,216,217", + city:"", + county:"", + purchaseMode:"", + siteId:"725" + }, + pageNo: pagenumber, + pageSize: 10, + }, + method: "post", + headers: { + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'Accept-Encoding': 'gzip, deflate, br, zstd', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Connection': 'keep-alive', + 'Content-Type': 'application/json; charset=UTF-8', + 'Cookie': 'G_rbec_47_11_8080=22685.52745.19855.0000', + 'Host': 'ec.picc.com', + 'Origin': 'https://ec.picc.com', + 'Referer': 'https://ec.picc.com/cms/default/webfile/ywgg1/index.html', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-origin', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36', + 'X-Requested-With': 'XMLHttpRequest', + 'Sec-Ch-Ua': '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"', + 'Sec-Ch-Ua-Mobile': '?0', + 'Sec-Ch-Ua-Platform': "macOS", + } + }) + .then((res) => { + let result = res.data; + console.log("then",result) + if (result.msg === "操作成功" && result.code === 0) { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + console.log('catch', err) + return [err, null]; + }); + } + + keywordsInclude(name) { + let keywords = [ + "保险", + "车险", + "非车险", + "科技", + "大模型", + "承保", + "第三方平台", + ]; + return keywords.some((keyword) => name.includes(keyword)); + } +} + +new PICC(); diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..329d7b1 --- /dev/null +++ b/readme.md @@ -0,0 +1,47 @@ +# 查看指定爬虫详细信息 + +pm2 show 
/**
 * SQLite-backed store shared by the spiders.
 *
 * It keeps two tables in `spider_data.db`:
 *  - `announcements`: every announcement a spider has seen, keyed by `id`
 *    (the key is global, not per spider);
 *  - `messages`: batches of announcements waiting to be pushed, with a
 *    `status` column (`pending` by default, plus `failed` / `sent` as read by
 *    the queries below).
 */
class SQLiteMessageQueue {
  // Live instances. Process-level shutdown hooks are installed only once and
  // close every instance, instead of each instance adding its own listeners.
  static openInstances = new Set();
  static shutdownHooksInstalled = false;

  constructor() {
    this.db = new Database("spider_data.db");
    this.init();
    this.setupGracefulShutdown();
  }

  /** Creates tables and indexes if missing and prepares the reusable statements. */
  init() {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS announcements (
        id TEXT PRIMARY KEY,
        spider_name TEXT NOT NULL,
        name TEXT NOT NULL,
        publish_time TEXT,
        end_time TEXT,
        urls TEXT,
        created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
        updated_at TEXT
      )
    `);

    this.db.exec(`
      CREATE TABLE IF NOT EXISTS messages (
        id TEXT PRIMARY KEY,
        spider_name TEXT NOT NULL,
        data TEXT NOT NULL,
        timestamp TEXT NOT NULL,
        status TEXT DEFAULT 'pending',
        sent_at TEXT,
        error_message TEXT
      )
    `);

    this.db.exec(`
      CREATE INDEX IF NOT EXISTS idx_announcements_spider ON announcements(spider_name);
      CREATE INDEX IF NOT EXISTS idx_announcements_time ON announcements(publish_time);
      CREATE INDEX IF NOT EXISTS idx_announcements_created ON announcements(created_at);
      CREATE INDEX IF NOT EXISTS idx_status ON messages(status);
      CREATE INDEX IF NOT EXISTS idx_spider_status ON messages(spider_name, status);
      CREATE INDEX IF NOT EXISTS idx_timestamp ON messages(timestamp);
    `);

    this.insertAnnouncementStmt = this.db.prepare(`
      INSERT OR REPLACE INTO announcements
      (id, spider_name, name, publish_time, end_time, urls, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    `);

    this.getAnnouncementStmt = this.db.prepare(`
      SELECT * FROM announcements WHERE id = ?
    `);

    this.getAnnouncementsBySpiderStmt = this.db.prepare(`
      SELECT * FROM announcements WHERE spider_name = ?
      ORDER BY created_at DESC
    `);

    this.checkAnnouncementExistsStmt = this.db.prepare(`
      SELECT COUNT(*) as count FROM announcements WHERE id = ?
    `);

    // Prepared once and reused for every queue operation.
    this.insertStmt = this.db.prepare(`
      INSERT INTO messages (id, spider_name, data, timestamp, status)
      VALUES (?, ?, ?, ?, ?)
    `);

    this.getPendingStmt = this.db.prepare(`
      SELECT * FROM messages WHERE status = 'pending'
      ORDER BY timestamp ASC
    `);

    this.getFailedStmt = this.db.prepare(`
      SELECT * FROM messages WHERE status = 'failed'
      ORDER BY timestamp ASC
    `);

    this.updateStatusStmt = this.db.prepare(`
      UPDATE messages
      SET status = ?, sent_at = ?, error_message = ?
      WHERE id = ?
    `);
  }

  /**
   * Inserts or replaces one announcement.
   *
   * @param {string} spiderName - Spider that produced the announcement.
   * @param {{id: string, name: string, publishTime: string, endTime: string, urls: string}} announcement
   * @returns {boolean} `true` when no row with this id existed before the call.
   */
  saveAnnouncement(spiderName, announcement) {
    const now = new Date().toISOString();
    // One lookup answers both "does it exist?" and "what was created_at?".
    const existing = this.getAnnouncement(announcement.id);
    const isNew = !existing;

    this.insertAnnouncementStmt.run(
      announcement.id,
      spiderName,
      announcement.name,
      announcement.publishTime,
      announcement.endTime,
      announcement.urls,
      // INSERT OR REPLACE rewrites the whole row, so carry created_at over.
      existing?.created_at || now,
      now
    );

    return isNew;
  }

  /**
   * Saves a batch of announcements inside one transaction.
   *
   * @param {string} spiderName
   * @param {Array<object>} announcements
   * @returns {Array<object>} The subset that was not in the table before.
   */
  saveAnnouncements(spiderName, announcements) {
    const newAnnouncements = [];

    const saveMany = this.db.transaction((batch) => {
      for (const announcement of batch) {
        if (this.saveAnnouncement(spiderName, announcement)) {
          newAnnouncements.push(announcement);
        }
      }
    });

    saveMany(announcements);

    console.log(`💾 ${spiderName}: 保存 ${announcements.length} 条公告`);
    return newAnnouncements;
  }

  /** @returns {boolean} Whether a row with this id exists (any spider). */
  isAnnouncementExists(announcementId) {
    const result = this.checkAnnouncementExistsStmt.get(announcementId);
    return result.count > 0;
  }

  /** @returns {object|undefined} The stored row for `id`, if any. */
  getAnnouncement(id) {
    return this.getAnnouncementStmt.get(id);
  }

  /** @returns {Array<object>} All rows of one spider, newest `created_at` first. */
  getAnnouncementsBySpider(spiderName) {
    return this.getAnnouncementsBySpiderStmt.all(spiderName);
  }

  /**
   * Deletes every announcement stored for `spiderName`.
   * @returns {number} Number of deleted rows.
   */
  deleteAnnouncementsBySpider(spiderName) {
    const stmt = this.db.prepare(
      `DELETE FROM announcements WHERE spider_name = ?`
    );
    const info = stmt.run(spiderName);
    console.log(`🗑️ 删除 ${spiderName} 的公告,共删除 ${info.changes} 条`);
    return info.changes;
  }

  /**
   * Returns the announcements whose id is not stored yet.
   * `spiderName` is accepted for symmetry with the other methods but is not
   * part of the lookup: ids are checked across all spiders.
   */
  filterNewAnnouncements(spiderName, announcements) {
    return announcements.filter(
      (announcement) => !this.isAnnouncementExists(announcement.id)
    );
  }

  // ======================
  // Message queue methods
  // ======================

  /**
   * Queues `data` (JSON-serialised) as a pending message.
   * @returns {string} The generated message id.
   */
  addMessage(spiderName, data) {
    const message = {
      // slice(2, 11) yields the same nine characters the legacy substr(2, 9) did.
      id: `${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      spider_name: spiderName,
      data: JSON.stringify(data),
      timestamp: new Date().toISOString(),
      status: "pending",
    };
    this.insertStmt.run(
      message.id,
      message.spider_name,
      message.data,
      message.timestamp,
      message.status
    );
    console.log(`📤 添加消息到队列: ${spiderName} - ${data.length} 条数据`);
    return message.id;
  }

  /** @returns {Array<object>} Pending messages, oldest first, with `data` parsed. */
  getPendingMessages() {
    return this.getPendingStmt
      .all()
      .map((row) => ({ ...row, data: JSON.parse(row.data) }));
  }

  /** @returns {Array<object>} Failed messages, oldest first, with `data` parsed. */
  getFailedMessages() {
    return this.getFailedStmt
      .all()
      .map((row) => ({ ...row, data: JSON.parse(row.data) }));
  }

  /** Updates status, sent time and error text of one message. */
  updateMessageStatus(id, status, sentAt = null, errorMessage = null) {
    this.updateStatusStmt.run(status, sentAt, errorMessage, id);
  }

  /**
   * Imports a legacy JSON array file into the announcements table.
   * @returns {number} Number of imported entries; 0 when the file is missing,
   *   empty, not an array, or the import failed.
   */
  migrateFromJsonFile(spiderName, jsonFilePath) {
    try {
      if (!fs.existsSync(jsonFilePath)) {
        console.log(`📁 ${jsonFilePath} 不存在,跳过迁移`);
        return 0;
      }

      const data = JSON.parse(fs.readFileSync(jsonFilePath, "utf-8"));
      if (!Array.isArray(data) || data.length === 0) {
        console.log(`📁 ${jsonFilePath} 数据为空,跳过迁移`);
        return 0;
      }

      const migrateMany = this.db.transaction((batch) => {
        for (const announcement of batch) {
          this.saveAnnouncement(spiderName, announcement);
        }
      });

      migrateMany(data);
      console.log(`🔄 成功迁移 ${data.length} 条 ${spiderName} 数据到数据库`);
      return data.length;
    } catch (error) {
      console.error(`❌ 迁移 ${jsonFilePath} 失败:`, error);
      return 0;
    }
  }

  /** Deletes `sent` messages whose `sent_at` is older than `daysBefore` days. */
  cleanOldMessages(daysBefore = 30) {
    const cutoffDate = new Date();
    cutoffDate.setDate(cutoffDate.getDate() - daysBefore);

    const stmt = this.db.prepare(`
      DELETE FROM messages
      WHERE status = 'sent' AND sent_at < ?
    `);

    const result = stmt.run(cutoffDate.toISOString());
    console.log(`🧹 清理了 ${result.changes} 条旧消息`);
  }

  /**
   * @returns {{announcements: Array<object>, messages: Array<object>}}
   *   Announcement counts per spider, and the pending messages
   *   (status, raw data, sent_at).
   */
  getStats() {
    const announcements = this.db
      .prepare(
        `
        SELECT spider_name, COUNT(*) as count
        FROM announcements
        GROUP BY spider_name
      `
      )
      .all();

    const messages = this.db
      .prepare(
        `
        SELECT status, data, sent_at
        FROM messages WHERE status = 'pending'
      `
      )
      .all();

    return { announcements, messages };
  }

  /**
   * Registers this instance for shutdown cleanup. The process listeners are
   * installed once per process and close every live instance before exiting.
   */
  setupGracefulShutdown() {
    SQLiteMessageQueue.openInstances.add(this);
    if (SQLiteMessageQueue.shutdownHooksInstalled) {
      return;
    }
    SQLiteMessageQueue.shutdownHooksInstalled = true;

    const closeAll = () => {
      // Copy first: close() removes entries from the set while we iterate.
      for (const instance of [...SQLiteMessageQueue.openInstances]) {
        try {
          instance.close();
        } catch (error) {
          console.error("关闭数据库失败:", error);
        }
      }
    };

    process.on("SIGINT", () => {
      console.log("收到 SIGINT 信号,正在关闭数据库...");
      closeAll();
      process.exit(0);
    });

    process.on("SIGTERM", () => {
      console.log("收到 SIGTERM 信号,正在关闭数据库...");
      closeAll();
      process.exit(0);
    });

    process.on("uncaughtException", (error) => {
      console.error("未捕获异常:", error);
      closeAll();
      process.exit(1);
    });

    process.on("unhandledRejection", (reason, promise) => {
      console.error("未处理的Promise拒绝:", reason);
      closeAll();
      process.exit(1);
    });
  }

  /** Closes the database connection; safe to call more than once. */
  close() {
    SQLiteMessageQueue.openInstances.delete(this);
    if (this.db.open) {
      this.db.close();
    }
  }
}
/**
 * Spider for the chinabidding.com project search.
 *
 * Each entry of `jsonMap` describes one search:
 * `{ name, info: [], options: { name, url, data } }`.
 * For every entry the spider runs a full crawl when the database holds no
 * announcements for `name`, otherwise it goes straight to incremental polling.
 * Cookies returned by the site are kept in `cookiePair` and replayed on
 * every request made through `axiosInstance`.
 */
class Third {
  constructor(jsonMap) {
    this.cookiePair = new Map();
    this.jsonMap = jsonMap;

    this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 });
    // Attach the cookies collected so far to every outgoing request.
    this.axiosInstance.interceptors.request.use((requestConfig) => {
      requestConfig.headers.Cookie = Array.from(this.cookiePair.entries())
        .map(([name, value]) => `${name}=${value}`)
        .join("; ");
      return requestConfig;
    });
    // Remember any cookie the server sets.
    this.axiosInstance.interceptors.response.use(
      (response) => {
        this.extractCookie(response.headers["set-cookie"] || []);
        return response;
      },
      (error) => Promise.reject(error)
    );

    console.log("三方平台 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /** Entry point; logs instead of throwing so construction never rejects. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Chooses full or incremental crawling per configured search. */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /** Browser-like headers shared by the cookie bootstrap and the list request. */
  buildHeaders() {
    return {
      Accept: "text/plain, */*; q=0.01",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "no-cache",
      "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      Origin: "https://www.chinabidding.com",
      Pragma: "no-cache",
      Priority: "u=1, i",
      Referer: "https://www.chinabidding.com/search/proj.htm",
      "Sec-Ch-Ua":
        '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "empty",
      "Sec-Fetch-Mode": "cors",
      "Sec-Fetch-Site": "same-origin",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest",
    };
  }

  /**
   * Loads the search page once; the response interceptor stores the cookies.
   * @throws Rethrows the request error after logging it.
   */
  async initializeCookie() {
    try {
      await this.axiosInstance.get(
        "https://www.chinabidding.com/search/proj.htm",
        { headers: this.buildHeaders() }
      );
    } catch (err) {
      console.log("err", err);
      throw err;
    }
  }

  /**
   * Stores the `name=value` pair of each Set-Cookie header line.
   * Only the first "=" separates name from value, so values that themselves
   * contain "=" (for example base64 padding) are kept intact.
   */
  extractCookie(cookieArr) {
    for (const cookie of cookieArr) {
      const pair = cookie.split(";")[0];
      const separator = pair.indexOf("=");
      if (separator === -1) {
        continue; // not a name=value pair; nothing usable to replay
      }
      this.cookiePair.set(
        pair.slice(0, separator),
        pair.slice(separator + 1)
      );
    }
  }

  /**
   * Full crawl: walks the pages until `stopWhen` fires, stores everything
   * collected, then switches to incremental polling.
   */
  loopFetchFull(props) {
    console.log("开始全量爬取");
    // loopCall is async; a surrounding try/catch would not see its failures.
    loopCall(this.getInfo.bind(this), {
      time: config.fullFetchTime,
      pagenumber: 1,
      additional: props.options,
      stopWhen: (pagenumber, result) =>
        pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
      readyForNext: (pagenumber, result) => {
        props.info.push(...result.info);
        return pagenumber + 1;
      },
      complete: (result) => {
        props.info.push(...result.info);
        console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
        try {
          if (props.info.length > 0) {
            this.queue.saveAnnouncements(props.name, props.info);
            this.queue.addMessage(props.name, props.info);
          }
        } catch (error) {
          console.error("数据库操作失败:", error);
        }
        this.loopFetchIncrement(props);
      },
    }).catch((error) => {
      console.error(`${props.options.name}全量爬取失败:`, error);
    });
  }

  /**
   * Incremental polling: re-reads page 1 on every tick and only moves to the
   * next page while a whole page consists of unseen announcements.
   */
  loopFetchIncrement(props) {
    console.log("开始增量爬取");
    loopCall(this.getInfo.bind(this), {
      time: config.incrementFetchTime,
      pagenumber: 1,
      additional: props.options,
      readyForNext: (pagenumber, result) => {
        try {
          const newInfo = this.queue.filterNewAnnouncements(
            props.name,
            result.info
          );
          if (newInfo.length === 0) {
            console.log("没有发现新数据,继续监控...");
            return 1;
          }
          console.log(`发现 ${newInfo.length} 条新数据`);
          this.queue.saveAnnouncements(props.name, newInfo);
          this.queue.addMessage(props.name, newInfo);
          // A page made only of new items means older pages may hold more.
          return newInfo.length === result.info.length ? pagenumber + 1 : 1;
        } catch (error) {
          console.error("数据库操作失败:", error);
          return 1; // restart from the first page on the next tick
        }
      },
    }).catch((error) => {
      console.error(`${props.options.name}增量爬取失败:`, error);
    });
  }

  /**
   * Fetches a notice page.
   * @returns {Promise<*>} The response body, or the string "err" on failure.
   */
  async getNoticeDetail(url) {
    try {
      const result = await axios.get(url);
      return result.data;
    } catch (err) {
      console.error("获取公告详情失败:", err);
      return "err";
    }
  }

  /**
   * Fetches one result page and extracts the entries whose title matches the
   * keyword filter.
   *
   * @param {number} pagenumber - 1-based page index.
   * @param {{name: string, url: string, data: object}} config - Search options.
   * @returns {Promise<{pages: number, info: Array<object>}>} `pages` is 0
   *   when the request failed, otherwise the fixed crawl depth of 3.
   */
  async getInfo(pagenumber = 1, config) {
    const info = [];
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, html] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败: ", error);
      return { pages: 0, info: [] };
    }

    const pages = 3;
    const $ = cheerio.load(html);
    $(".as-pager-body li").each((index, element) => {
      const url = $(element).find(".as-pager-item").attr("href");
      const name = $(element).find(".txt").attr("title");
      // A list item without link or title cannot be stored; skipping it keeps
      // one malformed row from aborting the whole crawl loop.
      if (!url || !name) {
        return;
      }
      const idMatch = url.match(/\/bidDetail\/(\d+)\.html/);
      // Fall back to the link itself so entries with an unexpected URL shape
      // do not all collapse onto the same empty primary key.
      const id = idMatch ? idMatch[1] : url;
      if (keywordsInclude(name)) {
        console.log("处理项目:", name);
        info.push({
          id,
          name,
          urls: url,
          publishTime: "--",
          endTime: "--",
        });
      }
    });
    return { pages, info };
  }

  /**
   * Posts the search form for one page. On any failure the cookies are
   * refreshed and the request is retried once.
   *
   * @returns {Promise<[Error|null, string|null]>} `[null, html]` on success,
   *   `[error, null]` when the retry failed as well.
   */
  async getList(pagenumber, config) {
    // Copy so the caller's option object is not modified between runs.
    const data = { ...config.data, currentPage: pagenumber };
    const request = {
      url: config.url,
      data,
      method: "post",
      headers: this.buildHeaders(),
    };
    try {
      const response = await this.axiosInstance(request);
      return [null, response.data];
    } catch (err) {
      console.log("cookie不对");
      try {
        await this.initializeCookie();
        const retryResponse = await this.axiosInstance(request);
        return [null, retryResponse.data];
      } catch (retryErr) {
        return [retryErr, null];
      }
    }
  }
}
/**
 * Computes the MD5 digest of `text`.
 *
 * @param {string|Buffer} text - Data to hash.
 * @param {string} [inputEncoding="utf8"] - Encoding of `text` when it is a string.
 * @param {string} [outputEncoding="hex"] - Encoding of the returned digest.
 * @returns {string} The digest in `outputEncoding`.
 */
function md5(text, inputEncoding = "utf8", outputEncoding = "hex") {
  const hasher = crypto.createHash("md5");
  hasher.update(text, inputEncoding);
  return hasher.digest(outputEncoding);
}
/**
 * Tells whether an announcement title contains at least one of the
 * marketing/content keywords the spiders are interested in.
 *
 * @param {string} name - Announcement title. Anything that is not a string
 *   (for example a missing HTML attribute, which yields `undefined`) is
 *   treated as "no match" instead of throwing.
 * @returns {boolean} `true` when any keyword occurs in `name`.
 */
function keywordsInclude(name) {
  if (typeof name !== "string") {
    return false;
  }
  const keywords = [
    "海外",
    "国际",
    "内容",
    "营销",
    "运营",
    "直播",
    "品牌",
    "事件",
    "策略",
    "传播",
    "执行",
    "社媒",
    "视频",
    "制作",
    "拍摄",
    "效果",
  ];
  return keywords.some((keyword) => name.includes(keyword));
}
/**
 * Extracts the argument list from an inline `toGgDetails(...)` handler, e.g.
 * "toGgDetails('6','642ed424-cd9b-4cb0-8b74-9cc868d8f95a:2','2','1','')".
 *
 * Arguments are split on commas and stripped of surrounding whitespace and
 * of every quote character; an argument that itself contains a comma is
 * therefore not supported.
 *
 * @param {string} funcStr - Value of the element's `onclick` attribute.
 * @returns {string[]|null} The arguments in call order, or `null` when
 *   `funcStr` is not a string or contains no `toGgDetails(...)` call.
 */
function parseToGgDetailsParams(funcStr) {
  // A missing attribute arrives as undefined; report "nothing to parse".
  if (typeof funcStr !== "string") {
    return null;
  }
  const match = funcStr.match(/toGgDetails\(([^)]+)\)/);
  if (!match) {
    return null;
  }
  return match[1]
    .split(",")
    .map((param) => param.trim().replace(/['"]/g, ""));
}
/**
 * Spider for the FAW (一汽) procurement portal at etp.faw.cn.
 *
 * On start it runs a full crawl when the database holds no "一汽"
 * announcements, otherwise it goes straight to incremental polling. New
 * announcements are stored and queued through `SQLiteMessageQueue`.
 */
class YiQi {
  constructor() {
    this.info = [];
    console.log("一汽 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /** Entry point; logs instead of throwing so construction never rejects. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Chooses full or incremental crawling based on what is already stored. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider("一汽");
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walks the pages until `stopWhen` fires, stores everything
   * collected, then hands over to incremental polling.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            // Nothing matched: do not queue an empty message.
            if (this.info.length > 0) {
              this.queue.saveAnnouncements("一汽", this.info);
              this.queue.addMessage("一汽", this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited so a failure reaches start()'s handler instead of floating.
    await this.increment();
  }

  /**
   * Incremental polling: re-reads page 1 on every tick and only moves to the
   * next page while a whole page consists of unseen announcements.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              "一汽",
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements("一汽", newInfo);
            this.queue.addMessage("一汽", newInfo);
            // A page made only of new items means older pages may hold more.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            return 1; // restart from the first page on the next tick
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetches one list page and extracts the announcements that still have a
   * deadline and whose title matches the keyword filter.
   *
   * @param {number} pagenumber - 1-based page index.
   * @returns {Promise<{pages: number, info: Array<object>}>} `pages` is the
   *   fixed crawl depth of 30; `info` is empty when the request failed.
   */
  async getInfo(pagenumber = 1) {
    const info = [];
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [error, html] = await this.getHtml(pagenumber);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: 30, info: [] };
    }

    const pages = 30;
    const $ = cheerio.load(html);
    $(".zl-list-main .zl-col-6").each((index, element) => {
      const item = $(element);
      const name = item.find(".title").text();
      const endTime = item.find(".daojishi").attr("data-time");
      // Keep only entries that still have a deadline and match a keyword.
      if (!endTime || !keywordsInclude(name)) {
        return;
      }

      const id = item.find(".zl-desc-item:contains('项目编号')").text();
      const publishTime = item
        .find(".zl-desc-item:contains('发布时间')")
        .text();
      // The detail link is built from the arguments of the inline
      // toGgDetails(...) handler; it may be absent or in another format.
      const onclick = item.find(".jump").attr("onclick");
      const funcArgs =
        typeof onclick === "string" ? parseToGgDetailsParams(onclick) : null;

      info.push({
        id: id.replace("项目编号:", ""),
        name: name.trim(),
        publishTime: publishTime.replace("发布时间:", "").trim(),
        endTime: timestampToDate(Number(endTime)),
        urls: funcArgs ? getYiqiNoticeUrl(...funcArgs) : "",
      });
    });

    return { pages, info };
  }

  /**
   * Posts the list request for one page.
   * @returns {Promise<[Error|null, string|null]>} `[null, html]` on success,
   *   `[error, null]` on failure.
   */
  async getHtml(pagenumber) {
    try {
      const res = await axios({
        url: "https://etp.faw.cn/gg/allJYTypeGGList?hangYeType=-1&xmLeiXing=&ggStartTimeEnd=&gongGaoType=5&isNew=1",
        data: {
          searchType: "",
          searchText: "",
          currentPage: pagenumber,
        },
        headers: {
          "Content-Type": "application/x-www-form-urlencoded",
        },
        method: "post",
      });
      return [null, res.data];
    } catch (err) {
      return [err, null];
    }
  }
}
from "path"; +import JSON5 from "json5"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +import * as cheerio from "cheerio"; + +class YouZhiCai { + constructor(jsonMap) { + this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 }); + this.axiosInstance.interceptors.request.use((config) => { + // 添加cookie到请求头 + const cookieString = Array.from(this.cookiePair.entries()) + .map(([name, value]) => `${name}=${value}`) + .join("; "); + config.headers.Cookie = cookieString; + return config; + }); + this.axiosInstance.interceptors.response.use( + (response) => { + // 更新cookie到请求头 + let cookieArr = response.headers["set-cookie"] || []; + this.extractCookie(cookieArr); + return response; + }, + (error) => { + return Promise.reject(error); + } + ); + this.cookiePair = new Map(); + // this.csrfToken = ""; + this.jsonMap = jsonMap; + console.log("优质采 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + async initializeCookie() { + try { + let headers = { + headers: { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://www.youzhicai.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: "https://www.youzhicai.com/s/1_1_0_0_.html", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + 
"Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + }, + }; + const homeResponse = await this.axiosInstance.get( + "https://www.youzhicai.com/s/1_1_0_0_.html", + headers + ); + // // 提取csrf-token + // let tokenMatch = homeResponse.data.match( + // / { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); + this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + console.log("开始增量爬取"); + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error(`${props.options.name}增量爬取失败:`, error); + } + } + 
async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败: ", result[0]); + return { pages: 0, info: [] }; + } else { + // 后面的都要验证码 + + // let pages = 2; + let html = result[1]; + const $ = cheerio.load(html); + let total = $("#recommendMsg .info-num-value").text(); + let pages = Math.ceil(total / 15); + if (pages > 2) { + pages = 2; + } + $(".project-li").each((index, element) => { + let id = $(element).find(".project-name0").attr("href"); + let name = $(element).find(".project-name0").attr("title"); + let publishTime = $(element).find(".pub-value0").text(); + let leftDay = $(element).find(".left-day .emOrange:eq(0)").text(); + let endTime = new Date( + +new Date(publishTime) + leftDay * 24 * 60 * 60 * 1000 + ).toLocaleDateString(); + // console.log(endTime); + let urls = "https://www.youzhicai.com" + id; + if (keywordsInclude(name)) { + console.log("处理项目:", name, publishTime, endTime); + info.push({ + id: id, + name: name, + publishTime: publishTime, + endTime: endTime, + urls: urls, + }); + } + }); + return { pages, info }; + } + } + async getList(pagenumber, config) { + let data = config.data; + data.PageIndex = pagenumber; + if (this.cookiePair.get("__RequestVerificationToken")) { + data.__RequestVerificationToken = this.cookiePair.get( + "__RequestVerificationToken" + ); + } + let headers = { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://www.youzhicai.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: "https://www.youzhicai.com/s/1_1_0_0_.html", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + 
"Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + }; + try { + const response = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + let result = response.data; + return [null, result]; + } catch (err) { + console.log("cookie不对"); + try { + await this.initializeCookie(); + data.__RequestVerificationToken = this.cookiePair.get( + "__RequestVerificationToken" + ); + const retryResponse = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + // console.log(retryResponse.data); + let result = retryResponse.data; + return [null, result]; + } catch (retryErr) { + return [retryErr, null]; + } + } + } +} + +new YouZhiCai([ + { + name: "优质采【招标公告】", + info: [], + options: { + name: "优质采【招标公告】", + url: "https://www.youzhicai.com/s/1_1_0_0_.html", + data: { + MsProvince: "", + MsCity: "", + MsStartDate: "", + MsEndDate: "", + AutoOr: 0, + BackOr: 0, + NoticeTitle: "", + searchAccuracy: "precise", + matchType: "precise", + TenderType: "", + MsBidderType: 1, + MsNoticeType: 1, + MsPublishType: 0, + MsSingUpType: 1, + MsSort: 2, + MsProvince: "", + PageIndex: 1, + PageSize: 15, + AgencyId: "", + SecondSearch: "", + SecondSearchType: "", + TotalSize: 10000, + SearchRange: 3, + year: "", + key1: "", + key2: "", + key3: "", + }, + }, + }, +]); +new YouZhiCai([ + { + name: "优质采【澄清/变更公告】", + info: [], + options: { + name: "优质采【澄清/变更公告】", + url: "https://www.youzhicai.com/s/1_1_0_0_.html", + data: { + MsProvince: "", + MsCity: "", + MsStartDate: "", + MsEndDate: "", + AutoOr: 0, + BackOr: 0, + NoticeTitle: "", + searchAccuracy: "precise", + matchType: "precise", + TenderType: "", + MsBidderType: 1, + MsNoticeType: 5, + MsPublishType: 0, + MsSingUpType: 1, + MsSort: 2, + MsProvince: "", + PageIndex: 1, + PageSize: 15, + 
AgencyId: "", + SecondSearch: "", + SecondSearchType: "", + TotalSize: 10000, + SearchRange: 3, + year: "", + key1: "", + key2: "", + key3: "", + }, + }, + }, +]); +new YouZhiCai([ + { + name: "优质采【招标项目计划】", + info: [], + options: { + name: "优质采【招标项目计划】", + url: "https://www.youzhicai.com/s/1_1_0_0_.html", + data: { + MsProvince: "", + MsCity: "", + MsStartDate: "", + MsEndDate: "", + AutoOr: 0, + BackOr: 0, + NoticeTitle: "", + searchAccuracy: "precise", + matchType: "precise", + TenderType: "", + MsBidderType: 1, + MsNoticeType: 7, + MsPublishType: 0, + MsSingUpType: 1, + MsSort: 2, + MsProvince: "", + PageIndex: 1, + PageSize: 15, + AgencyId: "", + SecondSearch: "", + SecondSearchType: "", + TotalSize: 10000, + SearchRange: 3, + year: "", + key1: "", + key2: "", + key3: "", + }, + }, + }, +]);