commit 12ee63b814fdd0f6c9d77fbb320592b5908f8cff
Author: huzhengrong
Date:   Thu Oct 23 10:39:32 2025 +0800

    初始化

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c53d921
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,109 @@
+# Node.js
+node_modules/
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+package-lock.json
+yarn.lock
+pnpm-lock.yaml
+
+# Logs
+logs
+*.log
+*.log.*
+log/
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# OS
+.DS_Store
+Thumbs.db
+*.db
+
+# dotenv environment variables
+.env
+.env.*
+!.env.example
+
+# Editor directories and files
+.idea/
+.vscode/
+*.sublime-workspace
+*.sublime-project
+
+# Build output
+dist/
+build/
+out/
+coverage/
+.nyc_output/
+
+# Optional npm cache directory
+.npm/
+
+# Optional eslint cache
+.eslintcache
+
+# Optional REPL history
+.node_repl_history
+
+# Mac system files
+.AppleDouble
+.LSOverride
+
+# Test coverage
+coverage/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional: local data
+*.local
+
+# Optional: debug
+debug.log
+
+# Optional: next.js
+.next/
+
+# Optional: Nuxt.js
+.nuxt/
+
+# Optional: SvelteKit
+.svelte-kit/
+
+# Optional: vuepress
+.vuepress/dist
+
+# Optional: Storybook
+.storybook-out/
+
+# Optional: Parcel
+.cache/
+
+# Optional: output of 'npm pack'
+*.tgz
+
+# Optional: PM2 logs and pids
+pids/
+*.pid
+*.seed
+*.pid.lock
+pm2.log
+
+# Optional: dotenv
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+# Optional: jest
+jest.config.js
+jest.config.ts
+
+# Optional: cypress
+cypress/videos/
+cypress/screenshots/
diff --git a/byd.js b/byd.js
new file mode 100644
index 0000000..3ef15ee
--- /dev/null
+++ b/byd.js
@@ -0,0 +1,173 @@
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
+import config from "./config.js";
+import { SQLiteMessageQueue } from "./sqlite.js";
+
+// Key under which this spider's announcements are stored and queued.
+const SPIDER_NAME = "比亚迪";
+// Page size sent to the listing API; also used to derive the page count.
+const PAGE_SIZE = 10;
+// Give up on a listing request that has not answered within this many ms.
+const REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * Crawler for BYD tender announcements.
+ *
+ * When the queue holds no announcements for this spider it runs a bounded
+ * full crawl and then switches to incremental polling; otherwise it starts
+ * polling straight away. Constructing an instance starts the crawler.
+ */
+class BYD {
+  constructor() {
+    this.info = [];
+    console.log("比亚迪 爬虫启动...");
+    this.queue = new SQLiteMessageQueue();
+    this.start();
+  }
+
+  /** Runs init() and logs (does not rethrow) a startup failure. */
+  async start() {
+    try {
+      await this.init();
+    } catch (err) {
+      console.error("启动失败:", err);
+    }
+  }
+
+  /** Picks incremental or full crawling depending on stored announcements. */
+  async init() {
+    const announcements = this.queue.getAnnouncementsBySpider(SPIDER_NAME);
+    if (announcements.length > 0) {
+      await this.increment();
+    } else {
+      await this.fullFetch();
+    }
+  }
+
+  /**
+   * Full crawl: fetches pages until the reported last page or
+   * config.pageNumberLimit, saves and enqueues the matches, then starts
+   * incremental polling.
+   */
+  async fullFetch() {
+    console.log("开始全量爬取...");
+    try {
+      await loopCall(this.getInfo.bind(this), {
+        time: config.fullFetchTime,
+        pagenumber: 1,
+        stopWhen: (pagenumber, result) =>
+          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
+        readyForNext: (pagenumber, result) => {
+          this.info.push(...result.info);
+          return pagenumber + 1;
+        },
+        complete: (result) => {
+          this.info.push(...result.info);
+          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
+          try {
+            if (this.info.length > 0) {
+              this.queue.saveAnnouncements(SPIDER_NAME, this.info);
+              this.queue.addMessage(SPIDER_NAME, this.info);
+            }
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+          }
+        },
+      });
+    } catch (error) {
+      console.error("全量爬取失败:", error);
+    }
+    console.log("开始增量爬取...");
+    // Awaited so the polling promise is not left floating.
+    await this.increment();
+  }
+
+  /**
+   * Incremental crawl: polls page 1 every config.incrementFetchTime ms and
+   * saves/enqueues announcements the queue has not seen yet.
+   */
+  async increment() {
+    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
+    try {
+      await loopCall(this.getInfo.bind(this), {
+        time: config.incrementFetchTime,
+        pagenumber: 1,
+        readyForNext: (pagenumber, result) => {
+          try {
+            const newInfo = this.queue.filterNewAnnouncements(
+              SPIDER_NAME,
+              result.info
+            );
+            if (newInfo.length === 0) {
+              console.log("没有发现新数据,继续监控...");
+              return 1;
+            }
+            console.log(`发现 ${newInfo.length} 条新数据`);
+            this.queue.saveAnnouncements(SPIDER_NAME, newInfo);
+            this.queue.addMessage(SPIDER_NAME, newInfo);
+            // Only a page made up entirely of new items can be followed by
+            // more new items; otherwise go back to watching page 1.
+            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+            // Without an explicit value the next page number would be
+            // undefined; restart from page 1 instead.
+            return 1;
+          }
+        },
+      });
+    } catch (error) {
+      console.error("增量爬取失败:", error);
+    }
+  }
+
+  /**
+   * Fetches one listing page and keeps the records whose title matches the
+   * keywords and whose sign-up deadline has not passed.
+   *
+   * @param {number} [pagenumber=1] 1-based page number.
+   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 and
+   *   `info` empty when the request failed.
+   */
+  async getInfo(pagenumber = 1) {
+    console.log(`正在获取第 ${pagenumber} 页数据...`);
+    const [error, body] = await this.getList(pagenumber);
+    if (error) {
+      console.error("获取页面数据失败:", error);
+      return { pages: 0, info: [] };
+    }
+
+    // Tolerate a successful response that carries no data section.
+    const total = body.data?.total ?? 0;
+    const records = body.data?.records ?? [];
+    const pages = Math.ceil(total / PAGE_SIZE);
+    const info = [];
+
+    for (const item of records) {
+      const endTime = timestampToDate(
+        new Date(item.signUpEndTime).getTime(),
+        true
+      );
+      const isOpen = endTime && +new Date(endTime) >= Date.now();
+      if (keywordsInclude(item.title) && isOpen) {
+        info.push({
+          id: item.sourcingId,
+          name: item.title,
+          publishTime: timestampToDate(
+            new Date(item.tenderNoticePublishTime).getTime(),
+            true
+          ),
+          endTime,
+          urls: `https://spcn.byd.com/#/tender-detail?sourcingId=${item.sourcingId}`,
+        });
+      }
+    }
+    return { pages, info };
+  }
+
+  /**
+   * Requests one page of the tender listing.
+   *
+   * @param {number} pagenumber 1-based page number.
+   * @returns {Promise<[unknown, object|null]>} `[null, body]` on success,
+   *   `[error, null]` otherwise; never rejects.
+   */
+  getList(pagenumber) {
+    return axios({
+      url: "https://spcn.byd.com/api/srm-sou-sp/supplier/supplier/getTenderAnnouncementInfo",
+      data: {
+        pageNo: pagenumber,
+        pageSize: PAGE_SIZE,
+      },
+      method: "post",
+      timeout: REQUEST_TIMEOUT_MS,
+    })
+      .then((res) => {
+        const result = res.data;
+        if (result.msg === "成功" && result.code === "000000") {
+          return [null, result];
+        }
+        // Carry the API's own code/message so the log explains the failure.
+        return [
+          new Error(
+            `BYD list API rejected the request: code=${result.code}, msg=${result.msg}`
+          ),
+          null,
+        ];
+      })
+      .catch((err) => [err, null]);
+  }
+}
+
+new BYD();
diff --git a/changan.js b/changan.js
new file mode 100644
index 0000000..c47b911
--- /dev/null
+++ b/changan.js
@@ -0,0 +1,188 @@
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import {
+  timestampToDate,
+  loopCall,
+  keywordsInclude,
+  // addToMessageQueue,
+} from "./utils.js";
+import config from "./config.js";
+import { SQLiteMessageQueue } from "./sqlite.js";
+// import { messageQueue } from "./msgManager.js";
+// import cheerio from "cheerio";
+
+// Key under which this spider's announcements are stored and queued.
+const SPIDER_NAME = "长安";
+// Page size sent to the listing API.
+const PAGE_SIZE = 20;
+// Give up on a listing request that has not answered within this many ms.
+const REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * Crawler for Changan sourcing notices.
+ *
+ * When the queue holds no announcements for this spider it runs a bounded
+ * full crawl and then switches to incremental polling; otherwise it starts
+ * polling straight away. Constructing an instance starts the crawler.
+ */
+class ChangAn {
+  constructor() {
+    this.info = [];
+    console.log("长安 爬虫启动...");
+    this.queue = new SQLiteMessageQueue();
+    this.start();
+  }
+
+  /** Runs init() and logs (does not rethrow) a startup failure. */
+  async start() {
+    try {
+      await this.init();
+    } catch (err) {
+      console.error("启动失败:", err);
+    }
+  }
+
+  /** Picks incremental or full crawling depending on stored announcements. */
+  async init() {
+    const announcements = this.queue.getAnnouncementsBySpider(SPIDER_NAME);
+    if (announcements.length > 0) {
+      await this.increment();
+    } else {
+      await this.fullFetch();
+    }
+  }
+
+  /**
+   * Full crawl: fetches pages until the reported last page or
+   * config.pageNumberLimit, saves and enqueues the matches, then starts
+   * incremental polling.
+   */
+  async fullFetch() {
+    console.log("开始全量爬取...");
+    try {
+      await loopCall(this.getInfo.bind(this), {
+        time: config.fullFetchTime,
+        pagenumber: 1,
+        stopWhen: (pagenumber, result) =>
+          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
+        readyForNext: (pagenumber, result) => {
+          this.info.push(...result.info);
+          return pagenumber + 1;
+        },
+        complete: (result) => {
+          this.info.push(...result.info);
+          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
+          try {
+            // Skip the queue entirely when nothing matched, as byd.js does.
+            if (this.info.length > 0) {
+              this.queue.saveAnnouncements(SPIDER_NAME, this.info);
+              this.queue.addMessage(SPIDER_NAME, this.info);
+            }
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+          }
+        },
+      });
+    } catch (error) {
+      console.error("全量爬取失败:", error);
+    }
+    console.log("开始增量爬取...");
+    // Awaited so the polling promise is not left floating.
+    await this.increment();
+  }
+
+  /**
+   * Incremental crawl: polls page 1 every config.incrementFetchTime ms and
+   * saves/enqueues announcements the queue has not seen yet.
+   */
+  async increment() {
+    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
+    try {
+      await loopCall(this.getInfo.bind(this), {
+        time: config.incrementFetchTime,
+        pagenumber: 1,
+        readyForNext: (pagenumber, result) => {
+          try {
+            const newInfo = this.queue.filterNewAnnouncements(
+              SPIDER_NAME,
+              result.info
+            );
+            if (newInfo.length === 0) {
+              console.log("没有发现新数据,继续监控...");
+              return 1;
+            }
+            console.log(`发现 ${newInfo.length} 条新数据`);
+            this.queue.saveAnnouncements(SPIDER_NAME, newInfo);
+            this.queue.addMessage(SPIDER_NAME, newInfo);
+            // Only a page made up entirely of new items can be followed by
+            // more new items; otherwise go back to watching page 1.
+            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+            // Without an explicit value the next page number would be
+            // undefined; restart from page 1 instead.
+            return 1;
+          }
+        },
+      });
+    } catch (error) {
+      console.error("增量爬取失败:", error);
+    }
+  }
+
+  /**
+   * Fetches one listing page and keeps the records whose project name
+   * matches the keywords.
+   *
+   * @param {number} [pagenumber=1] 1-based page number.
+   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 and
+   *   `info` empty when the request failed.
+   */
+  async getInfo(pagenumber = 1) {
+    console.log(`正在获取第 ${pagenumber} 页数据...`);
+    const [error, body] = await this.getList(pagenumber);
+    if (error) {
+      console.error("获取页面数据失败:", error);
+      return { pages: 0, info: [] };
+    }
+
+    // Tolerate a successful response that carries no result section.
+    const pages = body.result?.pages ?? 0;
+    const records = body.result?.records ?? [];
+    const info = [];
+
+    for (const item of records) {
+      if (!keywordsInclude(item.projectName)) {
+        continue;
+      }
+      console.log("处理项目:", item.id, item.projectName);
+      info.push({
+        id: item.id,
+        name: item.projectName,
+        publishTime: item.startTime,
+        endTime: item.endTime,
+        urls: `https://portal.changan.com.cn/noProdNoticeInfo?_t=${Date.now()}&id=${item.id}`,
+      });
+    }
+    return { pages, info };
+  }
+
+  /**
+   * Requests one page of the notice listing.
+   *
+   * @param {number} pagenumber 1-based page number.
+   * @returns {Promise<[unknown, object|null]>} `[null, body]` on success,
+   *   `[error, null]` otherwise; never rejects.
+   */
+  getList(pagenumber) {
+    return axios({
+      url: "https://portal.changan.com.cn/backend_8086/changan_platform/api/nonPdcSourceNoticeCt/listSourceNoticePageBySupplier",
+      params: {
+        _t: Date.now(),
+        pageNo: pagenumber,
+        pageSize: PAGE_SIZE,
+      },
+      method: "get",
+      timeout: REQUEST_TIMEOUT_MS,
+    })
+      .then((res) => {
+        const result = res.data;
+        if (result.success) {
+          return [null, result];
+        }
+        return [
+          new Error("Changan list API reported success=false"),
+          null,
+        ];
+      })
+      .catch((err) => [err, null]);
+  }
+}
+
+new ChangAn();
diff
--git a/chery.js b/chery.js
new file mode 100644
index 0000000..bad96ed
--- /dev/null
+++ b/chery.js
@@ -0,0 +1,251 @@
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import {
+  timestampToDate,
+  loopCall,
+  keywordsInclude,
+  // addToMessageQueue,
+} from "./utils.js";
+import config from "./config.js";
+import { SQLiteMessageQueue } from "./sqlite.js";
+// import { messageQueue } from "./msgManager.js";
+// import cheerio from "cheerio";
+
+const LIST_URL = "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage";
+const SITE_ID = "747";
+// Category whose records take the deadline from the announcement text
+// instead of a dedicated sign-up field.
+const SOURCING_PREVIEW_CATEGORY = "965901485789413376";
+// Page count reported by getInfo; it is fixed rather than read from the
+// response.
+const ASSUMED_PAGES = 30;
+// Give up on a listing request that has not answered within this many ms.
+const REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * Converts an ISO-like timestamp ("2025-01-02T03:04:05.000") to
+ * "2025-01-02 03:04:05". Returns null for anything that is not a string so
+ * one incomplete record cannot abort a whole page.
+ */
+function toDateTime(value) {
+  return typeof value === "string"
+    ? value.replace("T", " ").split(".")[0]
+    : null;
+}
+
+/**
+ * Crawler for three Chery announcement categories. Each category is crawled
+ * independently: a bounded full crawl when the queue holds nothing for it,
+ * incremental polling otherwise. Constructing an instance starts the crawler.
+ */
+class Chery {
+  constructor() {
+    this.jsonMap = [
+      {
+        name: "奇瑞采购公告",
+        info: [],
+        options: {
+          name: "采购公告",
+          url: LIST_URL,
+          categoryId: "5035",
+          siteId: SITE_ID,
+        },
+      },
+      {
+        name: "奇瑞寻源预告",
+        info: [],
+        options: {
+          name: "寻源预告",
+          url: LIST_URL,
+          categoryId: SOURCING_PREVIEW_CATEGORY,
+          siteId: SITE_ID,
+        },
+      },
+      {
+        name: "奇瑞变更公告",
+        info: [],
+        options: {
+          name: "变更公告",
+          url: LIST_URL,
+          categoryId: "5032",
+          siteId: SITE_ID,
+        },
+      },
+    ];
+    console.log("奇瑞 爬虫启动...");
+    this.queue = new SQLiteMessageQueue();
+    this.start();
+  }
+
+  /** Runs init() and logs (does not rethrow) a startup failure. */
+  async start() {
+    try {
+      await this.init();
+    } catch (err) {
+      console.error("启动失败:", err);
+    }
+  }
+
+  /** Starts one crawl loop per category without waiting for the others. */
+  async init() {
+    for (const item of this.jsonMap) {
+      const announcements = this.queue.getAnnouncementsBySpider(item.name);
+      if (announcements.length > 0) {
+        this.loopFetchIncrement(item);
+      } else {
+        this.loopFetchFull(item);
+      }
+    }
+  }
+
+  /**
+   * Full crawl of one category, followed by incremental polling.
+   * The returned promise never rejects; failures are logged here.
+   *
+   * @param {{name: string, info: object[], options: object}} props
+   */
+  async loopFetchFull(props) {
+    try {
+      // Awaited so an asynchronous failure reaches the catch below.
+      await loopCall(this.getInfo.bind(this), {
+        time: config.fullFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        stopWhen: (pagenumber, result) =>
+          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
+        readyForNext: (pagenumber, result) => {
+          props.info.push(...result.info);
+          return pagenumber + 1;
+        },
+        complete: (result) => {
+          props.info.push(...result.info);
+          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
+          try {
+            // Skip the queue entirely when nothing matched, as df.js does.
+            if (props.info.length > 0) {
+              this.queue.saveAnnouncements(props.name, props.info);
+              this.queue.addMessage(props.name, props.info);
+            }
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+          }
+          this.loopFetchIncrement(props);
+        },
+      });
+    } catch (error) {
+      console.error(`奇瑞${props.options.name}全量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Incremental polling of one category every config.incrementFetchTime ms.
+   * The returned promise never rejects; failures are logged here.
+   *
+   * @param {{name: string, info: object[], options: object}} props
+   */
+  async loopFetchIncrement(props) {
+    try {
+      // Awaited so an asynchronous failure reaches the catch below.
+      await loopCall(this.getInfo.bind(this), {
+        time: config.incrementFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        readyForNext: (pagenumber, result) => {
+          try {
+            const newInfo = this.queue.filterNewAnnouncements(
+              props.name,
+              result.info
+            );
+            if (newInfo.length === 0) {
+              console.log("没有发现新数据,继续监控...");
+              return 1;
+            }
+            console.log(`发现 ${newInfo.length} 条新数据`);
+            this.queue.saveAnnouncements(props.name, newInfo);
+            this.queue.addMessage(props.name, newInfo);
+            // Only a page made up entirely of new items can be followed by
+            // more new items; otherwise go back to watching page 1.
+            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+            // Without an explicit value the next page number would be
+            // undefined; restart from page 1 instead.
+            return 1;
+          }
+        },
+      });
+    } catch (error) {
+      console.error(`奇瑞${props.options.name}增量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Fetches one listing page of a category and keeps the records whose title
+   * matches the keywords and whose deadline has not passed.
+   *
+   * @param {number} [pagenumber=1] 1-based page number.
+   * @param {object} options Category options (name, url, categoryId, siteId).
+   *   Named `options` so it does not shadow the imported `config`.
+   * @returns {Promise<{pages: number, info: object[]}>}
+   */
+  async getInfo(pagenumber = 1, options) {
+    console.log(`${options.name}--获取第 ${pagenumber} 页数据...`);
+    const [error, body] = await this.getList(pagenumber, options);
+    if (error) {
+      console.error("获取页面数据失败:", error);
+      return { pages: ASSUMED_PAGES, info: [] };
+    }
+
+    const rows = body.res?.rows ?? [];
+    const info = [];
+    for (const item of rows) {
+      let endTime;
+      let publishTime;
+      if (options.categoryId === SOURCING_PREVIEW_CATEGORY) {
+        publishTime = toDateTime(item.publishDate);
+        endTime = this.extractDeadlineTime(item.text);
+      } else {
+        endTime = toDateTime(item.signUpEndTime);
+        publishTime = toDateTime(item.signUpBeginTime);
+      }
+      if (
+        endTime &&
+        keywordsInclude(item.title) &&
+        +new Date(endTime) >= Date.now()
+      ) {
+        info.push({
+          id: item.url,
+          name: item.title,
+          publishTime,
+          endTime,
+          urls: `https://ebd.mychery.com/cms${item.url}`,
+        });
+      }
+    }
+    return { pages: ASSUMED_PAGES, info };
+  }
+
+  /**
+   * Requests one page of a category listing.
+   *
+   * @param {number} pagenumber 1-based page number.
+   * @param {object} options Category options (url, categoryId, siteId).
+   * @returns {Promise<[unknown, object|null]>} `[null, body]` on success,
+   *   `[error, null]` otherwise; never rejects.
+   */
+  getList(pagenumber, options) {
+    return axios({
+      url: options.url,
+      data: {
+        dto: {
+          bidType: "",
+          categoryId: options.categoryId,
+          city: "",
+          county: "",
+          province: "",
+          purchaseMode: "",
+          secondCompanyId: "",
+          siteId: options.siteId,
+        },
+        pageNo: pagenumber,
+        pageSize: "10",
+      },
+      method: "post",
+      timeout: REQUEST_TIMEOUT_MS,
+    })
+      .then((res) => {
+        const result = res.data;
+        if (result.code === 0) {
+          return [null, result];
+        }
+        return [
+          new Error(`Chery list API rejected the request: code=${result.code}`),
+          null,
+        ];
+      })
+      .catch((err) => [err, null]);
+  }
+
+  /**
+   * Extracts the "YYYY-MM-DD HH:mm:ss" timestamp that follows the
+   * "预告报名截止时间:" label in an announcement body.
+   *
+   * @param {string} html Announcement text.
+   * @returns {string|null} The timestamp, or null when the input is not a
+   *   string or carries no such label.
+   */
+  extractDeadlineTime(html) {
+    if (typeof html !== "string") {
+      return null;
+    }
+    const regex = /预告报名截止时间:(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/;
+    const match = html.match(regex);
+    return match ? match[1] : null;
+  }
+}
+
+new Chery();
diff --git a/config.js b/config.js
new file mode 100644
index 0000000..7e4a58b
--- /dev/null
+++ b/config.js
@@ -0,0 +1,6 @@
+export default {
+  // Upper bound on the pages fetched during a full crawl.
+  pageNumberLimit: 3,
+  // Interval passed to loopCall during a full crawl, in ms.
+  fullFetchTime: 2000,
+  // Interval passed to loopCall during incremental polling: 5 minutes.
+  incrementFetchTime: 5 * 60 * 1000,
+};
diff --git a/df.js b/df.js
new file mode 100644
index 0000000..7104a44
--- /dev/null
+++ b/df.js
@@ -0,0 +1,187 @@
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
+import config from "./config.js";
+import { SQLiteMessageQueue } from "./sqlite.js";
+import * as cheerio from "cheerio";
+
+const SITE_ORIGIN = "https://etp.dfmc.com.cn";
+// Listing pages from the sixth onwards require a captcha, so stop at five.
+const MAX_PAGES = 5;
+// Give up on a listing request that has not answered within this many ms.
+const REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * Crawler for two Dongfeng listings (tender and non-tender procurement),
+ * scraped from HTML tables. Each listing is crawled independently: a bounded
+ * full crawl when the queue holds nothing for it, incremental polling
+ * otherwise. Constructing an instance starts the crawler.
+ */
+class DF {
+  constructor() {
+    this.jsonMap = [
+      {
+        name: "东风【招标采购】",
+        info: [],
+        options: {
+          name: "东风【招标采购】",
+          url: "https://etp.dfmc.com.cn/jyxx/004001/",
+          homeIndex: "trade_info_new.html",
+        },
+      },
+      {
+        name: "东风【非招标采购】",
+        info: [],
+        options: {
+          name: "东风【非招标采购】",
+          url: "https://etp.dfmc.com.cn/jyxx/004002/",
+          homeIndex: "trade_info_newf.html",
+        },
+      },
+    ];
+    console.log("东风 爬虫启动...");
+    this.queue = new SQLiteMessageQueue();
+    this.start();
+  }
+
+  /** Runs init() and logs (does not rethrow) a startup failure. */
+  async start() {
+    try {
+      await this.init();
+    } catch (err) {
+      console.error("启动失败:", err);
+    }
+  }
+
+  /** Starts one crawl loop per listing without waiting for the others. */
+  async init() {
+    for (const item of this.jsonMap) {
+      const announcements = this.queue.getAnnouncementsBySpider(item.name);
+      if (announcements.length > 0) {
+        this.loopFetchIncrement(item);
+      } else {
+        this.loopFetchFull(item);
+      }
+    }
+  }
+
+  /**
+   * Full crawl of one listing, followed by incremental polling.
+   * The returned promise never rejects; failures are logged here.
+   *
+   * @param {{name: string, info: object[], options: object}} props
+   */
+  async loopFetchFull(props) {
+    try {
+      // Awaited so an asynchronous failure reaches the catch below.
+      await loopCall(this.getInfo.bind(this), {
+        time: config.fullFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        stopWhen: (pagenumber, result) =>
+          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
+        readyForNext: (pagenumber, result) => {
+          props.info.push(...result.info);
+          return pagenumber + 1;
+        },
+        complete: (result) => {
+          props.info.push(...result.info);
+          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
+          try {
+            if (props.info.length > 0) {
+              this.queue.saveAnnouncements(props.name, props.info);
+              this.queue.addMessage(props.name, props.info);
+            }
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+          }
+          this.loopFetchIncrement(props);
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}全量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Incremental polling of one listing every config.incrementFetchTime ms.
+   * The returned promise never rejects; failures are logged here.
+   *
+   * @param {{name: string, info: object[], options: object}} props
+   */
+  async loopFetchIncrement(props) {
+    try {
+      // Awaited so an asynchronous failure reaches the catch below.
+      await loopCall(this.getInfo.bind(this), {
+        time: config.incrementFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        readyForNext: (pagenumber, result) => {
+          try {
+            const newInfo = this.queue.filterNewAnnouncements(
+              props.name,
+              result.info
+            );
+            if (newInfo.length === 0) {
+              console.log("没有发现新数据,继续监控...");
+              return 1;
+            }
+            console.log(`发现 ${newInfo.length} 条新数据`);
+            this.queue.saveAnnouncements(props.name, newInfo);
+            this.queue.addMessage(props.name, newInfo);
+            // Only a page made up entirely of new items can be followed by
+            // more new items; otherwise go back to watching page 1.
+            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+            // Without an explicit value the next page number would be
+            // undefined; restart from page 1 instead.
+            return 1;
+          }
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}增量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Fetches one listing page and keeps the table rows whose name matches the
+   * keywords and whose deadline has not passed.
+   *
+   * @param {number} [pagenumber=1] 1-based page number.
+   * @param {object} options Listing options (name, url, homeIndex).
+   *   Named `options` so it does not shadow the imported `config`.
+   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 and
+   *   `info` empty when the request failed.
+   */
+  async getInfo(pagenumber = 1, options) {
+    console.log(`${options.name}--获取第 ${pagenumber} 页数据...`);
+    const [error, html] = await this.getList(pagenumber, options);
+    if (error) {
+      // An Axios error carries the HTTP status on `response`; fall back to
+      // the message for network-level failures that have no response.
+      const detail = error.response?.status ?? error.status ?? error.message;
+      console.error("获取页面数据失败:", detail);
+      return { pages: 0, info: [] };
+    }
+
+    const info = [];
+    const $ = cheerio.load(html);
+    $(".public-table tbody tr").each((index, element) => {
+      const row = $(element);
+      const id = row.find("td:nth-child(3)").text();
+      const name = row.find("a").text();
+      const publishTime = row.find("td:nth-child(6)").text();
+      const endTime = row.find("td:nth-child(5)").text();
+      const href = row.find("a").attr("href");
+      // A row without a link yields an empty URL instead of "...undefined".
+      const urls = href ? SITE_ORIGIN + href : "";
+      if (
+        endTime &&
+        +new Date(endTime) >= Date.now() &&
+        keywordsInclude(name)
+      ) {
+        console.log("处理项目:", id, name);
+        info.push({ id, name, publishTime, endTime, urls });
+      }
+    });
+    return { pages: MAX_PAGES, info };
+  }
+
+  /**
+   * Downloads the HTML of one listing page. Page 1 lives at the listing's
+   * home index; later pages at "<n>.html".
+   *
+   * @param {number} pagenumber 1-based page number.
+   * @param {object} options Listing options (url, homeIndex).
+   * @returns {Promise<[unknown, string|null]>} `[null, html]` on success,
+   *   `[error, null]` otherwise; never rejects.
+   */
+  getList(pagenumber, options) {
+    const page = pagenumber === 1 ? options.homeIndex : `${pagenumber}.html`;
+    return axios({
+      url: options.url + page,
+      method: "get",
+      timeout: REQUEST_TIMEOUT_MS,
+    })
+      .then((res) => [null, res.data])
+      .catch((err) => [err, null]);
+  }
+}
+
+new DF();
diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs
new file mode 100644
index 0000000..68cfb53
--- /dev/null
+++ b/ecosystem.config.cjs
@@ -0,0 +1,37 @@
+module.exports = {
+  apps: [
+    // Message queue manager (started first)
+    {
+      name: "msg-manager",
+      script: "msgManager.js",
+      instances: 1,
+      autorestart: true,
+      watch: false,
+      max_memory_restart: "200M",
+      env: {
+        NODE_ENV: "production",
+        SERVICE_NAME: "msg-manager",
+      },
+      error_file: "./logs/msg-manager-error.log",
+      out_file: "./logs/msg-manager-out.log",
+      log_file: "./logs/msg-manager-combined.log",
+      time: true,
+    },
+    {
+      name: "picc-spider",
+      script: "picc.js",
+      instances: 1,
+      autorestart: true,
+      watch: false,
+      max_memory_restart: "300M",
+      env: {
+        NODE_ENV: "production",
+        SPIDER_NAME: "picc",
+      },
+      error_file: "./logs/picc-error.log",
+      out_file: "./logs/picc-out.log",
+      log_file: "./logs/picc-combined.log",
+      time: true,
+    },
+  ],
+};
diff --git a/geely.js b/geely.js
new file mode 100644
index 0000000..f71479e
--- /dev/null
+++ b/geely.js
@@ -0,0 +1,237 @@
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import { timestampToDate, loopCall } from "./utils.js";
+import config from "./config.js";
+import { SQLiteMessageQueue } from "./sqlite.js";
+// import cheerio from "cheerio";
+// import {
messageQueue } from "./msgManager.js"; + +class GEELY { + constructor() { + this.url = "https://glzb.geely.com/gpmp/notice/listnotice"; + // this.filepath = path.resolve("geely.json"); + this.info = []; + console.log("GEELY 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("吉利"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + // if (fs.existsSync(this.filepath)) { + // let data = fs.readFileSync(this.filepath, "utf-8"); + // this.info = data ? JSON.parse(data) : []; + // if (this.info.length > 0) { + // await this.increment(); + // } else { + // await this.fullFetch(); + // } + // } else { + // console.log("历史文件不存在,开始全量爬取"); + // await this.fullFetch(); + // } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); // 限制最多2页用于测试 + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + this.queue.saveAnnouncements("吉利", this.info); + // this.writeFile(this.info); + this.queue.addMessage("吉利", this.info); + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + 
try { + let newInfo = this.queue.filterNewAnnouncements( + "吉利", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + this.queue.saveAnnouncements("吉利", newInfo); + this.queue.addMessage("吉利", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + // 传入页码获取数据 + async getInfo(pagenumber = 1) { + let today = new Date().setHours(0, 0, 0, 0); + let beforeOneMonth = today - 30 * 24 * 60 * 60 * 1000; + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + let total = result[1].data.total; + let pages = Math.ceil(total / 20); + let arr = result[1].data.items; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + if (item.endtime >= today && item.publishtime >= beforeOneMonth) { + console.log("处理项目:", item.pjtnoticeid, item.pjtnoticename); + let noticeRes = await this.getNoticeUrl(item.pjtnoticeid); + if (noticeRes[0]) { + // 获取招标公告内容报错 + console.error("获取公告详情失败:", noticeRes[0]); + } else { + info.push({ + id: item.pjtnoticeid, + name: item.pjtnoticename, + publishTime: timestampToDate(item.publishtime), + endTime: timestampToDate(item.endtime), + urls: noticeRes[1], + }); + } + } + } + return { pages, info }; + } + } + getList(pagenumber) { + return axios({ + url: this.url, + params: { + pagesize: 20, + pagenumber: pagenumber, + publishstatus: 2, + bidcategoryid: 1442, + iflongpro: 0, + _: Date.now(), + }, + method: "get", + }) + .then((res) => { + let result = res.data; + if (result.code === "success") { + return [null, result]; + } else { + 
return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + getNoticeUrl(id) { + let timestamp = Date.now(); + return axios({ + url: `https://glzb.geely.com/gpmp/notice/query?_=${timestamp}&pjtnoticeid=${id}`, + method: "get", + }) + .then((res) => { + let result = res.data; + if (result.code === "success") { + let promises = []; + for (let item of result.data.attachs) { + let params = { + name: item.attachname, + downloadUrl: item.downloadUrl, + previewUrl: item.previewUrl, + attachname: item.attachname, + _: Date.now(), + }; + promises.push( + axios({ + url: `https://glzb.geely.com/pub/file/info/preview`, + method: "get", + params, + }) + ); + } + return Promise.allSettled(promises).then((results) => { + let urls = []; + results.forEach((result) => { + if ( + result.status === "fulfilled" && + result.value.data.code === "success" + ) { + urls.push(result.value.data.data); + } + }); + return [null, urls]; + }); + } else { + return ["err", null]; + } + }) + .catch((err) => { + console.log("err:", err); + return [err, null]; + }); + } + + // writeFile(info) { + // fs.writeFileSync(this.filepath, JSON.stringify(info), "utf-8"); + // } +} + +new GEELY(); diff --git a/greatWall.js b/greatWall.js new file mode 100644 index 0000000..eb86488 --- /dev/null +++ b/greatWall.js @@ -0,0 +1,234 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class GreatWall { + constructor() { + this.jsonMap = [ + { + name: "长城公开寻源", + info: [], + options: { + name: "长城公开寻源", + url: "https://srm.gwm.cn/cloud-srm/api-sou/sou-firstPage/souReqlistPage", + }, + }, + { + name: "长城招募公示大厅", + info: [], + options: { + name: "长城招募公示大厅", + url: "https://srm.gwm.cn/cloud-srm/api-sou/api-ql/Recruit/visitList", + data: { + type: "Recruit", + lang: "zh-cn", + query: { "*": {} }, + payload: { 
+ filter: {}, + page: { sort: "lastUpdateDate desc", pageNum: 1, pageSize: 8 }, + }, + action: "visitList", + tree: true, + }, + }, + }, + ]; + console.log("长城 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + // 全量爬取 + loopFetchFull(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + additional: props.options, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); + // this.writeFile(props); + this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 
全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error(`${props.options.name}增量爬取失败:`, error); + } + } + async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + if (config.data) { + // 招募公示大厅 + let arr = result[1].data.records; + let pages = result[1].data.pageCount; + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime, publishTime; + endTime = item.deadlineTime; + publishTime = item.publishTime; + // 命中关键词 + if (keywordsInclude(item.title)) { + info.push({ + id: item.recruitId, + name: item.title, + publishTime: publishTime, + endTime: endTime, + urls: `https://srm.gwm.cn/#/portalBidding/vendorBiddingDetail?id=${item.recruitId}`, + }); + } + } + return { pages, info }; + } else { + // 公开寻源 + let arr = result[1].data.list; + let pages = result[1].data.pages; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime, publishTime; + endTime = item.publicEndTime; + publishTime = item.releaseDate; + // 命中关键词 + if (keywordsInclude(item.projectName)) { + info.push({ + id: item.reqHeadId, + name: item.projectName, + publishTime: publishTime, + endTime: endTime, + urls: `https://srm.gwm.cn/#/portal?id=${item.reqHeadId}`, + }); + } + } + return { pages, info }; + } + } + } + // 分页获取数据 + getList(pagenumber, config) { + let data = {}; + if (config.data) { + data = config.data; + data.payload.page.pageNum = pagenumber; + } else { + data = { pageNum: pagenumber, pageSize: 8 }; + } + return axios({ + url: config.url, + data: data, + method: "post", + 
}) + .then((res) => { + let result = res.data; + if (result.code == "0") { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + // writeFile(props) { + // fs.writeFileSync(props.filepath, JSON.stringify(props.info), "utf-8"); + // } + + // extractDeadlineTime(html) { + // // 匹配"预告报名截止时间:"后面的时间格式 + // const regex = /预告报名截止时间:(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/; + // const match = html.match(regex); + + // if (match) { + // return match[1]; + // } + + // return null; + // } +} + +new GreatWall(); diff --git a/jianghuai.js b/jianghuai.js new file mode 100644 index 0000000..5963124 --- /dev/null +++ b/jianghuai.js @@ -0,0 +1,385 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import JSON5 from "json5"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class JiangHuai { + constructor(jsonMap) { + this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 }); + this.axiosInstance.interceptors.request.use((config) => { + // 添加cookie到请求头 + const cookieString = Array.from(this.cookiePair.entries()) + .map(([name, value]) => `${name}=${value}`) + .join("; "); + config.headers.Cookie = cookieString; + return config; + }); + this.axiosInstance.interceptors.response.use( + (response) => { + // 更新cookie到请求头 + let cookieArr = response.headers["set-cookie"]; + this.extractCookie(cookieArr); + return response; + }, + (error) => { + return Promise.reject(error); + } + ); + this.cookiePair = new Map(); + this.csrfToken = ""; + this.jsonMap = jsonMap; + // [ + // { + // name: "江淮【招标公告】", + // info: [], + // options: { + // name: "江淮【招标公告】", + // url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + // data: { + // pageIndex: 1, + // type: 1, + // companyId: "", + // title: "", + // ntype: 1, + // start_time: "", + // end_time: "", + // child: "", + // 
tenderType: 3, + // }, + // }, + // }, + // { + // name: "江淮【变更/澄清公告】", + // info: [], + // options: { + // name: "江淮【变更/澄清公告】", + // url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + // data: { + // pageIndex: 1, + // type: 1, + // companyId: "", + // title: "", + // ntype: "4,6", + // start_time: "", + // end_time: "", + // child: "", + // tenderType: 3, + // }, + // }, + // }, + // ]; + console.log("江淮 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + async initializeCookie() { + try { + let headers = { + headers: { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + Accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + Pragma: "no-cache", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Upgrade-Insecure-Requests": "1", + }, + }; + const homeResponse = await this.axiosInstance.get( + "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1", + headers + ); + // 提取csrf-token + let tokenMatch = homeResponse.data.match( + / { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); 
+ this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + console.log("开始增量爬取"); + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error(`${props.options.name}增量爬取失败:`, error); + } + } + async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败: ", result[0]); + return { pages: 0, info: [] }; + } else { + // 公开寻源 + let arr = result[1].list; + let total = result[1].total; + let pages = Math.ceil(total / 10); + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime, publishTime; + publishTime = new Date(item.startTime).toLocaleDateString(); + endTime = new Date(item.endTime).toLocaleDateString(); + // 命中关键词 + if ( + keywordsInclude(item.noticeTitle) && + item.endTime && + +new Date(item.endTime) >= Date.now() + ) { + console.log("处理项目:", item.noticeTitle); + info.push({ + id: item.bulletinSID, + name: 
item.noticeTitle, + publishTime: publishTime, + endTime: endTime, + urls: `https://ahjhqc.youzhicai.com/${item.Url}`, + }); + } + } + return { pages, info }; + } + } + async getList(pagenumber, config) { + let data = config.data; + data.pageIndex = pagenumber; + let headers = { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://ahjhqc.youzhicai.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: + "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + "X-Csrf-Token": this.csrfToken, + }; + try { + const response = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + let result = JSON5.parse(response.data); + if (result.list && result.list.length > 0) { + return [null, result]; + } else { + return ["err", null]; + } + } catch (err) { + console.log("cookie不对"); + try { + await this.initializeCookie(); + headers["X-Csrf-Token"] = this.csrfToken; + const retryResponse = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + // console.log(retryResponse.data); + let result = JSON5.parse(retryResponse.data); + if (result.list && result.list.length > 0) { + return [null, result]; + } else { + return ["err", null]; + } + } catch (retryErr) { + return [retryErr, null]; + } + } + } + // 分页获取数据 + // getList(pagenumber, config) { + // let data = config.data; + // data.pageIndex = pagenumber; + // return axios({ + // url: 
config.url, + // data: data, + // method: "post", + // headers: { + // "Content-Type": "application/x-www-form-urlencoded", + // }, + // }) + // .then((res) => { + // let result = res.data; + // if (result.list && result.list.length > 0) { + // return [null, result]; + // } else { + // return ["err", null]; + // } + // }) + // .catch((err) => { + // return [err, null]; + // }); + // } +} + +new JiangHuai([ + { + name: "江淮【招标公告】", + info: [], + options: { + name: "江淮【招标公告】", + url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + data: { + pageIndex: 1, + type: 1, + companyId: "", + title: "", + ntype: 1, + start_time: "", + end_time: "", + child: "", + tenderType: 3, + }, + }, + }, +]); +new JiangHuai([ + { + name: "江淮【变更/澄清公告】", + info: [], + options: { + name: "江淮【变更/澄清公告】", + url: "https://ahjhqc.youzhicai.com/domain/data-list-new", + data: { + pageIndex: 1, + type: 1, + companyId: "", + title: "", + ntype: "4,6", + start_time: "", + end_time: "", + child: "", + tenderType: 3, + }, + }, + }, +]); diff --git a/leapMotor.js b/leapMotor.js new file mode 100644 index 0000000..d99e73e --- /dev/null +++ b/leapMotor.js @@ -0,0 +1,193 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +// import cheerio from "cheerio"; + +class LeapMotor { + constructor() { + this.url = + "https://lpsrm.leapmotor.com/cloud-srm/api-inq/inq-anon/reqhead/listPage"; + this.info = []; + console.log("零跑 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("零跑"); + if (announcements.length > 0) { + // console.log(announcements); + await this.increment(); + } else { + await this.fullFetch(); + } + } + // 
全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + this.queue.saveAnnouncements("零跑", this.info); + this.queue.addMessage("零跑", this.info); + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + // 判断数据是否存在 + try { + let newInfo = this.queue.filterNewAnnouncements( + "零跑", + result.info + ); + // 有新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + + this.queue.saveAnnouncements("零跑", newInfo); + this.queue.addMessage("零跑", newInfo); + + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + // 传入页码获取数据 + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + // let total = result[1].data.total; + let pages = result[1].data.pages; + let arr = 
result[1].data.list; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + // 命中关键词 + if (keywordsInclude(item.souReqTitile)) { + console.log("处理项目:", item.reqHeadId, item.souReqTitile); + let noticeRes = await this.getNoticeUrl(item.reqHeadId); + if (noticeRes[0]) { + // 获取招标公告内容报错 + console.error("获取公告链接失败:", noticeRes[0]); + } else { + info.push({ + id: item.reqHeadId, + name: item.souReqTitile, + publishTime: item.publishTime, + endTime: item.expirationTime, + urls: noticeRes[1], + }); + } + } + } + return { pages, info }; + } + } + getList(pagenumber) { + return axios({ + url: this.url, + data: { + pageNum: pagenumber, + pageSize: 8, + }, + method: "post", + }) + .then((res) => { + let result = res.data; + if (result.code === "0") { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + return [err, null]; + }); + } + + getNoticeUrl(id) { + return axios({ + url: `https://lpsrm.leapmotor.com/cloud-srm/api-inq/inq-anon/pj/reqhead/get?id=${id}`, + method: "get", + }) + .then((res) => { + let result = res.data; + if (result.code === "0") { + return [null, result.data.extNoticeLink]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + console.log("err:", err); + return [err, null]; + }); + } + + // writeFile(info) { + // fs.writeFileSync(this.filepath, JSON.stringify(info), "utf-8"); + // } +} + +new LeapMotor(); diff --git a/mailer.js b/mailer.js new file mode 100644 index 0000000..b9c2ebb --- /dev/null +++ b/mailer.js @@ -0,0 +1,100 @@ +import nodemailer from "nodemailer"; +import path from "path"; + +class EmailSender { + constructor(config) { + this.transporter = nodemailer.createTransport(config); + this.defaultFrom = config.auth.user; + } + async sendEmail(options) { + try { + const mailOptions = { + from: options.from || this.defaultFrom, + to: options.to, + cc: options.cc, + bcc: options.bcc, + subject: options.subject, + text: options.text, + html: options.html, + attachments: 
options.attachments || [], + }; + + const info = await this.transporter.sendMail(mailOptions); + console.log(`邮件发送成功: ${options.to} - ${info.messageId}`); + return { success: true, messageId: info.messageId }; + } catch (error) { + console.error(`邮件发送失败: ${options.to} -`, error.message); + throw error; + } + } + async sendBasicEmail(to, subject, content) { + return await this.sendEmail({ to, subject, html: content }); + } + + async sendEmailWithAttachments(to, subject, content, attachmentPath) { + const attachments = []; + if (attachmentPath) { + attachments.push({ + filename: path.basename(attachmentPath), + path: attachmentPath, + }); + } + return await this.sendEmail({ to, subject, html: content, attachments }); + } + + async sendBulkEmail(recipients, subject, content) { + const results = []; + for (const recipient of recipients) { + try { + const result = await this.sendEmail({ + to: recipient, + subject, + html: content, + }); + results.push({ recipient, success: true, result }); + } catch (error) { + results.push({ recipient, success: false, error: error.message }); + } + await new Promise((resolve) => setTimeout(resolve, 1000)); + } + return results; + } + + async testConnection() { + try { + await this.transporter.verify(); + console.log("邮件服务器连接成功"); + return true; + } catch (error) { + console.error("邮件服务器连接失败:", error); + return false; + } + } +} + +// async function example() { +// let emailSender = new EmailSender({ +// host: "smtp.exmail.qq.com", +// port: 465, +// secure: true, +// auth: { +// user: "jiqiren@axbbaoxian.com", +// pass: "Am13579q", +// }, +// }); +// const isConnected = await emailSender.testConnection(); +// if (!isConnected) { +// console.log("邮件服务器连接失败"); +// return; +// } +// emailSender.sendBasicEmail( +// "cpw@axbbaoxian.com", +// "测试邮件", +// "这是测试邮件内容" +// ); +// } + +// example().catch((err) => { +// console.error("程序错误:", err); +// }); +export { EmailSender }; diff --git a/msgManager.js b/msgManager.js new file mode 100644 
index 0000000..26b57dd --- /dev/null +++ b/msgManager.js @@ -0,0 +1,212 @@ +// msgQueue.js - 基于事件的消息队列 +import { EventEmitter } from "events"; +import fs from "fs"; +import path from "path"; +import { EmailSender } from "./mailer.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +import { md5 } from "./utils.js"; +import axios from "axios"; + +class MessageQueue extends EventEmitter { + constructor() { + super(); + this.queue = new SQLiteMessageQueue(); + this.processing = false; + // this.queueFile = path.resolve("message_queue.json");K + this.emailSender = new EmailSender({ + host: "smtp.exmail.qq.com", + port: 465, + secure: true, + auth: { + user: "jiqiren@axbbaoxian.com", + pass: "Am13579q", + }, + }); + this.recipients = [ + "huzhengrong@axbbaoxian.com", + ]; + + // 启动处理器 + this.startProcessor(); + } + + // 添加消息到队列 + + // 处理队列 + async startProcessor() { + setInterval(async () => { + // 清除状态 不等于 pending的数据 + console.log("开始处理队列"); + try { + const pendingMessages = this.queue.getPendingMessages(); + if (!this.processing && pendingMessages.length > 0) { + await this.processQueue(pendingMessages); + } + } catch (error) { + console.error(`❌ 获取待处理消息失败:`, error); + } + }, 60 * 60 * 1000); // 1h处理一次 + } + + async processQueue(pendingMessages) { + this.processing = true; + + let msgMap = {}; + for (const message of pendingMessages) { + try { + console.log(`📧 处理消息: ${message.spider_name}`); + // console.log(typeof message.data); + // let formdata = JSON.parse(message.data); + if (!msgMap[message.spider_name]) { + msgMap[message.spider_name] = message.data; + } else { + msgMap[message.spider_name].push(...message.data); + } + + message.status = "sent"; + message.sent_at = new Date().toISOString(); + this.queue.updateMessageStatus( + message.id, + message.status, + message.sent_at + ); + } catch (error) { + console.error(`❌ 消息处理失败: ${message.id}`, error); + message.status = "failed"; + message.error_message = error.message; + this.queue.updateMessageStatus( + 
message.id, + message.status, + null, + // message.sent_at, + message.error_message + ); + } + } + let html = ""; + for (const spiderName in msgMap) { + html += this.generateTable(spiderName, msgMap[spiderName]); + } + try { + this.emailSender.sendBulkEmail(this.recipients, "招标项目最新公告", html); + } catch (error) { + console.error(`❌ 通知发送失败: ${error}`); + } + + this.processing = false; + } + + generateTable(spiderName, data) { + let tableHtml = ` +
+

+ 🕷️ ${spiderName} (${data.length} 条新增) +

+ +
+ + + + + + + + + + + + `; + data.forEach((item, index) => { + const rowColor = index % 2 === 0 ? "#f8f9fa" : "white"; + // const publishTime = this.formatDateTime(item.publishTime); + // const endTime = this.formatDateTime(item.endTime); + const urls = this.formatUrls(item.urls); + + tableHtml += ` + + + + + + + + `; + }); + + tableHtml += ` + +
序号项目名称发布时间截止时间查看详情
+ ${index + 1} + +
+ ${item.name} +
+ +
+ ${item.publishTime} + +
${item.endTime}
+
+ ${urls} +
+
+
+ `; + + return tableHtml; + } + + getSign(timestamp) { + let secret = "cpwyyds"; + let uri = "/common/message/push"; + const url = uri + timestamp + secret; + const myCalc = md5(url); + let sign = + myCalc.substring(5, 13) + + myCalc.substring(29, 31) + + myCalc.substring(18, 27); + //sign 转大写 + sign = sign.toUpperCase(); + return sign; + } + + formatUrls(urls) { + if (!urls) { + return '无链接'; + } + + // 处理数组形式的URLs + if (Array.isArray(urls)) { + if (urls.length === 0) { + return '无链接'; + } + + if (urls.length === 1) { + return `📄 查看`; + } + + // 多个链接的情况 + let linksHtml = '
'; + urls.forEach((url, index) => { + linksHtml += `📄 链接${ + index + 1 + }`; + }); + linksHtml += "
"; + return linksHtml; + } + + // 处理字符串形式的URL + if (typeof urls === "string") { + return `📄 查看`; + } + + return '链接格式错误'; + } +} + +const messageQueue = new MessageQueue(); + +export { messageQueue }; + +// export default MessageQueue; diff --git a/nio.js b/nio.js new file mode 100644 index 0000000..c6d43f7 --- /dev/null +++ b/nio.js @@ -0,0 +1,170 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { + timestampToDate, + loopCall, + keywordsInclude, + getYiqiNoticeUrl, + parseToGgDetailsParams, +} from "./utils.js"; +import config from "./config.js"; +import * as cheerio from "cheerio"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class NIO { + constructor() { + // this.filepath = path.resolve("yiqi.json"); + this.info = []; + console.log("蔚来 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("蔚来"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + if (this.info.length > 0) { + this.queue.saveAnnouncements("蔚来", this.info); + // this.writeFile(this.info); + this.queue.addMessage("蔚来", this.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + 
this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + "蔚来", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // this.info.push(...newInfo); + this.queue.saveAnnouncements("蔚来", newInfo); + // this.writeFile(this.info); + this.queue.addMessage("蔚来", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getHtml(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + let pages = 1; + let html = result[1]; + const $ = cheerio.load(html); + let jsonStr = $("#__NEXT_DATA__").text(); + let data = JSON.parse(jsonStr).props.pageProps.tenderNotices; + // console.log(data); + data.forEach((item) => { + let id = item.id; + let name = item.title; + let publishTime = item.publishDate; + let endTime = item.dueTime; + let urls = item.documents[0].url; + if ( + endTime && + +new Date(endTime) >= Date.now() && + keywordsInclude(name) + ) { + info.push({ + id, + name, + publishTime, + endTime, + urls, + }); + } + }); + return { pages, info }; + } + } + // 分页获取数据 + getHtml(pagenumber) { + return axios({ + url: "https://www.nio.cn/partnership/tender-notices", + method: "get", + }) + .then((res) => { + let result = res.data; + return [null, result]; + }) + .catch((err) => { + return 
[err, null]; + }); + } +} + +new NIO(); diff --git a/package.json b/package.json new file mode 100644 index 0000000..6ebac6a --- /dev/null +++ b/package.json @@ -0,0 +1,23 @@ +{ + "name": "net-spider", + "version": "1.0.0", + "description": "", + "main": "index.js", + "type": "module", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1", + "start": "pm2 start ecosystem.config.cjs", + "stop": "pm2 stop all", + "stats": "node stats.js", + "restart": "pm2 restart all" + }, + "author": "", + "license": "ISC", + "dependencies": { + "axios": "^1.12.2", + "better-sqlite3": "^12.4.1", + "cheerio": "^1.1.2", + "json5": "^2.2.3", + "nodemailer": "^7.0.6" + } +} diff --git a/picc.js b/picc.js new file mode 100644 index 0000000..804d1f8 --- /dev/null +++ b/picc.js @@ -0,0 +1,214 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { timestampToDate, loopCall } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; + +class PICC { + constructor() { + this.info = []; + console.log("中国人民保险 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("中国人民保险"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + if (this.info.length 
> 0) { + this.queue.saveAnnouncements("中国人民保险", this.info); + // this.writeFile(this.info); + this.queue.addMessage("中国人民保险", this.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + "中国人民保险", + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // this.info.push(...newInfo); + this.queue.saveAnnouncements("中国人民保险", newInfo); + // this.writeFile(this.info); + this.queue.addMessage("中国人民保险", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("增量爬取失败:", error); + } + } + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 0, info: [] }; + } else { + let total = result[1].res.total; + let pages = Math.ceil(total / 10); + let arr = result[1].res.rows; + + for (let i = 0; i < arr.length; i++) { + let item = arr[i]; + let endTime = timestampToDate( + new Date(item.tenderFileSaleEndTime).getTime(), + true + ); + // 命中关键词 + if ( + this.keywordsInclude(item.title) && + endTime && + +new Date(endTime) >= Date.now() + ) { + // console.log("处理项目:", item.sourcingId, item.title); + info.push({ + id: item.sourcingId, + name: item.title, + publishTime: 
timestampToDate( + new Date(item.tenderFileSaleBeginTime).getTime(), + true + ), + endTime: endTime, + urls: `https://ec.picc.com/cms/default/webfile${item.url}`, + }); + } + } + return { pages, info }; + } + } + // 分页获取数据 + getList(pagenumber) { + return axios({ + url: "https://ec.picc.com/cms/api/dynamicData/queryContentPage", + data: { + dto:{ + categoryId:"211,213,214,215,216,217", + city:"", + county:"", + purchaseMode:"", + siteId:"725" + }, + pageNo: pagenumber, + pageSize: 10, + }, + method: "post", + headers: { + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'Accept-Encoding': 'gzip, deflate, br, zstd', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Connection': 'keep-alive', + 'Content-Type': 'application/json; charset=UTF-8', + 'Cookie': 'G_rbec_47_11_8080=22685.52745.19855.0000', + 'Host': 'ec.picc.com', + 'Origin': 'https://ec.picc.com', + 'Referer': 'https://ec.picc.com/cms/default/webfile/ywgg1/index.html', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-origin', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36', + 'X-Requested-With': 'XMLHttpRequest', + 'Sec-Ch-Ua': '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"', + 'Sec-Ch-Ua-Mobile': '?0', + 'Sec-Ch-Ua-Platform': "macOS", + } + }) + .then((res) => { + let result = res.data; + console.log("then",result) + if (result.msg === "操作成功" && result.code === 0) { + return [null, result]; + } else { + return ["err", null]; + } + }) + .catch((err) => { + console.log('catch', err) + return [err, null]; + }); + } + + keywordsInclude(name) { + let keywords = [ + "保险", + "车险", + "非车险", + "科技", + "大模型", + "承保", + "第三方平台", + ]; + return keywords.some((keyword) => name.includes(keyword)); + } +} + +new PICC(); diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..329d7b1 --- /dev/null +++ b/readme.md @@ -0,0 +1,47 @@ +# 查看指定爬虫详细信息 + +pm2 show 
chery-spider + +# 查看指定爬虫状态 + +pm2 list | grep chery-spider + +# 实时监控指定爬虫 + +pm2 monit chery-spider + +# 停止指定爬虫(不删除) + +pm2 stop chery-spider + +# 彻底删除爬虫进程 + +pm2 delete chery-spider + +# 停止并删除 + +pm2 stop chery-spider && pm2 delete chery-spider + +# 查看指定爬虫的实时日志 + +pm2 logs chery-spider + +# 查看最近 100 行日志 + +pm2 logs chery-spider --lines 100 + +# 只查看标准输出日志 + +pm2 logs chery-spider --out + +# 只查看错误日志 + +pm2 logs chery-spider --err + +# 查看某个时间段的日志 + +pm2 logs chery-spider --timestamp + +# 清空日志 + +pm2 flush chery-spider diff --git a/sqlite.js b/sqlite.js new file mode 100644 index 0000000..a86d0f2 --- /dev/null +++ b/sqlite.js @@ -0,0 +1,320 @@ +import Database from "better-sqlite3"; +import fs from "fs"; +// import { wechatPush } from "./utils.js"; + +class SQLiteMessageQueue { + constructor() { + // this.db = new Database("message_queue.db"); + this.db = new Database("spider_data.db"); + this.init(); + this.setupGracefulShutdown(); + } + init() { + this.db.exec(` + CREATE TABLE IF NOT EXISTS announcements ( + id TEXT PRIMARY KEY, + spider_name TEXT NOT NULL, + name TEXT NOT NULL, + publish_time TEXT, + end_time TEXT, + urls TEXT, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT + ) + `); + + this.db.exec(` + CREATE TABLE IF NOT EXISTS messages ( + id TEXT PRIMARY KEY, + spider_name TEXT NOT NULL, + data TEXT NOT NULL, + timestamp TEXT NOT NULL, + status TEXT DEFAULT 'pending', + sent_at TEXT, + error_message TEXT + ) + `); + this.db.exec(` + CREATE INDEX IF NOT EXISTS idx_announcements_spider ON announcements(spider_name); + CREATE INDEX IF NOT EXISTS idx_announcements_time ON announcements(publish_time); + CREATE INDEX IF NOT EXISTS idx_announcements_created ON announcements(created_at); + CREATE INDEX IF NOT EXISTS idx_status ON messages(status); + CREATE INDEX IF NOT EXISTS idx_spider_status ON messages(spider_name, status); + CREATE INDEX IF NOT EXISTS idx_timestamp ON messages(timestamp); + `); + + this.insertAnnouncementStmt = 
this.db.prepare(` + INSERT OR REPLACE INTO announcements + (id, spider_name, name, publish_time, end_time, urls, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + `); + + this.getAnnouncementStmt = this.db.prepare(` + SELECT * FROM announcements WHERE id = ? + `); + + this.getAnnouncementsBySpiderStmt = this.db.prepare(` + SELECT * FROM announcements WHERE spider_name = ? + ORDER BY created_at DESC + `); + + this.checkAnnouncementExistsStmt = this.db.prepare(` + SELECT COUNT(*) as count FROM announcements WHERE id = ? + `); + + // 预编译SQL语句(提高性能) + this.insertStmt = this.db.prepare(` + INSERT INTO messages (id, spider_name, data, timestamp, status) + VALUES (?, ?, ?, ?, ?) + `); + + this.getPendingStmt = this.db.prepare(` + SELECT * FROM messages WHERE status = 'pending' + ORDER BY timestamp ASC + `); + + this.getFailedStmt = this.db.prepare(` + SELECT * FROM messages WHERE status = 'failed' + ORDER BY timestamp ASC + `); + + this.updateStatusStmt = this.db.prepare(` + UPDATE messages + SET status = ?, sent_at = ?, error_message = ? + WHERE id = ? + `); + } + // safeExecute(methodName, operation, ...args) { + // } + saveAnnouncement(spiderName, announcement) { + const now = new Date().toISOString(); + const isNew = !this.isAnnouncementExists(announcement.id); + + this.insertAnnouncementStmt.run( + announcement.id, + spiderName, + announcement.name, + announcement.publishTime, + announcement.endTime, + announcement.urls, + isNew ? 
now : this.getAnnouncement(announcement.id)?.created_at || now, + now + ); + + return isNew; + } + /** + * 批量保存公告并返回新公告 + */ + saveAnnouncements(spiderName, announcements) { + const newAnnouncements = []; + + // 使用事务提高性能 + const saveMany = this.db.transaction((announcements) => { + for (const announcement of announcements) { + const isNew = this.saveAnnouncement(spiderName, announcement); + if (isNew) { + newAnnouncements.push(announcement); + } + } + }); + + saveMany(announcements); + + console.log(`💾 ${spiderName}: 保存 ${announcements.length} 条公告`); + return newAnnouncements; + } + /** + * 检查公告是否存在 + */ + isAnnouncementExists(announcementId) { + const result = this.checkAnnouncementExistsStmt.get(announcementId); + return result.count > 0; + } + + /** + * 获取单个公告 + */ + getAnnouncement(id) { + return this.getAnnouncementStmt.get(id); + } + + /** + * 获取指定爬虫的所有公告 + */ + getAnnouncementsBySpider(spiderName) { + return this.getAnnouncementsBySpiderStmt.all(spiderName); + } + /** + * 根据 spiderName 删除其所有公告 + */ + deleteAnnouncementsBySpider(spiderName) { + const stmt = this.db.prepare(`DELETE FROM announcements WHERE spider_name = ?`); + const info = stmt.run(spiderName); + console.log(`🗑️ 删除 ${spiderName} 的公告,共删除 ${info.changes} 条`); + return info.changes; + } + /** + * 过滤出新公告 + */ + filterNewAnnouncements(spiderName, announcements) { + return announcements.filter( + (announcement) => !this.isAnnouncementExists(announcement.id) + ); + } + + // ============= + // 消息队列相关方法 + // ============= + + addMessage(spiderName, data) { + const message = { + id: Date.now() + "-" + Math.random().toString(36).substr(2, 9), + spider_name: spiderName, + data: JSON.stringify(data), + timestamp: new Date().toISOString(), + status: "pending", + }; + this.insertStmt.run( + message.id, + message.spider_name, + message.data, + message.timestamp, + message.status + ); + // wechatPush(spiderName, data); + console.log(`📤 添加消息到队列: ${spiderName} - ${data.length} 条数据`); + return message.id; + } + + 
getPendingMessages() { + const rows = this.getPendingStmt.all(); + return rows.map((row) => ({ + ...row, + data: JSON.parse(row.data), + })); + } + + getFailedMessages() { + const rows = this.getFailedStmt.all(); + return rows.map((row) => ({ + ...row, + data: JSON.parse(row.data), + })); + } + + updateMessageStatus(id, status, sentAt = null, errorMessage = null) { + this.updateStatusStmt.run(status, sentAt, errorMessage, id); + } + migrateFromJsonFile(spiderName, jsonFilePath) { + try { + if (!fs.existsSync(jsonFilePath)) { + console.log(`📁 ${jsonFilePath} 不存在,跳过迁移`); + return 0; + } + + const data = JSON.parse(fs.readFileSync(jsonFilePath, "utf-8")); + if (!Array.isArray(data) || data.length === 0) { + console.log(`📁 ${jsonFilePath} 数据为空,跳过迁移`); + return 0; + } + + const migrateMany = this.db.transaction((announcements) => { + for (const announcement of announcements) { + this.saveAnnouncement(spiderName, announcement); + } + }); + + migrateMany(data); + console.log(`🔄 成功迁移 ${data.length} 条 ${spiderName} 数据到数据库`); + return data.length; + } catch (error) { + console.error(`❌ 迁移 ${jsonFilePath} 失败:`, error); + return 0; + } + } + cleanOldMessages(daysBefore = 30) { + const cutoffDate = new Date(); + cutoffDate.setDate(cutoffDate.getDate() - daysBefore); + + const stmt = this.db.prepare(` + DELETE FROM messages + WHERE status = 'sent' AND sent_at < ? 
+ `); + + const result = stmt.run(cutoffDate.toISOString()); + console.log(`🧹 清理了 ${result.changes} 条旧消息`); + } + + /** + * 获取统计信息 + */ + getStats() { + const stats = {}; + + // 按爬虫统计公告数量 + const announcementStats = this.db + .prepare( + ` + SELECT spider_name, COUNT(*) as count + FROM announcements + GROUP BY spider_name + ` + ).all() + // .prepare(` + // SELECT spider_name, name + // FROM announcements WHERE spider_name = '吉利' + // `) + // .all(); + + // 消息状态统计(status == pending) + const messageStats = this.db + .prepare( + ` + SELECT status, data, sent_at + FROM messages WHERE status = 'pending' + ` + ) + .all(); + + stats.announcements = announcementStats; + stats.messages = messageStats; + + return stats; + } + setupGracefulShutdown() { + // 正常退出信号 + process.on("SIGINT", () => { + console.log("收到 SIGINT 信号,正在关闭数据库..."); + this.close(); + process.exit(0); + }); + + // 终止信号 + process.on("SIGTERM", () => { + console.log("收到 SIGTERM 信号,正在关闭数据库..."); + this.close(); + process.exit(0); + }); + + // 未捕获异常 + process.on("uncaughtException", (error) => { + console.error("未捕获异常:", error); + this.close(); + process.exit(1); + }); + + // 未处理的Promise拒绝 + process.on("unhandledRejection", (reason, promise) => { + console.error("未处理的Promise拒绝:", reason); + this.close(); + process.exit(1); + }); + } + // 关闭数据库连接 + close() { + this.db.close(); + } +} + +export { SQLiteMessageQueue }; diff --git a/stats.js b/stats.js new file mode 100644 index 0000000..ade7443 --- /dev/null +++ b/stats.js @@ -0,0 +1,80 @@ +import { SQLiteMessageQueue } from "./sqlite.js"; +import path from "path"; +import { md5 } from "./utils.js"; +import axios from "axios"; + +const queue = new SQLiteMessageQueue(); + +const stats = queue.getStats(); + +// function merge() { +// let files = [ +// { name: "长安", path: "changan.json" }, +// { name: "奇瑞变更公告", path: "chery_bg.json" }, +// { name: "奇瑞采购公告", path: "chery_cg.json" }, +// { name: "奇瑞寻源预告", path: "chery_xy.json" }, +// { name: "零跑", path: 
"leapMotor.json" }, +// { name: "吉利", path: "geely.json" }, +// { name: "一汽", path: "yiqi.json" }, +// ]; +// files.forEach((file) => { +// queue.migrateFromJsonFile(file.name, path.resolve(file.path)); +// }); +// } +// merge(); +// 把message中的数据状态改成pending +// queue.getFailedMessages() +// .forEach((message) => { +// queue.updateMessageStatus(message.id, "pending"); +// }); +// function getSign(timestamp) { +// let secret = "cpwyyds"; +// let uri = "/common/message/push"; +// const url = uri + timestamp + secret; +// console.log(url); +// const myCalc = md5(url); +// let sign = +// myCalc.substring(5, 13) + +// myCalc.substring(29, 31) + +// myCalc.substring(18, 27); +// //sign 转大写 +// sign = sign.toUpperCase(); +// return sign; +// } +// let time = new Date().getTime(); +// let data = { +// timestamp: time, +// sign: getSign(time), +// templateNo: "A002", +// url: "https://www.baidu.com/", +// paramList: [ +// { +// key: "thing8", +// value: "网站name", +// }, +// { +// key: "thing2", +// value: "项目name", +// }, +// { +// key: "time14", +// value: "2025-11-2", +// }, +// { +// key: "time17", +// value: "2025-11-3 00:00:00", +// }, +// ], +// }; +// axios({ +// url: "https://testadvert.shenlintech.com/platform/common/message/push", +// method: "post", +// data, +// }) +// .then((res) => { +// console.log(res.data); +// }) +// .catch((err) => { +// console.log(err); +// }); +console.log(stats); diff --git a/third.js b/third.js new file mode 100644 index 0000000..ca9afd3 --- /dev/null +++ b/third.js @@ -0,0 +1,309 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import JSON5 from "json5"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +import * as cheerio from "cheerio"; + +class Third { + constructor(jsonMap) { + this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 }); + 
this.axiosInstance.interceptors.request.use((config) => { + // 添加cookie到请求头 + const cookieString = Array.from(this.cookiePair.entries()) + .map(([name, value]) => `${name}=${value}`) + .join("; "); + config.headers.Cookie = cookieString; + // console.log(config); + return config; + }); + this.axiosInstance.interceptors.response.use( + (response) => { + // 更新cookie到请求头 + let cookieArr = response.headers["set-cookie"] || []; + this.extractCookie(cookieArr); + return response; + }, + (error) => { + return Promise.reject(error); + } + ); + this.cookiePair = new Map(); + // this.csrfToken = ""; + this.jsonMap = jsonMap; + console.log("三方平台 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + async initializeCookie() { + try { + let headers = { + headers: { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://www.chinabidding.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: "https://www.chinabidding.com/search/proj.htm", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + }, + }; + const homeResponse = await this.axiosInstance.get( + "https://www.chinabidding.com/search/proj.htm", + headers + ); + 
} catch (err) { + console.log("err", err); + throw err; + } + } + extractCookie(cookieArr) { + for (let cookie of cookieArr) { + let [key, value] = cookie.split(";")[0].split("="); + this.cookiePair.set(key, value); + } + // console.log(this.cookiePair); + } + // 全量爬取 + loopFetchFull(props) { + console.log("开始全量爬取"); + try { + loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + additional: props.options, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); + this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + console.log("开始增量爬取"); + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { 
+ console.error(`${props.options.name}增量爬取失败:`, error); + } + } + + async getNoticeDetail(url) { + try { + let result = await axios.get(url); + return result.data; + } catch (err) { + return "err"; + } + } + async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败: ", result[0]); + return { pages: 0, info: [] }; + } else { + let pages = 3; + let html = result[1]; + const $ = cheerio.load(html); + $(".as-pager-body li").each((index, element) => { + let idmatch = $(element) + .find(".as-pager-item") + .attr("href") + .match(/\/bidDetail\/(\d+)\.html/); + let id = idmatch ? idmatch[1] : ""; + let name = $(element).find(".txt").attr("title"); + + let url = $(element).find(".as-pager-item").attr("href"); + if (keywordsInclude(name)) { + console.log("处理项目:", name); + info.push({ + id: id, + name: name, + urls: url, + publishTime: "--", + endTime: "--", + }); + } + }); + return { pages, info }; + } + } + async getList(pagenumber, config) { + let data = config.data; + data.currentPage = pagenumber; + let headers = { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://www.chinabidding.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: "https://www.chinabidding.com/search/proj.htm", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + }; + try { + const response = await this.axiosInstance({ 
+ url: config.url, + data, + method: "post", + headers, + }); + let result = response.data; + return [null, result]; + } catch (err) { + console.log("cookie不对"); + try { + await this.initializeCookie(); + const retryResponse = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + let result = retryResponse.data; + return [null, result]; + } catch (retryErr) { + return [retryErr, null]; + } + } + } +} + +new Third([ + { + name: "机电项目招投标【招标公告】", + info: [], + options: { + name: "机电项目招投标【招标公告】", + url: "https://www.chinabidding.com/search/proj.htm", + data: { + fullText: "", + pubDate: "", + infoClassCodes: "(0105 0103)", + normIndustry: "", + zoneCode: "", + fundSourceCodes: "", + poClass: "BidNotice", + rangeType: "", + currentPage: 1, + }, + }, + }, +]); +new Third([ + { + name: "机电项目招投标【招标变更公告】", + info: [], + options: { + name: "机电项目招投标【招标变更公告】", + url: "https://www.chinabidding.com/search/proj.htm", + data: { + fullText: "", + pubDate: "", + infoClassCodes: "(0106 0104)", + normIndustry: "", + zoneCode: "", + fundSourceCodes: "", + poClass: "BidNotice", + rangeType: "", + currentPage: 1, + }, + }, + }, +]); diff --git a/utils.js b/utils.js new file mode 100644 index 0000000..5223a1c --- /dev/null +++ b/utils.js @@ -0,0 +1,271 @@ +import crypto from "crypto"; +import axios from "axios"; +/** + * 将时间戳(毫秒)转换为 yyyy-mm-dd 格式的字符串 + * @param {number} timestamp - 毫秒级时间戳 + * @returns {string} yyyy-mm-dd 格式日期 + */ +function timestampToDate(timestamp, mode) { + const date = new Date(timestamp); + const year = date.getFullYear(); + // 补零 + const month = String(date.getMonth() + 1).padStart(2, "0"); + const day = String(date.getDate()).padStart(2, "0"); + if (!mode) { + return `${year}-${month}-${day}`; + } else { + const hours = String(date.getHours()).padStart(2, "0"); + const minutes = String(date.getMinutes()).padStart(2, "0"); + const seconds = String(date.getSeconds()).padStart(2, "0"); + return `${year}-${month}-${day} 
${hours}:${minutes}:${seconds}`; + } +} + +function md5(text, inputEncoding = "utf8", outputEncoding = "hex") { + return crypto + .createHash("md5") + .update(text, inputEncoding) + .digest(outputEncoding); +} +function getSign(timestamp) { + let secret = "cpwyyds"; + let uri = "/common/message/push"; + const url = uri + timestamp + secret; + const myCalc = md5(url); + let sign = + myCalc.substring(5, 13) + + myCalc.substring(29, 31) + + myCalc.substring(18, 27); + //sign 转大写 + sign = sign.toUpperCase(); + return sign; +} +// 微信推送 +// function wechatPush(spiderName, arr) { +// for (let item of arr) { +// let timestamp = new Date().getTime(); +// let sign = getSign(timestamp); +// let url = ""; +// if (typeof item.urls === "string") { +// url = item.urls; +// } else { +// url = item.urls[0]; +// } +// let data = { +// timestamp, +// sign, +// templateNo: "A002", +// url, +// paramList: [ +// { +// key: "thing8", +// value: spiderName, +// }, +// { +// key: "thing2", +// value: +// item.name.length > 20 +// ? item.name.substring(0, 16) + "..." 
+// : item.name, +// }, +// { +// key: "time14", +// value: item.publishTime, +// }, +// { +// key: "time17", +// value: item.endTime, +// }, +// ], +// }; +// axios({ +// url: "https://advert.shenlintech.com/platform/common/message/push", +// method: "post", +// data, +// }); +// } +// } +// 废弃 +function addToMessageQueue(spiderName, data) { + const message = { + id: Date.now() + "-" + Math.random().toString(36).substr(2, 9), + spiderName, + data, + timestamp: new Date().toISOString(), + status: "pending", + }; + let queue = []; + const queueFile = "message_queue.json"; + if (fs.existsSync(queueFile)) { + queue = JSON.parse(fs.readFileSync(queueFile, "utf-8")); + } + // 添加新消息 + queue.push(message); + + fs.writeFileSync(queueFile, JSON.stringify(queue, null, 2)); + console.log(`📤 添加消息到队列: ${spiderName} - ${data.length} 条数据`); +} + +async function loopCall(fn, options = {}) { + let { time, pagenumber, stopWhen, readyForNext, complete, additional } = + options; + let shouldContinue = true; + while (shouldContinue) { + try { + let result = await fn(pagenumber, additional); + // console.log(`页面 ${pagenumber} 处理完成`); + + // 检查停止条件 + if (stopWhen && stopWhen(pagenumber, result)) { + complete && complete(result); + shouldContinue = false; + } else { + pagenumber = readyForNext(pagenumber, result); + await new Promise((resolve) => setTimeout(resolve, time)); + } + } catch (err) { + console.error("loopCall 出错:", err); + shouldContinue = false; + } + } +} +function keywordsInclude(name) { + let keywords = [ + "海外", + "国际", + "内容", + "营销", + "运营", + "直播", + "品牌", + "事件", + "策略", + "传播", + "执行", + "社媒", + "视频", + "制作", + "拍摄", + "效果", + ]; + return keywords.some((keyword) => name.includes(keyword)); +} +// 一汽专用获取公告链接的方法 +function getYiqiNoticeUrl(gongGaoType, guid, version, origin) { + let baseUrl = "https://etp.faw.cn/"; + //是否对参数加密 + var isSecrect = false; + + //候选人公示加密 + if (gongGaoType == 7) { + isSecrect = true; + } + if (isSecrect) { + var url = baseUrl + 
"/gg/toGongGaoDetail"; + guid = encodeSixF(guid); + // var params = { + // guid: guid, + // gongGaoType: gongGaoType, + // version: dealNullAndUndefined(version), + // statusCode: 1, + // isNew: 1, + // }; + // try { + // await httpPostCurrent(url, params); + // } catch (err) { + // console.log(err); + // return "加密链接"; + // } + return "加密链接,请直接上对应网站查看"; + } else { + var url = + baseUrl + + "/gg/toGongGaoDetail?guid=" + + guid + + "&gongGaoType=" + + gongGaoType + + "&version=" + + version + + "&isNew=1"; + return url; + } +} +function parseToGgDetailsParams(funcStr) { + // funcStr = "toGgDetails('6','642ed424-cd9b-4cb0-8b74-9cc868d8f95a:2','2','1','')" + + const match = funcStr.match(/toGgDetails\(([^)]+)\)/); + if (match) { + // 解析参数字符串 + const paramsStr = match[1]; + // 简单的参数解析(处理引号包围的参数) + const params = paramsStr + .split(",") + .map((param) => param.trim().replace(/['"]/g, "")); + return params; + } + return null; +} +function encodeSixF(input) { + var keyStr = + "ABCDEFGHIJKLMNOP" + + "QRSTUVWXYZabcdef" + + "ghijklmnopqrstuv" + + "wxyz0123456789+/" + + "="; + var output = ""; + var chr1, + chr2, + chr3 = ""; + var enc1, + enc2, + enc3, + enc4 = ""; + var i = 0; + do { + chr1 = input.charCodeAt(i++); + chr2 = input.charCodeAt(i++); + chr3 = input.charCodeAt(i++); + enc1 = chr1 >> 2; + enc2 = ((chr1 & 3) << 4) | (chr2 >> 4); + enc3 = ((chr2 & 15) << 2) | (chr3 >> 6); + enc4 = chr3 & 63; + if (isNaN(chr2)) { + enc3 = enc4 = 64; + } else if (isNaN(chr3)) { + enc4 = 64; + } + output = + output + + keyStr.charAt(enc1) + + keyStr.charAt(enc2) + + keyStr.charAt(enc3) + + keyStr.charAt(enc4); + chr1 = chr2 = chr3 = ""; + enc1 = enc2 = enc3 = enc4 = ""; + } while (i < input.length); + + if (output != null && output.indexOf("=") != -1) { + var reg = new RegExp("=", "g"); + var outputNew = output.replace(reg, "r1e2p3l4"); + output = outputNew; + } + + return output + "+*+"; +} +function dealNullAndUndefined(value) { + if (typeof value == "undefined") return ""; + if 
(value == null) return ""; + if (value == "null") return ""; + if (value == "undefined") return ""; + return value; +} +export { + timestampToDate, + loopCall, + keywordsInclude, + getYiqiNoticeUrl, + parseToGgDetailsParams, + addToMessageQueue, + md5, + // wechatPush +}; diff --git a/yiqi.js b/yiqi.js new file mode 100644 index 0000000..37f895d --- /dev/null +++ b/yiqi.js @@ -0,0 +1,199 @@ +import axios from "axios"; +import fs from "fs"; +import path from "path"; +import { + timestampToDate, + loopCall, + keywordsInclude, + getYiqiNoticeUrl, + parseToGgDetailsParams, + // addToMessageQueue, +} from "./utils.js"; +import config from "./config.js"; +import * as cheerio from "cheerio"; +import { SQLiteMessageQueue } from "./sqlite.js"; +// import { messageQueue } from "./msgManager.js"; + +class YiQi { + constructor() { + // this.filepath = path.resolve("yiqi.json"); + this.info = []; + console.log("一汽 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + let announcements = this.queue.getAnnouncementsBySpider("一汽"); + if (announcements.length > 0) { + await this.increment(); + } else { + await this.fullFetch(); + } + // if (fs.existsSync(this.filepath)) { + // let data = fs.readFileSync(this.filepath, "utf-8"); + // this.info = data ? 
JSON.parse(data) : []; + // if (this.info.length > 0) { + // await this.increment(); + // } else { + // await this.fullFetch(); + // } + // } else { + // console.log("历史文件不存在,开始全量爬取"); + // await this.fullFetch(); + // } + } + // 全量爬取 + async fullFetch() { + console.log("开始全量爬取..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.fullFetchTime, + pagenumber: 1, + stopWhen: (pagenumber, result) => { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + this.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + this.info.push(...result.info); + console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); + try { + this.queue.saveAnnouncements("一汽", this.info); + // this.writeFile(this.info); + this.queue.addMessage("一汽", this.info); + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error("全量爬取失败:", error); + } + console.log("开始增量爬取..."); + this.increment(); + } + + // 增量爬取 + async increment() { + console.log("开始增量爬取模式,每5分钟检查一次新数据..."); + try { + await loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + "一汽", + result.info + ); + // let newInfo = result.info.filter( + // (item) => !this.info.some((info) => info.id === item.id) + // ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // this.info.push(...newInfo); + this.queue.saveAnnouncements("一汽", newInfo); + // this.writeFile(this.info); + this.queue.addMessage("一汽", newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch 
(error) { + console.error("增量爬取失败:", error); + } + } + async getInfo(pagenumber = 1) { + let info = []; + console.log(`正在获取第 ${pagenumber} 页数据...`); + let result = await this.getHtml(pagenumber); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败:", result[0]); + return { pages: 30, info: [] }; + } else { + let pages = 30; + let html = result[1]; + const $ = cheerio.load(html); + let noticeEl = $(".zl-list-main .zl-col-6"); + noticeEl.each((index, element) => { + let id = $(element).find(".zl-desc-item:contains('项目编号')").text(); + let name = $(element).find(".title").text(); + let publishTime = $(element) + .find(".zl-desc-item:contains('发布时间')") + .text(); + let endTime = $(element).find(".daojishi").attr("data-time"); + // 获取生产链接的参数 + let funcStr = $(element).find(".jump").attr("onclick"); + + let funcArgs = parseToGgDetailsParams(funcStr); + // 公告未过期 && 命中关键词 + if (endTime && keywordsInclude(name)) { + let noticeUrl = getYiqiNoticeUrl(...funcArgs); + info.push({ + id: id.replace("项目编号:", ""), + name: name.trim(), + publishTime: publishTime.replace("发布时间:", "").trim(), + endTime: timestampToDate(Number(endTime)), + urls: noticeUrl, + }); + } + }); + + return { pages, info }; + } + } + // 分页获取数据 + getHtml(pagenumber) { + return axios({ + url: "https://etp.faw.cn/gg/allJYTypeGGList?hangYeType=-1&xmLeiXing=&ggStartTimeEnd=&gongGaoType=5&isNew=1", + data: { + searchType: "", + searchText: "", + currentPage: pagenumber, + }, + headers: { + "Content-Type": "application/x-www-form-urlencoded", + }, + method: "post", + }) + .then((res) => { + let result = res.data; + return [null, result]; + }) + .catch((err) => { + return [err, null]; + }); + } + + // writeFile(info) { + // fs.writeFileSync(this.filepath, JSON.stringify(info), "utf-8"); + // } +} + +new YiQi(); diff --git a/youzhicai.js b/youzhicai.js new file mode 100644 index 0000000..ca6f15e --- /dev/null +++ b/youzhicai.js @@ -0,0 +1,406 @@ +import axios from "axios"; +import fs from "fs"; +import path 
from "path"; +import JSON5 from "json5"; +import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; +import config from "./config.js"; +import { SQLiteMessageQueue } from "./sqlite.js"; +import * as cheerio from "cheerio"; + +class YouZhiCai { + constructor(jsonMap) { + this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 }); + this.axiosInstance.interceptors.request.use((config) => { + // 添加cookie到请求头 + const cookieString = Array.from(this.cookiePair.entries()) + .map(([name, value]) => `${name}=${value}`) + .join("; "); + config.headers.Cookie = cookieString; + return config; + }); + this.axiosInstance.interceptors.response.use( + (response) => { + // 更新cookie到请求头 + let cookieArr = response.headers["set-cookie"] || []; + this.extractCookie(cookieArr); + return response; + }, + (error) => { + return Promise.reject(error); + } + ); + this.cookiePair = new Map(); + // this.csrfToken = ""; + this.jsonMap = jsonMap; + console.log("优质采 爬虫启动..."); + this.queue = new SQLiteMessageQueue(); + this.start(); + } + + async start() { + try { + await this.init(); + } catch (err) { + console.error("启动失败:", err); + } + } + async init() { + for (let item of this.jsonMap) { + let announcements = this.queue.getAnnouncementsBySpider(item.name); + if (announcements.length > 0) { + this.loopFetchIncrement(item); + } else { + this.loopFetchFull(item); + } + } + } + async initializeCookie() { + try { + let headers = { + headers: { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://www.youzhicai.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: "https://www.youzhicai.com/s/1_1_0_0_.html", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + 
"Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + }, + }; + const homeResponse = await this.axiosInstance.get( + "https://www.youzhicai.com/s/1_1_0_0_.html", + headers + ); + // // 提取csrf-token + // let tokenMatch = homeResponse.data.match( + // / { + return ( + pagenumber >= result.pages || pagenumber >= config.pageNumberLimit + ); + }, + readyForNext: (pagenumber, result) => { + props.info.push(...result.info); + return pagenumber + 1; + }, + complete: (result) => { + props.info.push(...result.info); + console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); + try { + if (props.info.length > 0) { + this.queue.saveAnnouncements(props.name, props.info); + this.queue.addMessage(props.name, props.info); + } + } catch (error) { + console.error("数据库操作失败:", error); + } + this.loopFetchIncrement(props); + }, + }); + } catch (error) { + console.error(`${props.options.name}全量爬取失败:`, error); + } + } + loopFetchIncrement(props) { + console.log("开始增量爬取"); + try { + loopCall(this.getInfo.bind(this), { + time: config.incrementFetchTime, // 5分钟间隔 + pagenumber: 1, + additional: props.options, + readyForNext: (pagenumber, result) => { + try { + let newInfo = this.queue.filterNewAnnouncements( + props.name, + result.info + ); + // 存在新数据 + if (newInfo.length > 0) { + console.log(`发现 ${newInfo.length} 条新数据`); + // props.info.push(...newInfo); + this.queue.saveAnnouncements(props.name, newInfo); + // this.writeFile(props); + this.queue.addMessage(props.name, newInfo); + // 全是新数据,继续下一页 + if (newInfo.length === result.info.length) { + return pagenumber + 1; + } else { + // 有部分重复数据,重新从第一页开始 + return 1; + } + } else { + console.log("没有发现新数据,继续监控..."); + return 1; // 重新从第一页开始 + } + } catch (error) { + console.error("数据库操作失败:", error); + } + }, + }); + } catch (error) { + console.error(`${props.options.name}增量爬取失败:`, error); + } + } + 
async getInfo(pagenumber = 1, config) { + let info = []; + console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); + let result = await this.getList(pagenumber, config); + if (result[0]) { + // 出错, 记录错误日志 + console.error("获取页面数据失败: ", result[0]); + return { pages: 0, info: [] }; + } else { + // 后面的都要验证码 + + // let pages = 2; + let html = result[1]; + const $ = cheerio.load(html); + let total = $("#recommendMsg .info-num-value").text(); + let pages = Math.ceil(total / 15); + if (pages > 2) { + pages = 2; + } + $(".project-li").each((index, element) => { + let id = $(element).find(".project-name0").attr("href"); + let name = $(element).find(".project-name0").attr("title"); + let publishTime = $(element).find(".pub-value0").text(); + let leftDay = $(element).find(".left-day .emOrange:eq(0)").text(); + let endTime = new Date( + +new Date(publishTime) + leftDay * 24 * 60 * 60 * 1000 + ).toLocaleDateString(); + // console.log(endTime); + let urls = "https://www.youzhicai.com" + id; + if (keywordsInclude(name)) { + console.log("处理项目:", name, publishTime, endTime); + info.push({ + id: id, + name: name, + publishTime: publishTime, + endTime: endTime, + urls: urls, + }); + } + }); + return { pages, info }; + } + } + async getList(pagenumber, config) { + let data = config.data; + data.PageIndex = pagenumber; + if (this.cookiePair.get("__RequestVerificationToken")) { + data.__RequestVerificationToken = this.cookiePair.get( + "__RequestVerificationToken" + ); + } + let headers = { + Accept: "text/plain, */*; q=0.01", + "Accept-Language": "zh-CN,zh;q=0.9", + "Cache-Control": "no-cache", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + Origin: "https://www.youzhicai.com", + Pragma: "no-cache", + Priority: "u=1, i", + Referer: "https://www.youzhicai.com/s/1_1_0_0_.html", + "Sec-Ch-Ua": + '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', + "Sec-Ch-Ua-Mobile": "?0", + "Sec-Ch-Ua-Platform": '"macOS"', + "Sec-Fetch-Dest": "empty", + 
"Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + }; + try { + const response = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + let result = response.data; + return [null, result]; + } catch (err) { + console.log("cookie不对"); + try { + await this.initializeCookie(); + data.__RequestVerificationToken = this.cookiePair.get( + "__RequestVerificationToken" + ); + const retryResponse = await this.axiosInstance({ + url: config.url, + data, + method: "post", + headers, + }); + // console.log(retryResponse.data); + let result = retryResponse.data; + return [null, result]; + } catch (retryErr) { + return [retryErr, null]; + } + } + } +} + +new YouZhiCai([ + { + name: "优质采【招标公告】", + info: [], + options: { + name: "优质采【招标公告】", + url: "https://www.youzhicai.com/s/1_1_0_0_.html", + data: { + MsProvince: "", + MsCity: "", + MsStartDate: "", + MsEndDate: "", + AutoOr: 0, + BackOr: 0, + NoticeTitle: "", + searchAccuracy: "precise", + matchType: "precise", + TenderType: "", + MsBidderType: 1, + MsNoticeType: 1, + MsPublishType: 0, + MsSingUpType: 1, + MsSort: 2, + MsProvince: "", + PageIndex: 1, + PageSize: 15, + AgencyId: "", + SecondSearch: "", + SecondSearchType: "", + TotalSize: 10000, + SearchRange: 3, + year: "", + key1: "", + key2: "", + key3: "", + }, + }, + }, +]); +new YouZhiCai([ + { + name: "优质采【澄清/变更公告】", + info: [], + options: { + name: "优质采【澄清/变更公告】", + url: "https://www.youzhicai.com/s/1_1_0_0_.html", + data: { + MsProvince: "", + MsCity: "", + MsStartDate: "", + MsEndDate: "", + AutoOr: 0, + BackOr: 0, + NoticeTitle: "", + searchAccuracy: "precise", + matchType: "precise", + TenderType: "", + MsBidderType: 1, + MsNoticeType: 5, + MsPublishType: 0, + MsSingUpType: 1, + MsSort: 2, + MsProvince: "", + PageIndex: 1, + PageSize: 15, + 
AgencyId: "", + SecondSearch: "", + SecondSearchType: "", + TotalSize: 10000, + SearchRange: 3, + year: "", + key1: "", + key2: "", + key3: "", + }, + }, + }, +]); +new YouZhiCai([ + { + name: "优质采【招标项目计划】", + info: [], + options: { + name: "优质采【招标项目计划】", + url: "https://www.youzhicai.com/s/1_1_0_0_.html", + data: { + MsProvince: "", + MsCity: "", + MsStartDate: "", + MsEndDate: "", + AutoOr: 0, + BackOr: 0, + NoticeTitle: "", + searchAccuracy: "precise", + matchType: "precise", + TenderType: "", + MsBidderType: 1, + MsNoticeType: 7, + MsPublishType: 0, + MsSingUpType: 1, + MsSort: 2, + MsProvince: "", + PageIndex: 1, + PageSize: 15, + AgencyId: "", + SecondSearch: "", + SecondSearchType: "", + TotalSize: 10000, + SearchRange: 3, + year: "", + key1: "", + key2: "", + key3: "", + }, + }, + }, +]);