// service/cntaiping.js
//
// Spider for the China Taiping (太平保险) announcement list. The list page is
// rendered in a real browser through puppeteer and scraped from the DOM.
//
// Companion changes that shipped in the same patch as this file:
//   - package.json: adds "puppeteer" ^24.27.0, "puppeteer-extra" ^3.3.6 and
//     "puppeteer-extra-plugin-stealth" ^2.11.2 to the dependencies.
//   - ecosystem.config.cjs: adds a PM2 app entry named "cntaiping-spider"
//     (script "./service/cntaiping.js", 1 instance, autorestart,
//     max_memory_restart "300M", NODE_ENV "production", SPIDER_NAME
//     "cntaiping", logs under ./logs/cntaiping-*.log) that is still
//     commented out, i.e. the spider is not yet started by PM2.
import axios from "axios";
import fs from "fs";
import path from "path";
import puppeteer from 'puppeteer';
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
import config from "../config.js";
import { SQLiteMessageQueue } from "../sqlite.js";

// Name under which announcements are stored in / read from the queue.
const SPIDER_NAME = "太平保险";

// Announcement index page that is scraped.
const LIST_URL = "https://eps.cntaiping.com/cms/default/webfile/2ywgg1/index.html";

// User-Agent sent with every request issued by the page.
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15";

/**
 * Spider for Taiping announcements.
 *
 * Constructing an instance immediately kicks off crawling: an incremental
 * crawl when the queue already holds announcements for this spider, a full
 * crawl otherwise.
 */
class CNTAIPING {
  constructor() {
    this.info = [];
    console.log("太平保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this promise never rejects.
    this.start();
  }

  /** Runs init() and logs (instead of propagating) any startup failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Chooses incremental vs. full crawl depending on what is already stored. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: fetches the list and stores every item found.
   *
   * getInfo() always scrapes the same index page, so the loop is stopped
   * after the first page.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) => pagenumber >= 1,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(SPIDER_NAME, this.info);
              this.queue.addMessage(SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    // NOTE(review): the original had the follow-up call to increment()
    // disabled here; it is deliberately still not started after a full crawl.
  }

  /**
   * Incremental crawl: repeatedly fetches the list and stores only the
   * announcements the queue does not know yet.
   *
   * init() calls this method, so it must exist; it was previously commented
   * out, which made init() throw whenever stored announcements were found.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(SPIDER_NAME, newInfo);
            this.queue.addMessage(SPIDER_NAME, newInfo);
            // Everything on this page was new: look at the next page.
            // Otherwise part of it was already known: restart at page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // NOTE(review): assumes loopCall uses the returned value as the
            // next page number, so restart from page 1 after a DB error.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Opens the announcement index page in a browser and extracts its items.
   *
   * @param {number} [pagenumber=1] Page number; only used for logging, the
   *   same index page is scraped regardless of its value.
   * @returns {Promise<{pages: number, info: Array<{id: string, name: string,
   *   publishTime: string, urls: string}>}>} `info` is empty when the page
   *   could not be loaded or parsed (the error is logged, not thrown).
   */
  async getInfo(pagenumber = 1) {
    const pages = 5;
    let info = [];
    let browser;
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    try {
      // NOTE(review): a headed browser needs a display; confirm this is
      // intended for the PM2/production deployment.
      browser = await puppeteer.launch({ headless: false });
      const page = await browser.newPage();
      await page.setViewport({ width: 1080, height: 1024 });
      await page.setRequestInterception(true);
      page.on("console", (msg) => console.log("PAGE LOG:", msg.text()));
      page.on("request", (interceptedRequest) => {
        // Override the User-Agent and drop Cache-Control on every request.
        const headers = {
          ...interceptedRequest.headers(),
          "user-agent": USER_AGENT,
        };
        delete headers["cache-control"];
        // continue() returns a promise; do not leave a rejection unhandled.
        interceptedRequest.continue({ headers }).catch((error) => {
          console.log(error, "______请求拦截失败");
        });
      });

      await page.goto(LIST_URL);

      // The callback is serialized and executed inside the page, so it cannot
      // touch variables of this function; it must return plain data instead.
      info = await page.$$eval(".content-ul li", (elements) =>
        elements
          .map((li) => {
            const anchor = li.querySelector("a");
            // The `href` property is already an absolute URL.
            const urls = anchor ? anchor.href : "";
            const nameNode = li.querySelector("a span");
            const timeNode = li.querySelector(".f-right span");
            return {
              id: urls,
              name: nameNode ? nameNode.textContent.trim() : "",
              publishTime: timeNode ? timeNode.textContent.trim() : "",
              urls,
            };
          })
          // Skip rows without a link or title instead of failing the page.
          .filter((item) => item.urls !== "" && item.name !== "")
      );
    } catch (error) {
      console.log(error, "______获取页面数据失败");
    } finally {
      // Always release the Chromium process, even when scraping failed.
      if (browser) {
        await browser.close().catch((error) => {
          console.log(error, "______关闭浏览器失败");
        });
      }
    }
    console.log("info", info);
    return { pages, info };
  }
}

new CNTAIPING();