feat(太平保险暂停):

This commit is contained in:
huzhengrong 2025-10-30 15:41:21 +08:00
parent 15a5d1b60a
commit 1ff2434fa2
3 changed files with 207 additions and 1 deletions

View File

@ -161,5 +161,21 @@ module.exports = {
log_file: "./logs/ccic-combined.log", log_file: "./logs/ccic-combined.log",
time: true, time: true,
}, },
// { // 太平保险爬虫
// name: "cntaiping-spider",
// script: "./service/cntaiping.js",
// instances: 1,
// autorestart: true,
// watch: false,
// max_memory_restart: "300M",
// env: {
// NODE_ENV: "production",
// SPIDER_NAME: "cntaiping",
// },
// error_file: "./logs/cntaiping-error.log",
// out_file: "./logs/cntaiping-out.log",
// log_file: "./logs/cntaiping-combined.log",
// time: true,
// },
], ],
}; };

View File

@ -18,6 +18,9 @@
"better-sqlite3": "^12.4.1", "better-sqlite3": "^12.4.1",
"cheerio": "^1.1.2", "cheerio": "^1.1.2",
"json5": "^2.2.3", "json5": "^2.2.3",
"nodemailer": "^7.0.6" "nodemailer": "^7.0.6",
"puppeteer": "^24.27.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2"
} }
} }

187
service/cntaiping.js Normal file
View File

@ -0,0 +1,187 @@
import axios from "axios";
import fs from "fs";
import path from "path";
import puppeteer from 'puppeteer';
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
import config from "../config.js";
import { SQLiteMessageQueue } from "../sqlite.js";
/**
 * Spider for the China Taiping Insurance procurement announcement list.
 *
 * On construction it opens the SQLite-backed queue and immediately starts a
 * run. A run performs a full fetch when nothing has been stored yet for this
 * spider; incremental polling is currently disabled (see the commented-out
 * `increment` reference implementation below).
 */
class CNTAIPING {
  constructor() {
    // Announcements collected during the current full fetch.
    this.info = [];
    console.log("太平保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this promise never rejects.
    this.start();
  }

  /**
   * Entry point of a run; logs (instead of propagating) any startup failure.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode from what the queue already holds for this spider.
   *
   * No stored announcements -> full fetch. Otherwise incremental mode would
   * run, but `increment` is disabled while this spider is paused, so the call
   * is guarded: previously it raised a TypeError on every such start.
   * @returns {Promise<void>}
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider("太平保险");
    if (announcements.length === 0) {
      await this.fullFetch();
      return;
    }
    if (typeof this.increment === "function") {
      await this.increment();
      return;
    }
    console.warn("太平保险 增量爬取已暂停,跳过本次运行");
  }

  /**
   * Full crawl: drives getInfo() through loopCall and, on completion, stores
   * the collected announcements and enqueues a notification message.
   *
   * NOTE(review): loopCall's exact callback contract lives in ../utils.js and
   * is not visible here. `stopWhen` is true from page 1 on, so presumably only
   * the first page is fetched before `complete` runs — confirm.
   * @returns {Promise<void>}
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) => {
          return pagenumber >= 1;
        },
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements("太平保险", this.info);
              this.queue.addMessage("太平保险", this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
  }

  // Incremental crawl — DISABLED while this spider is paused. Kept as the
  // reference implementation; init() skips incremental mode until this method
  // is restored.
  //
  // async increment() {
  //   console.log("开始增量爬取模式每5分钟检查一次新数据...");
  //   try {
  //     await loopCall(this.getInfo.bind(this), {
  //       time: config.incrementFetchTime, // polling interval
  //       pagenumber: 1,
  //       readyForNext: (pagenumber, result) => {
  //         try {
  //           let newInfo = this.queue.filterNewAnnouncements(
  //             "太平保险",
  //             result.info
  //           );
  //           // New data found
  //           if (newInfo.length > 0) {
  //             console.log(`发现 ${newInfo.length} 条新数据`);
  //             this.queue.saveAnnouncements("太平保险", newInfo);
  //             this.queue.addMessage("太平保险", newInfo);
  //             // Whole page was new: keep going to the next page
  //             if (newInfo.length === result.info.length) {
  //               return pagenumber + 1;
  //             } else {
  //               // Partially seen already: restart from the first page
  //               return 1;
  //             }
  //           } else {
  //             console.log("没有发现新数据,继续监控...");
  //             return 1; // restart from the first page
  //           }
  //         } catch (error) {
  //           console.error("数据库操作失败:", error);
  //         }
  //       },
  //     });
  //   } catch (error) {
  //     console.error("增量爬取失败:", error);
  //   }
  // }

  /**
   * Converts raw list rows scraped in the page into announcement records.
   *
   * Text fields are trimmed and each link is resolved against `baseUrl`, so
   * both relative and already-absolute hrefs yield one well-formed URL. Rows
   * without a link (or with an unparsable one) are dropped because the URL is
   * used as the record id.
   *
   * @param {Array<{name: string, href: string, publishTime: string}>} rawItems
   * @param {string} baseUrl - URL of the page the rows were scraped from.
   * @returns {Array<{id: string, name: string, publishTime: string, urls: string}>}
   */
  static toAnnouncements(rawItems, baseUrl) {
    const announcements = [];
    for (const { name, href, publishTime } of rawItems) {
      if (!href) {
        continue;
      }
      let url;
      try {
        url = new URL(href, baseUrl).href;
      } catch {
        continue;
      }
      announcements.push({
        id: url,
        name: String(name ?? "").trim(),
        publishTime: String(publishTime ?? "").trim(),
        urls: url,
      });
    }
    return announcements;
  }

  /**
   * Loads the announcement list page in a browser and extracts its rows.
   *
   * Failures are logged and result in an empty `info` array rather than a
   * rejection. The browser is always closed, even on failure.
   *
   * NOTE(review): `pagenumber` is only logged — the same index page is loaded
   * for every page number — and `pages` is a fixed 5. Confirm whether real
   * pagination is intended.
   *
   * @param {number} [pagenumber=1] - Page number requested by the caller.
   * @returns {Promise<{pages: number, info: Array<object>}>}
   */
  async getInfo(pagenumber = 1) {
    const listUrl = 'https://eps.cntaiping.com/cms/default/webfile/2ywgg1/index.html';
    const pages = 5;
    let info = [];
    let browser;
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    try {
      // NOTE(review): headless: false needs a display; kept as in the original.
      browser = await puppeteer.launch({headless: false});
      const page = await browser.newPage();
      await page.setRequestInterception(true);
      page.on('console', msg => console.log('PAGE LOG:', msg.text()));
      // Rewrite outgoing request headers before letting each request through.
      page.on('request', (interceptedRequest) => {
        if (interceptedRequest.isInterceptResolutionHandled()) {
          return;
        }
        const headers = interceptedRequest.headers();
        // Present a common desktop Safari User-Agent.
        headers['user-agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15';
        // Drop Cache-Control from the outgoing request.
        delete headers['cache-control'];
        // continue() returns a promise; do not leave a rejection unhandled.
        interceptedRequest.continue({ headers }).catch((continueError) => {
          console.error("请求拦截处理失败:", continueError);
        });
      });
      await page.goto(listUrl);
      await page.setViewport({width: 1080, height: 1024});
      // This callback runs inside the page: it cannot see Node-side variables
      // and may only return serializable data, so return plain objects.
      const rawItems = await page.$$eval('.content-ul li', (elements) =>
        elements.map((e) => ({
          name: e.querySelector('a span')?.textContent ?? '',
          // Raw attribute (possibly relative); resolved to absolute in Node.
          href: e.querySelector('a')?.getAttribute('href') ?? '',
          publishTime: e.querySelector('.f-right span')?.textContent ?? '',
        }))
      );
      info = CNTAIPING.toAnnouncements(rawItems, listUrl);
    } catch (error) {
      console.log(error,"______获取页面数据失败");
    } finally {
      // Always release the browser process; otherwise every call leaks one.
      if (browser) {
        await browser.close().catch((closeError) => {
          console.error("关闭浏览器失败:", closeError);
        });
      }
    }
    console.log("info",info);
    return { pages, info };
  }
}
// Instantiate the spider when this module is loaded; the constructor itself calls start().
new CNTAIPING();