✨ feat(太平保险暂停):
This commit is contained in:
parent
15a5d1b60a
commit
1ff2434fa2
|
|
@ -161,5 +161,21 @@ module.exports = {
|
|||
log_file: "./logs/ccic-combined.log",
|
||||
time: true,
|
||||
},
|
||||
// {//太平保险爬虫
|
||||
// name: "cntaiping-spider",
|
||||
// script: "./service/cntaiping.js",
|
||||
// instances: 1,
|
||||
// autorestart: true,
|
||||
// watch: false,
|
||||
// max_memory_restart: "300M",
|
||||
// env: {
|
||||
// NODE_ENV: "production",
|
||||
// SPIDER_NAME: "cntaiping",
|
||||
// },
|
||||
// error_file: "./logs/cntaiping-error.log",
|
||||
// out_file: "./logs/cntaiping-out.log",
|
||||
// log_file: "./logs/cntaiping-combined.log",
|
||||
// time: true,
|
||||
// },
|
||||
],
|
||||
};
|
||||
|
|
|
|||
|
|
@ -18,6 +18,9 @@
|
|||
"better-sqlite3": "^12.4.1",
|
||||
"cheerio": "^1.1.2",
|
||||
"json5": "^2.2.3",
|
||||
"nodemailer": "^7.0.6"
|
||||
"nodemailer": "^7.0.6",
|
||||
"puppeteer": "^24.27.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,187 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import puppeteer from 'puppeteer';
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
|
||||
import config from "../config.js";
|
||||
import { SQLiteMessageQueue } from "../sqlite.js";
|
||||
|
||||
/**
 * Spider for the Taiping Insurance ("太平保险") announcement list page.
 *
 * On construction it creates a SQLite-backed queue and starts crawling:
 * a full fetch when the queue holds no announcements for this spider,
 * otherwise an incremental polling loop.
 */
class CNTAIPING {
  /** Key under which this spider's rows are stored in the queue. */
  static SOURCE_NAME = "太平保险";

  /** List page that is rendered with Puppeteer and scraped. */
  static LIST_URL =
    "https://eps.cntaiping.com/cms/default/webfile/2ywgg1/index.html";

  /** User-Agent forced onto every outgoing request of the page. */
  static USER_AGENT =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15";

  /**
   * Turn the raw rows extracted from the page into announcement records.
   *
   * Rows without an href are dropped. Relative hrefs are resolved against
   * `baseUrl`; absolute hrefs are kept as they are. The resolved URL doubles
   * as the record id.
   *
   * @param {Array<{name: string, href: string, publishTime: string}>} rows
   *   Plain objects returned from the in-page extraction.
   * @param {string} baseUrl - URL the hrefs are relative to.
   * @returns {Array<{id: string, name: string, publishTime: string, urls: string}>}
   * @throws {TypeError} If an href cannot be parsed as a URL.
   */
  static buildAnnouncements(rows, baseUrl) {
    return rows
      .filter((row) => Boolean(row.href))
      .map(({ name, href, publishTime }) => {
        const urls = new URL(href, baseUrl).href;
        return { id: urls, name, publishTime, urls };
      });
  }

  constructor() {
    this.info = [];
    console.log("太平保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Deliberately not awaited: start() catches and logs its own errors.
    void this.start();
  }

  /** Run init() and log, rather than propagate, any startup failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Choose the crawl mode: incremental when the queue already holds
   * announcements for this spider, full fetch otherwise.
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(
      CNTAIPING.SOURCE_NAME
    );
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full fetch: collect results through loopCall, then persist them and
   * enqueue a message. Database errors are logged, not rethrown.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        // getInfo() always scrapes the same list page regardless of the
        // page number, so stop as soon as the first page has been fetched.
        stopWhen: (pagenumber) => pagenumber >= 1,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(CNTAIPING.SOURCE_NAME, this.info);
              this.queue.addMessage(CNTAIPING.SOURCE_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
  }

  /**
   * Incremental fetch: poll through loopCall and store/enqueue only the
   * announcements the queue has not seen yet.
   *
   * NOTE(review): relies on `queue.filterNewAnnouncements`, as the previous
   * (commented-out) implementation did - confirm it exists on
   * SQLiteMessageQueue.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              CNTAIPING.SOURCE_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // start over from the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(CNTAIPING.SOURCE_NAME, newInfo);
            this.queue.addMessage(CNTAIPING.SOURCE_NAME, newInfo);
            // Everything was new: look at the next page. Otherwise the known
            // data has been reached, so start over from the first page.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            return 1; // keep polling from the first page after a failure
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Render the list page in a browser and scrape its announcement rows.
   *
   * Failures are logged and yield an empty `info` array. The browser is
   * always closed, whether or not scraping succeeded.
   *
   * @param {number} [pagenumber=1] - Only used for logging; the same list
   *   page is loaded for every value.
   * @returns {Promise<{pages: number, info: Array<object>}>}
   */
  async getInfo(pagenumber = 1) {
    const pages = 5;
    let info = [];
    let browser;
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    try {
      // NOTE(review): headless: false needs a display; confirm this is
      // intended for the production (pm2) environment.
      browser = await puppeteer.launch({ headless: false });
      const page = await browser.newPage();
      await page.setViewport({ width: 1080, height: 1024 });
      await page.setRequestInterception(true);
      page.on("console", (msg) => console.log("PAGE LOG:", msg.text()));
      page.on("request", (interceptedRequest) => {
        // Override the User-Agent and drop Cache-Control on every request.
        const headers = {
          ...interceptedRequest.headers(),
          "user-agent": CNTAIPING.USER_AGENT,
        };
        delete headers["cache-control"];
        interceptedRequest.continue({ headers }).catch((error) => {
          // continue() returns a promise; do not leave a rejection unhandled.
          console.log(error, "______获取页面数据失败");
        });
      });

      await page.goto(CNTAIPING.LIST_URL);

      // This callback runs inside the page, so it cannot touch Node-side
      // variables. It returns plain serializable data only.
      const rows = await page.$$eval(".content-ul li", (elements) =>
        elements.map((element) => {
          const anchor = element.querySelector("a");
          const title = element.querySelector("a span");
          const date = element.querySelector(".f-right span");
          return {
            name: title ? title.textContent.trim() : "",
            href: anchor ? anchor.getAttribute("href") || "" : "",
            publishTime: date ? date.textContent.trim() : "",
          };
        })
      );
      info = CNTAIPING.buildAnnouncements(rows, CNTAIPING.LIST_URL);
    } catch (error) {
      console.log(error, "______获取页面数据失败");
    } finally {
      if (browser) {
        try {
          await browser.close();
        } catch (error) {
          console.log(error, "______获取页面数据失败");
        }
      }
    }
    console.log("info", info);
    return { pages, info };
  }
}
|
||||
|
||||
// Instantiate the spider; its constructor starts the crawl.
new CNTAIPING();
|
||||
Loading…
Reference in New Issue