// insurance-spider/service/cntaiping.js
//
// NOTE: the repository viewer flagged this file as containing Unicode
// characters that might be confused with other characters.

import axios from "axios";
import fs from "fs";
import path from "path";
import puppeteer from 'puppeteer';
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
import config from "../config.js";
import { SQLiteMessageQueue } from "../sqlite.js";
/**
 * Spider for the announcement list published on the China Taiping
 * ("太平保险") procurement site.
 *
 * Constructing an instance immediately starts crawling: a full fetch when the
 * local queue has no stored announcements for this spider, otherwise the
 * incremental polling mode.
 */
class CNTAIPING {
  constructor() {
    this.info = [];
    console.log("太平保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so the promise is intentionally not awaited.
    this.start();
  }

  /**
   * Entry point: runs init() and logs (rather than propagates) any failure.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode: incremental when announcements for this spider are
   * already stored, full fetch otherwise.
   * @returns {Promise<void>}
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider("太平保险");
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: collects announcements through loopCall and, once complete,
   * stores them and enqueues a message with the collected items.
   * @returns {Promise<void>}
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        // The listing is a single index page, so stop after the first page.
        stopWhen: (pagenumber) => pagenumber >= 1,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements("太平保险", this.info);
              this.queue.addMessage("太平保险", this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
  }

  /**
   * Incremental crawl: repeatedly fetches the listing through loopCall and
   * stores/enqueues only the announcements the queue reports as new.
   *
   * init() calls this method whenever stored announcements exist, so it must
   * be defined; without it startup fails with a TypeError.
   * @returns {Promise<void>}
   */
  async increment() {
    console.log("开始增量爬取模式每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              "太平保险",
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              // Nothing new: restart from the first page.
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements("太平保险", newInfo);
            this.queue.addMessage("太平保险", newInfo);
            // Everything on this page was new, so continue with the next page;
            // otherwise we reached known data and restart from the first page.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // NOTE(review): no page number is returned after a database
            // failure; confirm how loopCall treats an undefined result.
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Opens the announcement index page in a browser and extracts its entries.
   *
   * The page callback passed to $$eval runs inside the browser, so it cannot
   * touch Node-side variables; it returns plain serializable rows that are
   * normalized on the Node side. The browser is always closed afterwards.
   * Failures are logged and produce an empty `info` list instead of throwing.
   *
   * @param {number} [pagenumber=1] Page number, used for logging only; the
   *   same index URL is loaded regardless of its value.
   * @returns {Promise<{pages: number, info: Array<{id: string, name: string, publishTime: string, urls: string}>}>}
   */
  async getInfo(pagenumber = 1) {
    const pages = 5;
    let info = [];
    let browser = null;
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    try {
      browser = await puppeteer.launch({ headless: false });
      const page = await browser.newPage();
      await page.setRequestInterception(true);
      page.on("console", (msg) => console.log("PAGE LOG:", msg.text()));
      // Rewrite the headers of every outgoing request.
      page.on("request", (interceptedRequest) => {
        const headers = interceptedRequest.headers();
        // Present a common desktop Safari User-Agent.
        headers["user-agent"] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15';
        delete headers["cache-control"];
        // continue() returns a promise; log a rejection instead of leaving it
        // unhandled (e.g. when the page is closed while requests are pending).
        Promise.resolve(interceptedRequest.continue({ headers })).catch(
          (error) => console.error("请求拦截失败:", error)
        );
      });
      await page.goto("https://eps.cntaiping.com/cms/default/webfile/2ywgg1/index.html");
      await page.setViewport({ width: 1080, height: 1024 });
      const rows = await page.$$eval(".content-ul li", (elements) =>
        elements.map((element) => {
          const link = element.querySelector("a");
          const title = element.querySelector("a span");
          const date = element.querySelector(".f-right span");
          return {
            name: title ? title.textContent : "",
            // The DOM `href` property is already resolved to an absolute URL.
            href: link ? link.href : "",
            publishTime: date ? date.textContent : "",
          };
        })
      );
      info = this.toAnnouncements(rows);
    } catch (error) {
      console.error(error, "______获取页面数据失败");
    } finally {
      if (browser) {
        try {
          await browser.close();
        } catch (closeError) {
          console.error("关闭浏览器失败:", closeError);
        }
      }
    }
    console.log("info", info);
    return { pages, info };
  }

  /**
   * Converts raw rows extracted from the page into announcement records,
   * trimming text fields and skipping rows that lack a title or a link.
   *
   * @param {Array<{name: string, href: string, publishTime: string}>} rows
   * @returns {Array<{id: string, name: string, publishTime: string, urls: string}>}
   */
  toAnnouncements(rows) {
    const announcements = [];
    for (const row of rows || []) {
      const name = String(row.name || "").trim();
      const url = String(row.href || "").trim();
      if (name === "" || url === "") {
        continue;
      }
      announcements.push({
        id: url,
        name,
        publishTime: String(row.publishTime || "").trim(),
        urls: url,
      });
    }
    return announcements;
  }
}
// Module side effect: instantiating the spider starts crawling right away,
// because the constructor itself calls start().
new CNTAIPING();