import axios from "axios";
import fs from "fs";
import path from "path";
import puppeteer from 'puppeteer';
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
import config from "../config.js";
import { SQLiteMessageQueue } from "../sqlite.js";

/** Spider key under which announcements are stored and messages are queued. */
const SPIDER_NAME = "太平保险";

/** Announcement index page that gets scraped. */
const LIST_URL = 'https://eps.cntaiping.com/cms/default/webfile/2ywgg1/index.html';

/** User-Agent forced onto every outgoing request of the scraped page. */
const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15';

/**
 * Crawler for the Taiping Insurance announcement list.
 *
 * Constructing an instance starts crawling immediately: when the queue already
 * holds announcements for this spider it polls incrementally, otherwise it
 * performs a full fetch.
 */
class CNTAIPING {
  constructor() {
    this.info = [];
    console.log("太平保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose (constructors cannot be async); start() handles
    // its own errors, so the promise never rejects.
    this.start();
  }

  /** Runs init() and logs any startup failure instead of letting it escape. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Picks incremental or full crawling depending on whether data is already stored. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: drives getInfo through loopCall, accumulates the rows in
   * this.info, then saves them and queues a message on completion.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        // Always true from page 1 on, i.e. only a single page is requested.
        stopWhen: (pagenumber) => pagenumber >= 1,
        // NOTE(review): readyForNext and complete both append result.info;
        // confirm loopCall never invokes both for the same page.
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(SPIDER_NAME, this.info);
              this.queue.addMessage(SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
  }

  /**
   * Incremental crawl: repeatedly fetches the list, keeps only announcements
   * the queue reports as new, then saves them and queues a message.
   *
   * init() calls this method, so it must exist; it was previously commented
   * out, which made init() throw whenever stored announcements were present.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // restart from the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(SPIDER_NAME, newInfo);
            this.queue.addMessage(SPIDER_NAME, newInfo);
            // Everything on this page was new: look at the next page too.
            // Otherwise some rows were already known: restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            return 1; // keep monitoring from the first page after a failure
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Opens the announcement index page in a browser and extracts its rows.
   *
   * @param {number} [pagenumber=1] Page number; it is only logged here, the
   *   same index URL is loaded regardless of its value.
   * @returns {Promise<{pages: number, info: Array<{id: string, name: string, publishTime: string, urls: string}>}>}
   *   `pages` is a hard-coded 5; `info` is empty when scraping fails.
   */
  async getInfo(pagenumber = 1) {
    const pages = 5;
    let info = [];
    console.log(`正在获取第 ${pagenumber} 页数据...`);

    let browser;
    try {
      browser = await puppeteer.launch({ headless: false });
      const page = await browser.newPage();
      await page.setViewport({ width: 1080, height: 1024 });
      await page.setRequestInterception(true);
      page.on('console', (msg) => console.log('PAGE LOG:', msg.text()));

      // Rewrite headers on every request: force a common User-Agent and drop
      // Cache-Control before letting the request continue.
      page.on('request', (interceptedRequest) => {
        const headers = {
          ...interceptedRequest.headers(),
          'user-agent': USER_AGENT,
        };
        delete headers['cache-control'];
        interceptedRequest.continue({ headers }).catch((error) => {
          console.log(error, "______获取页面数据失败");
        });
      });

      await page.goto(LIST_URL);

      // The callback runs inside the page, so it cannot touch Node-side
      // variables; it must return plain serializable objects instead.
      info = await page.$$eval('.content-ul li', (elements) =>
        elements
          .map((item) => {
            const link = item.querySelector('a');
            if (!link) {
              return null;
            }
            // The href property is already resolved to an absolute URL.
            const url = link.href;
            const title = item.querySelector('a span');
            const date = item.querySelector('.f-right span');
            return {
              id: url,
              name: title ? title.textContent.trim() : '',
              publishTime: date ? date.textContent.trim() : '',
              urls: url,
            };
          })
          .filter((entry) => entry !== null)
      );
    } catch (error) {
      console.log(error, "______获取页面数据失败");
    } finally {
      // Always release the browser process, even when scraping failed.
      if (browser) {
        try {
          await browser.close();
        } catch (closeError) {
          console.log(closeError, "______获取页面数据失败");
        }
      }
    }

    console.log("info", info);
    return { pages, info };
  }
}

new CNTAIPING();