import axios from "axios";
import fs from "fs";
import path from "path";
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
import config from "../config.js";
import { SQLiteMessageQueue } from "../sqlite.js";
import puppeteer from "puppeteer";

/**
 * Spider for the China Life Insurance procurement-announcement page.
 *
 * On construction it creates a SQLiteMessageQueue and immediately starts
 * crawling every entry in `jsonMap`: a full crawl when the queue holds no
 * announcements for that spider yet, otherwise an incremental crawl.
 * Scraped records are saved through the queue and also pushed as messages.
 */
class DF {
  constructor() {
    // One entry per site to crawl; `info` accumulates records during a full crawl.
    this.jsonMap = [
      {
        name: "中国人寿保险",
        info: [],
        options: {
          name: "中国人寿保险",
          url: "https://cpmsx.e-chinalife.com/xycms/#/procurementAnncmnt",
          homeIndex: "",
        },
      },
    ];
    console.log("中国人寿保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Constructors cannot be async; start() handles its own errors, so the
    // returned promise is intentionally not awaited.
    this.start();
  }

  /**
   * Runs init() and logs (rather than propagates) any startup failure.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Picks the crawl mode for every configured site: incremental when the
   * queue already has announcements for that spider, full otherwise.
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: walks pages from 1 until the reported page count or
   * `config.pageNumberLimit` is reached, stores everything collected, then
   * switches to incremental monitoring.
   *
   * NOTE(review): the exact callback contract (when `readyForNext` vs
   * `complete` fire) is defined by `loopCall` in ../utils.js — confirm there.
   *
   * @param {{name: string, info: object[], options: object}} props - a `jsonMap` entry.
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          // Keep monitoring for new announcements once the full crawl is done.
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental crawl: repeatedly fetches page 1 at the interval given by
   * `config.incrementFetchTime`, stores records the queue has not seen
   * before, and only advances to the next page while a page consists
   * entirely of new records.
   *
   * @param {{name: string, info: object[], options: object}} props - a `jsonMap` entry.
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: restart from the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // A page made up solely of new records means older pages may hold
            // more unseen data; any overlap means we caught up, so restart.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Return an explicit page number instead of falling through with
            // `undefined`; getInfo would default an undefined page to 1 anyway.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Loads the listing page and scrapes one record per `.cardPadding` element.
   *
   * @param {number} [pagenumber=1] - page to fetch (see the TODO in getList).
   * @param {{name: string, url: string}} config - site options; note this
   *   parameter shadows the module-level `config` import inside this method.
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 when the
   *   page could not be opened, otherwise a fixed 5; `info` holds the scraped
   *   records (empty if scraping failed).
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, page, browser] = await this.getList(pagenumber, config);
    if (error) {
      // `error` is whatever getList caught (not an HTTP response), so log it
      // directly rather than a `.status` field it does not have.
      console.error("获取页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    // The listing exposes no end-time field to bound the crawl, so default to 5 pages.
    const pages = 5;
    let info = [];
    try {
      // The URL is a hash-routed single-page app, so wait for the cards to be
      // rendered before reading them.
      await page.waitForSelector(".cardPadding");
      // The callback runs inside the browser page and cannot see Node-side
      // variables; it must return the records, which are sent back to Node.
      info = await page.$$eval(".cardPadding", (cards) =>
        cards.map((card) => {
          const anchor = card.querySelector("a");
          return {
            // The anchor markup doubles as both id and name.
            id: anchor.innerHTML,
            name: anchor.innerHTML,
            publishTime: card.querySelector(".releaseDate").innerHTML,
            endTime: null,
            urls: anchor.getAttribute("href"),
          };
        })
      );
      for (const { id, name } of info) {
        console.log("处理项目:", id, name);
      }
    } catch (err) {
      console.log("getInfo失败:", err);
    } finally {
      // Always release the browser process, whether or not scraping succeeded.
      await browser.close();
    }
    return { pages, info };
  }

  /**
   * Launches a browser and opens the listing page.
   *
   * TODO: pagination is not implemented — `pagenumber` is ignored, so every
   * call returns the first page. Because getInfo reports 5 pages, a full
   * crawl collects the same listing repeatedly (duplicates unless the queue
   * de-duplicates on save). The previously attempted, disabled approach was:
   *   if (pagenumber != 1) {
   *     await page.locator('.el-pagination__jump input').fill(pagenumber)
   *     await page.keyboard.press('Enter');
   *     await page.waitForResponse('https://cpmsx.e-chinalife.com/ocps/ocps-anncmnt-backend/microapp/web/exp/outer/getCmsContentList');
   *   }
   *
   * @param {number} pagenumber - requested page (currently unused).
   * @param {{url: string}} config - site options; shadows the module-level
   *   `config` import inside this method.
   * @returns {Promise<[Error|null, object|null, object|null]>}
   *   `[null, page, browser]` on success — the caller must close `browser`;
   *   `[error, null, null]` on failure, with any launched browser already closed.
   */
  async getList(pagenumber, config) {
    let browser = null;
    try {
      browser = await puppeteer.launch();
      const page = await browser.newPage();
      await page.goto(config.url);
      console.log("页面加载完成-page", page);
      return [null, page, browser];
    } catch (error) {
      console.log("getList失败:", error);
      if (browser) {
        // The launch succeeded but a later step failed: close the browser so
        // the Chromium process does not leak. Closing is best-effort.
        await browser
          .close()
          .catch((closeError) => console.log("getList失败:", closeError));
      }
      return [error, null, null];
    }
  }
}

new DF();