From 50a74a353f4adb546b414b13e87b0d86fffe29ed Mon Sep 17 00:00:00 2001
From: huzhengrong
Date: Mon, 27 Oct 2025 17:32:33 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=AD=E5=9B=BD=E4=BA=BA=E5=AF=BF=E4=BF=9D?=
 =?UTF-8?q?=E9=99=A9=E6=9A=82=E5=81=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ecosystem.config.cjs  |  16 ++++
 service/china-life.js | 243 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 259 insertions(+)
 create mode 100644 service/china-life.js

diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs
index a6e812f..736d73f 100644
--- a/ecosystem.config.cjs
+++ b/ecosystem.config.cjs
@@ -49,5 +49,21 @@ module.exports = {
       log_file: "./logs/cpic-combined.log",
       time: true,
     },
+    // { // China Life Insurance spider (paused)
+    //   name: "china-life-spider",
+    //   script: "./service/china-life.js",
+    //   instances: 1,
+    //   autorestart: true,
+    //   watch: false,
+    //   max_memory_restart: "300M",
+    //   env: {
+    //     NODE_ENV: "production",
+    //     SPIDER_NAME: "china-life",
+    //   },
+    //   error_file: "./logs/china-life-error.log",
+    //   out_file: "./logs/china-life-out.log",
+    //   log_file: "./logs/china-life-combined.log",
+    //   time: true,
+    // },
   ],
 };
diff --git a/service/china-life.js b/service/china-life.js
new file mode 100644
index 0000000..a4fa4f7
--- /dev/null
+++ b/service/china-life.js
@@ -0,0 +1,243 @@
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
+import config from "../config.js";
+import { SQLiteMessageQueue } from "../sqlite.js";
+import puppeteer from "puppeteer";
+
+// CSS selector matching one announcement card on the listing page.
+const CARD_SELECTOR = ".cardPadding";
+// Upper bound (ms) on waiting for the script-rendered list to appear.
+const LIST_RENDER_TIMEOUT_MS = 15000;
+
+/**
+ * Spider for the China Life Insurance procurement announcement page.
+ *
+ * Constructing an instance starts crawling immediately: each configured site
+ * gets a full crawl when the queue holds no announcements for it yet, and an
+ * incremental crawl otherwise. New announcements are saved and published
+ * through SQLiteMessageQueue.
+ */
+class DF {
+  constructor() {
+    this.jsonMap = [
+      {
+        name: "中国人寿保险",
+        info: [],
+        options: {
+          name: "中国人寿保险",
+          url: "https://cpmsx.e-chinalife.com/xycms/#/procurementAnncmnt",
+          homeIndex: "",
+        },
+      },
+    ];
+    console.log("中国人寿保险 爬虫启动...");
+    this.queue = new SQLiteMessageQueue();
+    // Not awaited on purpose: start() catches and logs its own failures.
+    this.start();
+  }
+
+  /** Runs init() and logs a startup failure instead of letting it escape. */
+  async start() {
+    try {
+      await this.init();
+    } catch (err) {
+      console.error("启动失败:", err);
+    }
+  }
+
+  /**
+   * Picks the crawl mode for every entry of this.jsonMap: incremental when
+   * the queue already has announcements for that spider, full otherwise.
+   */
+  async init() {
+    for (const item of this.jsonMap) {
+      const announcements = this.queue.getAnnouncementsBySpider(item.name);
+      if (announcements.length > 0) {
+        this.loopFetchIncrement(item);
+      } else {
+        this.loopFetchFull(item);
+      }
+    }
+  }
+
+  /**
+   * Full crawl. Hands getInfo to loopCall (from ../utils.js) starting at page
+   * 1, accumulates every page's rows in props.info, and on completion saves
+   * and publishes them before switching to incremental crawling.
+   *
+   * @param {{name: string, info: object[], options: object}} props - One
+   *   entry of this.jsonMap.
+   */
+  loopFetchFull(props) {
+    try {
+      loopCall(this.getInfo.bind(this), {
+        time: config.fullFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        // Stop at the last reported page or at the configured page cap.
+        stopWhen: (pagenumber, result) =>
+          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
+        readyForNext: (pagenumber, result) => {
+          props.info.push(...result.info);
+          return pagenumber + 1;
+        },
+        complete: (result) => {
+          props.info.push(...result.info);
+          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
+          try {
+            if (props.info.length > 0) {
+              this.queue.saveAnnouncements(props.name, props.info);
+              this.queue.addMessage(props.name, props.info);
+            }
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+          }
+          this.loopFetchIncrement(props);
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}全量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Incremental crawl. Re-reads the listing through loopCall and saves and
+   * publishes only the rows the queue reports as new.
+   *
+   * @param {{name: string, info: object[], options: object}} props - One
+   *   entry of this.jsonMap.
+   */
+  loopFetchIncrement(props) {
+    try {
+      loopCall(this.getInfo.bind(this), {
+        // Polling interval; the original author noted it as 5 minutes.
+        time: config.incrementFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        readyForNext: (pagenumber, result) => {
+          try {
+            const newInfo = this.queue.filterNewAnnouncements(
+              props.name,
+              result.info
+            );
+            if (newInfo.length === 0) {
+              console.log("没有发现新数据,继续监控...");
+              return 1;
+            }
+            console.log(`发现 ${newInfo.length} 条新数据`);
+            this.queue.saveAnnouncements(props.name, newInfo);
+            this.queue.addMessage(props.name, newInfo);
+            // A page made up entirely of new rows may be followed by more
+            // unseen rows, so move on; any overlap restarts from page 1.
+            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+            // Restart from page 1 instead of handing loopCall undefined.
+            return 1;
+          }
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}增量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Loads the listing page and extracts its announcement cards.
+   *
+   * @param {number} [pagenumber=1] - Requested page (see the TODO in getList).
+   * @param {{name: string, url: string}} config - Site options; shadows the
+   *   imported config module inside this method.
+   * @returns {Promise} Resolves to {pages, info}: page count and rows;
+   *   pages is 0 when the page could not be loaded.
+   */
+  async getInfo(pagenumber = 1, config) {
+    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
+    const [err, page, browser] = await this.getList(pagenumber, config);
+    if (err) {
+      // Launch/navigation errors carry no HTTP status; log the error itself.
+      console.error("获取页面数据失败:", err);
+      return { pages: 0, info: [] };
+    }
+
+    // getList cannot switch pages yet, so every call yields the first page.
+    // Report one page so a full crawl does not store the same rows several
+    // times; raise this once pagination works.
+    const pages = 1;
+    let info = [];
+    try {
+      // The URL is hash-routed, so the list is presumably rendered by page
+      // scripts after load; wait for the first card before reading.
+      await page.waitForSelector(CARD_SELECTOR, {
+        timeout: LIST_RENDER_TIMEOUT_MS,
+      });
+      // The callback runs inside the browser page and cannot reach variables
+      // of this Node process, so rows are returned rather than pushed.
+      info = await page.$$eval(CARD_SELECTOR, (cards) =>
+        cards
+          .map((card) => {
+            const link = card.querySelector("a");
+            if (!link) {
+              return null;
+            }
+            const title = link.textContent.trim();
+            const released = card.querySelector(".releaseDate");
+            return {
+              // NOTE(review): the title doubles as the id, as in the first
+              // draft; confirm whether href would be a better unique key.
+              id: title,
+              name: title,
+              publishTime: released ? released.textContent.trim() : null,
+              endTime: null,
+              urls: link.getAttribute("href"),
+            };
+          })
+          .filter(Boolean)
+      );
+    } catch (error) {
+      console.log("getInfo失败:", error);
+    } finally {
+      // Always release the browser launched by getList.
+      await browser.close();
+    }
+    return { pages, info };
+  }
+
+  /**
+   * Opens the listing page in a freshly launched Puppeteer browser.
+   *
+   * @param {number} pagenumber - Requested page; currently ignored.
+   * @param {{url: string}} config - Site options.
+   * @returns {Promise} Resolves to [null, page, browser] on success (the
+   *   caller must close the browser) or [error, null, null] on failure.
+   */
+  async getList(pagenumber, config) {
+    const url = config.url;
+    let browser = null;
+    try {
+      browser = await puppeteer.launch();
+      const page = await browser.newPage();
+      await page.goto(url);
+      console.log("页面加载完成-page", url);
+      // TODO: pagination. The disabled draft typed pagenumber into
+      // ".el-pagination__jump input", pressed Enter and waited for the
+      // response from the .../exp/outer/getCmsContentList endpoint.
+      return [null, page, browser];
+    } catch (error) {
+      console.log("getList失败:", error);
+      if (browser) {
+        // Launch succeeded but a later step failed: do not leak the browser.
+        try {
+          await browser.close();
+        } catch (closeError) {
+          console.log("getList失败:", closeError);
+        }
+      }
+      return [error, null, null];
+    }
+  }
+}
+
+new DF();