From e4679caf0bf77816b899b3d27bbd3d24bbc94906 Mon Sep 17 00:00:00 2001
From: huzhengrong
Date: Mon, 27 Oct 2025 18:14:22 +0800
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(cic):=20=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E4=B8=AD=E5=8D=8E=E4=BF=9D=E9=99=A9=E7=88=AC=E8=99=AB=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ecosystem.config.cjs |  16 ++++
 service/cic.js       | 203 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 service/cic.js

diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs
index 736d73f..de4a37f 100644
--- a/ecosystem.config.cjs
+++ b/ecosystem.config.cjs
@@ -65,5 +65,21 @@ module.exports = {
     //   log_file: "./logs/china-life-combined.log",
     //   time: true,
     // },
+    { // Zhonghua Insurance (cic.cn) spider
+      name: "cic-spider",
+      script: "./service/cic.js",
+      instances: 1,
+      autorestart: true,
+      watch: false,
+      max_memory_restart: "300M",
+      env: {
+        NODE_ENV: "production",
+        SPIDER_NAME: "cic",
+      },
+      error_file: "./logs/cic-error.log",
+      out_file: "./logs/cic-out.log",
+      log_file: "./logs/cic-combined.log",
+      time: true,
+    },
   ],
 };
diff --git a/service/cic.js b/service/cic.js
new file mode 100644
index 0000000..20ff434
--- /dev/null
+++ b/service/cic.js
@@ -0,0 +1,203 @@
+// Zhonghua Insurance (cic.cn) notice spider
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
+import config from "../config.js";
+import { SQLiteMessageQueue } from "../sqlite.js";
+import * as cheerio from "cheerio";
+
+// The list pages carry no total-page field, so a full crawl assumes this many.
+const DEFAULT_PAGE_COUNT = 5;
+// Without a timeout a stalled connection would keep the request pending forever.
+const REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * Crawls the notice list under https://www.cic.cn/notice/, keeps the entries
+ * whose title passes keywordsInclude(), and passes them to SQLiteMessageQueue.
+ * Constructing an instance starts crawling immediately.
+ */
+class CIC {
+  constructor() {
+    this.jsonMap = [
+      {
+        name: "中华保险",
+        info: [],
+        options: {
+          name: "中华保险",
+          url: "https://www.cic.cn/notice/",
+          homeIndex: "index.jhtml",
+        },
+      },
+    ];
+    console.log("中华保险 爬虫启动...");
+    this.queue = new SQLiteMessageQueue();
+    // start() catches its own errors, so its promise is left unawaited.
+    this.start();
+  }
+
+  /** Runs init() and logs, rather than rethrows, any start-up failure. */
+  async start() {
+    try {
+      await this.init();
+    } catch (err) {
+      console.error("启动失败:", err);
+    }
+  }
+
+  /**
+   * Chooses the crawl mode for each source: incremental polling when the queue
+   * already holds announcements for it, otherwise a full crawl.
+   */
+  async init() {
+    for (const item of this.jsonMap) {
+      const announcements = this.queue.getAnnouncementsBySpider(item.name);
+      if (announcements.length > 0) {
+        this.loopFetchIncrement(item);
+      } else {
+        this.loopFetchFull(item);
+      }
+    }
+  }
+
+  /**
+   * Full crawl: advances from page 1 until stopWhen reports that the page count
+   * or config.pageNumberLimit is reached, stores what was collected, then hands
+   * over to incremental polling.
+   * @param {object} props - A jsonMap entry ({ name, info, options }).
+   */
+  loopFetchFull(props) {
+    try {
+      loopCall(this.getInfo.bind(this), {
+        time: config.fullFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        stopWhen: (pagenumber, result) =>
+          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
+        readyForNext: (pagenumber, result) => {
+          props.info.push(...result.info);
+          return pagenumber + 1;
+        },
+        complete: (result) => {
+          props.info.push(...result.info);
+          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
+          try {
+            if (props.info.length > 0) {
+              this.queue.saveAnnouncements(props.name, props.info);
+              this.queue.addMessage(props.name, props.info);
+            }
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+          }
+          this.loopFetchIncrement(props);
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}全量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Incremental polling: starts at page 1 and only advances to the next page
+   * while a page consists entirely of announcements not yet stored.
+   * @param {object} props - A jsonMap entry ({ name, info, options }).
+   */
+  loopFetchIncrement(props) {
+    try {
+      loopCall(this.getInfo.bind(this), {
+        time: config.incrementFetchTime, // 5-minute interval
+        pagenumber: 1,
+        additional: props.options,
+        readyForNext: (pagenumber, result) => {
+          try {
+            const newInfo = this.queue.filterNewAnnouncements(
+              props.name,
+              result.info
+            );
+            if (newInfo.length === 0) {
+              console.log("没有发现新数据,继续监控...");
+              return 1; // start over from the first page
+            }
+            console.log(`发现 ${newInfo.length} 条新数据`);
+            this.queue.saveAnnouncements(props.name, newInfo);
+            this.queue.addMessage(props.name, newInfo);
+            // All new: keep paging. Partly seen: start over from the first page.
+            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+            // Always hand loopCall an explicit page instead of undefined.
+            return 1;
+          }
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}增量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Fetches one list page and extracts the entries whose title passes
+   * keywordsInclude().
+   * @param {number} [pagenumber=1] - Page number, starting at 1.
+   * @param {object} config - Source options ({ name, url, homeIndex }). Inside
+   *   this method the parameter shadows the imported config module.
+   * @returns {Promise<{pages: number, info: object[]}>} pages is 0 when the
+   *   request failed, which satisfies loopFetchFull's stopWhen condition.
+   */
+  async getInfo(pagenumber = 1, config) {
+    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
+    const [err, html] = await this.getList(pagenumber, config);
+    if (err) {
+      // No HTTP response means no status, so fall back to the code/message.
+      console.error(
+        "获取页面数据失败:",
+        err.response?.status ?? err.code ?? err.message
+      );
+      return { pages: 0, info: [] };
+    }
+    const info = [];
+    const $ = cheerio.load(html);
+    $(".g_ulconList li").each((index, element) => {
+      const item = $(element);
+      const href = item.find("a").attr("href");
+      const name = item.find("span").text().trim();
+      // An entry without a link would be stored with an undefined id; skip it.
+      if (!href || !keywordsInclude(name)) {
+        return;
+      }
+      console.log("处理项目:", href, name);
+      info.push({
+        id: href,
+        name,
+        publishTime: item.find("em").text().trim(),
+        // The list has no end-time field, so entries are not filtered by it.
+        endTime: null,
+        urls: href,
+      });
+    });
+    return { pages: DEFAULT_PAGE_COUNT, info };
+  }
+
+  /**
+   * Requests one list page. Page 1 is config.homeIndex; later pages are
+   * index_<n>.jhtml under config.url.
+   * @param {number} pagenumber - Page number, starting at 1.
+   * @param {object} config - Source options ({ url, homeIndex, timeout? });
+   *   timeout is in milliseconds and defaults to REQUEST_TIMEOUT_MS.
+   * @returns {Promise<Array>} [null, body] on success, [error, null] on
+   *   failure; the promise itself does not reject.
+   */
+  getList(pagenumber, config) {
+    const page =
+      pagenumber === 1 ? config.homeIndex : `index_${pagenumber}.jhtml`;
+    return axios({
+      url: `${config.url}${page}`,
+      method: "get",
+      timeout: config.timeout ?? REQUEST_TIMEOUT_MS,
+    })
+      .then((res) => [null, res.data])
+      .catch((err) => [err, null]);
+  }
+}
+
+new CIC();