From 1f1ef2504494984217f438539cf1eece5df64d0c Mon Sep 17 00:00:00 2001
From: huzhengrong
Date: Wed, 29 Oct 2025 16:32:00 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20feat(huahai-spider):=20=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=E5=8D=8E=E6=B5=B7=E4=BF=9D=E9=99=A9=E7=88=AC=E8=99=AB?=
 =?UTF-8?q?=E5=8F=8A=E7=9B=B8=E5=85=B3=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ecosystem.config.cjs |  16 ++++
 service/huahai.js    | 215 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+)
 create mode 100644 service/huahai.js

diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs
index 011313f..66934f9 100644
--- a/ecosystem.config.cjs
+++ b/ecosystem.config.cjs
@@ -129,5 +129,21 @@ module.exports = {
       log_file: "./logs/qhr-combined.log",
       time: true,
     },
+    { // Huahai Insurance spider
+      name: "huahai-spider",
+      script: "./service/huahai.js",
+      instances: 1,
+      autorestart: true,
+      watch: false,
+      max_memory_restart: "300M",
+      env: {
+        NODE_ENV: "production",
+        SPIDER_NAME: "huahai",
+      },
+      error_file: "./logs/huahai-error.log",
+      out_file: "./logs/huahai-out.log",
+      log_file: "./logs/huahai-combined.log",
+      time: true,
+    },
   ],
 };
diff --git a/service/huahai.js b/service/huahai.js
new file mode 100644
index 0000000..1d9bcfb
--- /dev/null
+++ b/service/huahai.js
@@ -0,0 +1,215 @@
+// Huahai Insurance tender-announcement spider.
+import axios from "axios";
+import fs from "fs";
+import path from "path";
+import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
+import config from "../config.js";
+import { SQLiteMessageQueue } from "../sqlite.js";
+import * as cheerio from "cheerio";
+
+// Abort a list request after this many milliseconds. axios applies no
+// timeout by default, so a stalled connection would never settle.
+const REQUEST_TIMEOUT_MS = 30000;
+
+/**
+ * Spider for the announcement list at https://www.cnoic.com/zbgg/.
+ *
+ * Constructing an instance starts crawling immediately. A source with no
+ * stored announcements is crawled in full first; otherwise only the
+ * incremental polling loop is started.
+ */
+class HUAHAI {
+  constructor() {
+    // One entry per crawled source; info accumulates full-crawl results.
+    this.jsonMap = [
+      {
+        name: "华海保险",
+        info: [],
+        options: {
+          name: "华海保险",
+          url: "https://www.cnoic.com/zbgg/",
+          homeIndex: "index.jhtml",
+        },
+      },
+    ];
+    console.log("华海保险 爬虫启动...");
+    this.queue = new SQLiteMessageQueue();
+    this.start();
+  }
+
+  /** Runs init() and logs, rather than propagates, any startup failure. */
+  async start() {
+    try {
+      await this.init();
+    } catch (err) {
+      console.error("启动失败:", err);
+    }
+  }
+
+  /**
+   * Chooses the crawl mode per source: incremental when the queue already
+   * holds announcements for it, full otherwise.
+   */
+  async init() {
+    for (const item of this.jsonMap) {
+      const announcements = this.queue.getAnnouncementsBySpider(item.name);
+      if (announcements.length > 0) {
+        this.loopFetchIncrement(item);
+      } else {
+        this.loopFetchFull(item);
+      }
+    }
+  }
+
+  /**
+   * Full crawl: pages through the list, then saves and publishes everything
+   * collected and hands over to the incremental loop.
+   *
+   * @param {object} props - jsonMap entry (name, info, options).
+   */
+  loopFetchFull(props) {
+    try {
+      loopCall(this.getInfo.bind(this), {
+        time: config.fullFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        stopWhen: (pagenumber, result) => {
+          return (
+            pagenumber >= result.pages || pagenumber >= config.pageNumberLimit
+          );
+        },
+        readyForNext: (pagenumber, result) => {
+          props.info.push(...result.info);
+          return pagenumber + 1;
+        },
+        complete: (result) => {
+          // NOTE(review): assumes loopCall does not also hand the final page
+          // to readyForNext, otherwise it is pushed twice. Confirm in utils.js.
+          props.info.push(...result.info);
+          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
+          try {
+            if (props.info.length > 0) {
+              this.queue.saveAnnouncements(props.name, props.info);
+              this.queue.addMessage(props.name, props.info);
+            }
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+          }
+          this.loopFetchIncrement(props);
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}全量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Incremental crawl: polls from page 1 and saves/publishes only the
+   * announcements the queue has not seen yet.
+   *
+   * @param {object} props - jsonMap entry (name, options).
+   */
+  loopFetchIncrement(props) {
+    try {
+      loopCall(this.getInfo.bind(this), {
+        time: config.incrementFetchTime,
+        pagenumber: 1,
+        additional: props.options,
+        readyForNext: (pagenumber, result) => {
+          try {
+            const newInfo = this.queue.filterNewAnnouncements(
+              props.name,
+              result.info
+            );
+            if (newInfo.length === 0) {
+              console.log("没有发现新数据,继续监控...");
+              return 1;
+            }
+            console.log(`发现 ${newInfo.length} 条新数据`);
+            this.queue.saveAnnouncements(props.name, newInfo);
+            this.queue.addMessage(props.name, newInfo);
+            // A page of only new items may be followed by more new items, so
+            // advance; any already-seen item means we have caught up.
+            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
+          } catch (error) {
+            console.error("数据库操作失败:", error);
+            // Always return a page number: restart from page 1 instead of
+            // implicitly returning undefined.
+            return 1;
+          }
+        },
+      });
+    } catch (error) {
+      console.error(`${props.options.name}增量爬取失败:`, error);
+    }
+  }
+
+  /**
+   * Fetches one list page and extracts the announcements whose title passes
+   * keywordsInclude().
+   *
+   * @param {number} [pagenumber=1] - 1-based list page to fetch.
+   * @param {object} options - Source settings (name, url, homeIndex).
+   * @returns {Promise<{pages: number, info: object[]}>} pages is 0 when the
+   *   request failed, otherwise a fixed 5.
+   */
+  async getInfo(pagenumber = 1, options) {
+    console.log(`${options.name}--获取第 ${pagenumber} 页数据...`);
+    const [err, html] = await this.getList(pagenumber, options);
+    if (err) {
+      // axios puts the HTTP status on err.response; network failures and
+      // timeouts have no response, so fall back to the error message.
+      const reason = err.response ? err.response.status : err.message;
+      console.error("获取页面数据失败:", reason);
+      return { pages: 0, info: [] };
+    }
+
+    // The list has no end-time field to stop on, so crawl a fixed 5 pages.
+    const pages = 5;
+    const info = [];
+    const $ = cheerio.load(html);
+    $(".callbids-list li").each((index, element) => {
+      const item = $(element);
+      const href = item.find("a").attr("href");
+      const name = item.find("h6").text();
+      // Skip entries without a link (no usable id) or without a keyword match.
+      if (!href || !keywordsInclude(name)) {
+        return;
+      }
+      console.log("处理项目:", href, name);
+      info.push({
+        id: href,
+        name: name,
+        // Class selector: a bare "callbids-time" is a type selector and would
+        // only match <callbids-time> tags.
+        publishTime: item.find(".callbids-time").text(),
+        endTime: null,
+        urls: href,
+      });
+    });
+    return { pages, info };
+  }
+
+  /**
+   * Requests one list page. The first page is homeIndex; later pages are
+   * index_<n>.jhtml under the same base URL.
+   *
+   * @param {number} pagenumber - 1-based list page.
+   * @param {object} options - Source settings (url, homeIndex).
+   * @returns {Promise<Array>} [null, body] on success, [error, null] on
+   *   failure; the returned promise never rejects.
+   */
+  getList(pagenumber, options) {
+    const page =
+      pagenumber === 1 ? options.homeIndex : `index_${pagenumber}.jhtml`;
+    return axios({
+      url: options.url + page,
+      method: "get",
+      timeout: REQUEST_TIMEOUT_MS,
+    })
+      .then((res) => [null, res.data])
+      .catch((err) => [err, null]);
+  }
+}
+
+new HUAHAI();