// ---------------------------------------------------------------------------
// service/qhr.js — Qianhai Reinsurance (前海再保险) announcement spider
//
// Free names used below (axios, cheerio, config, loopCall, keywordsInclude,
// SQLiteMessageQueue) come from the module's existing import lines.
// ---------------------------------------------------------------------------

/**
 * Resolve an anchor's href against the site base URL.
 *
 * Using URL resolution instead of string concatenation keeps root-relative
 * hrefs ("/a.html") from producing a double slash and leaves absolute hrefs
 * untouched. Falls back to plain concatenation if the pair cannot be parsed.
 *
 * @param {string} href - href attribute read from the listing page
 * @param {string} base - site base URL, e.g. "https://www.qianhaire.com/"
 * @returns {string} absolute URL of the announcement
 */
function toAbsoluteUrl(href, base) {
  try {
    return new URL(href, base).href;
  } catch (err) {
    return `${base}${href}`;
  }
}

/**
 * Spider for the Qianhai Reinsurance announcement listing.
 *
 * Constructing an instance opens the SQLite-backed queue and immediately
 * starts crawling: a full crawl when the queue holds no announcements for a
 * source yet, otherwise incremental polling.
 */
class QHR {
  constructor() {
    // One entry per crawled source; `info` accumulates rows during a full crawl.
    this.jsonMap = [
      {
        name: "前海再保险",
        info: [],
        options: {
          name: "前海再保险",
          url: "https://www.qianhaire.com/",
          homeIndex: "company-info-10.html",
        },
      },
    ];
    console.log("前海再保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose: start() catches and logs its own errors.
    this.start();
  }

  /**
   * Run init() and log (rather than propagate) any startup failure.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Pick the crawl mode for every configured source: incremental when the
   * queue already has announcements stored for it, full crawl otherwise.
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: collect the listing, persist it, enqueue a message, then
   * switch to incremental polling.
   *
   * NOTE(review): both readyForNext and complete push `result.info`; whether
   * the last page reaches both callbacks depends on loopCall, which is not
   * visible here — confirm that rows are not pushed twice.
   *
   * @param {{name: string, info: object[], options: object}} props - jsonMap entry
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        // The listing has a single page, so stop once page 1 has been read.
        stopWhen: (pagenumber) => pagenumber > 1,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling: on every tick keep only announcements the queue has
   * not stored yet, persist them and enqueue a message.
   *
   * The value returned from readyForNext is the next page number: the
   * following page when a whole page was new, otherwise page 1 again.
   *
   * @param {{name: string, info: object[], options: object}} props - jsonMap entry
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // A page made up entirely of new rows may be followed by more new
            // rows; a partially known page means we have caught up.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Previously this path returned undefined as the next page number;
            // restart from page 1 like the other "nothing more to do" branches.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetch the listing page and turn its cards into announcement records.
   *
   * Cards are skipped when they have no link, when their text does not start
   * with a date (extractDateTimeAndTitle returns null), or when the title
   * fails keywordsInclude.
   *
   * @param {number} [pagenumber=1] - page number, used for logging only
   * @param {{name: string, url: string, homeIndex: string}} config - source
   *   options; note this parameter shadows the imported `config` module
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 with an
   *   empty `info` when the request failed
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [err, html] = await this.getList(config);
    if (err) {
      // status is often undefined for network-level failures, so log the message too.
      console.error("获取页面数据失败:", err.status, err.message);
      return { pages: 0, info: [] };
    }

    // The listing exposes no end-time field; the page count defaults to 5.
    const pages = 5;
    const info = [];
    const $ = cheerio.load(html);
    $(".uk-container .uk-card").each((index, element) => {
      const link = $(element).find("a");
      const id = link.attr("href");
      const parsed = extractDateTimeAndTitle(link.text());
      // Bare `return` moves on to the next card (returning false would stop
      // the iteration).
      if (!id || !parsed) {
        return;
      }
      const name = parsed.title;
      if (!keywordsInclude(name)) {
        return;
      }
      console.log("处理项目:", id, name);
      info.push({
        id: id,
        name: name,
        publishTime: parsed.date,
        endTime: null,
        urls: toAbsoluteUrl(id, config.url),
      });
    });
    return { pages, info };
  }

  /**
   * Download the listing page (the site serves it as a single page).
   *
   * @param {{url: string, homeIndex: string}} config - source options
   * @returns {Promise<[Error|null, string|null]>} `[null, body]` on success,
   *   `[error, null]` on failure; never rejects for request errors
   */
  async getList(config) {
    const url = config.url + config.homeIndex;
    try {
      const res = await axios({ url: url, method: "get" });
      return [null, res.data];
    } catch (err) {
      return [err, null];
    }
  }
}
// service/qhr.js bootstraps the crawler with `new QHR();` right after the
// class; that statement is left out of this block so the definitions can be
// loaded without starting a crawl.

// ---------------------------------------------------------------------------
// utils.js — helper added alongside the spider
// ---------------------------------------------------------------------------

/**
 * Split a listing label into its leading date and the title that follows.
 *
 * Surrounding whitespace is removed before matching, so text taken straight
 * from HTML (leading/trailing spaces or newlines) is accepted.
 *
 * @param {string} input - e.g. "2025.05.27 万得软件及数据服务项目单一来源采购结果公示"
 * @returns {{date: string, title: string}|null} the date exactly as written
 *   (YYYY.MM.DD or YYYY-MM-DD) and the trimmed title, or null when the input
 *   is not a string or does not start with a date followed by a title
 */
function extractDateTimeAndTitle(input) {
  if (typeof input !== "string") return null;

  const text = input.trim();
  if (!text) return null;

  // Date (YYYY.MM.DD or YYYY-MM-DD), whitespace, then the title.
  const dateRegex = /^(\d{4}[.\-]\d{2}[.\-]\d{2})\s+(.+)$/;
  const match = text.match(dateRegex);
  if (!match) return null;

  return {
    date: match[1],
    title: match[2].trim(),
  };
}