🐛 feat(qhr-spider): 添加前海再保险爬虫及相关逻辑

This commit is contained in:
huzhengrong 2025-10-29 15:40:18 +08:00
parent ad69302e17
commit c35ab2bc3c
3 changed files with 221 additions and 1 deletions

View File

@ -113,5 +113,21 @@ module.exports = {
log_file: "./logs/sinosafe-combined.log",
time: true,
},
{// Qianhai Reinsurance (前海再保险) spider: runs ./service/qhr.js as a single instance
name: "qhr-spider",
script: "./service/qhr.js",
instances: 1,
autorestart: true,
watch: false,
max_memory_restart: "300M",
env: {
NODE_ENV: "production",
SPIDER_NAME: "qhr",
},
// Separate error / stdout / combined log files for this spider.
error_file: "./logs/qhr-error.log",
out_file: "./logs/qhr-out.log",
log_file: "./logs/qhr-combined.log",
time: true,
},
],
};

180
service/qhr.js Normal file
View File

@ -0,0 +1,180 @@
//前海再保险
import axios from "axios";
import fs from "fs";
import path from "path";
import { timestampToDate, loopCall, keywordsInclude, extractDateTimeAndTitle } from "../utils.js";
import config from "../config.js";
import { SQLiteMessageQueue } from "../sqlite.js";
import * as cheerio from "cheerio";
/**
 * Spider for Qianhai Reinsurance (前海再保险) announcements.
 *
 * On construction it decides, per configured site, whether to run a full crawl
 * (nothing stored yet for that spider name) or to go straight to incremental
 * polling. Results are persisted and queued through SQLiteMessageQueue.
 */
class QHR {
  constructor() {
    // One entry per site to crawl; `options` is handed to getInfo/getList.
    this.jsonMap = [
      {
        name: "前海再保险",
        info: [],
        options: {
          name: "前海再保险",
          url: "https://www.qianhaire.com/",
          homeIndex: "company-info-10.html",
        },
      },
    ];
    console.log("前海再保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this promise never rejects.
    this.start();
  }

  /**
   * Return `rows` without later entries whose `id` was already seen.
   * @param {Array<{id: *}>} rows
   * @returns {Array<{id: *}>} a new array, first occurrence of each id kept
   */
  static uniqueById(rows) {
    const seen = new Set();
    return rows.filter((row) => {
      if (seen.has(row.id)) {
        return false;
      }
      seen.add(row.id);
      return true;
    });
  }

  /** Run init() and log (rather than propagate) any start-up failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * For every configured site: poll incrementally when announcements are
   * already stored for it, otherwise perform the initial full crawl.
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: collect every page, store and queue the rows, then switch to
   * incremental polling.
   * @param {{name: string, info: Array, options: object}} props - jsonMap entry
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        // The site has a single list page, so stop once we are past page 1.
        stopWhen: (pagenumber, result) => pagenumber > 1,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          // NOTE(review): getList ignores the page number, and both
          // readyForNext and complete append result.info, so the same rows can
          // arrive twice depending on loopCall's call order. Dedupe by id so
          // nothing is saved or queued twice.
          props.info = QHR.uniqueById(props.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling: fetch the list repeatedly and store/queue only rows
   * that the queue reports as new.
   * @param {{name: string, info: Array, options: object}} props - jsonMap entry
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: restart from the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // Every row was new -> look at the next page; otherwise we have
            // caught up with stored data and restart from the first page.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetch the list page and turn each card into an announcement row.
   * @param {number} [pagenumber=1] - page being requested (used for logging only)
   * @param {{name: string, url: string, homeIndex: string}} config - site
   *   options; note this parameter shadows the imported `config` module here
   * @returns {Promise<{pages: number, info: Array<{id: string, name: string,
   *   publishTime: string, endTime: null, urls: string}>}>} `pages` is 0 and
   *   `info` empty when the request failed
   */
  async getInfo(pagenumber = 1, config) {
    const info = [];
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, html] = await this.getList(config);
    if (error) {
      // HTTP failures carry the status on error.response; network failures
      // only have a message.
      console.error(
        "获取页面数据失败:",
        error.response?.status ?? error.status ?? error.message
      );
      return { pages: 0, info: [] };
    }

    // The list exposes no page count, so report a fixed default of 5.
    const pages = 5;
    const $ = cheerio.load(html);
    $(".uk-container .uk-card").each((index, element) => {
      const anchor = $(element).find("a");
      const href = anchor.attr("href");
      // Anchor text looks like "2025.05.27 <title>"; trim the whitespace
      // that surrounds it in the markup before parsing.
      const parsed = extractDateTimeAndTitle(anchor.text().trim());
      if (!href || !parsed) {
        // A card without a link or without a "date title" label cannot be
        // turned into an announcement; skip it instead of failing the page.
        return;
      }

      let urls;
      try {
        // Resolves relative, root-relative and absolute hrefs alike.
        urls = new URL(href, config.url).href;
      } catch (urlError) {
        urls = "https://www.qianhaire.com/" + href;
      }

      const name = parsed.title;
      if (keywordsInclude(name)) {
        console.log("处理项目:", href, name);
        info.push({
          id: href,
          name: name,
          publishTime: parsed.date,
          endTime: null, // the list page exposes no end time
          urls: urls,
        });
      }
    });
    return { pages, info };
  }

  /**
   * Download the list page. The same URL (url + homeIndex) is requested for
   * every call; pagination is not implemented.
   * @param {{url: string, homeIndex: string}} config - site options
   * @returns {Promise<[Error|null, string|null]>} `[null, body]` on success,
   *   `[error, null]` on failure (never rejects)
   */
  async getList(config) {
    const url = config.url + config.homeIndex;
    try {
      const res = await axios({
        url: url,
        method: "get",
      });
      return [null, res.data];
    } catch (err) {
      return [err, null];
    }
  }
}
// Instantiate the spider at module load; the QHR constructor starts crawling immediately.
new QHR();

View File

@ -271,6 +271,29 @@ async function sendQYWechatMessage(message) {
throw error; // 重新抛出错误以便调用方可以处理
}
}
/**
 * Extract the leading date and the title from a string.
 *
 * Surrounding whitespace (including newlines, as found in scraped text) is
 * ignored. The date must come first, written as YYYY.MM.DD or YYYY-MM-DD, and
 * be separated from the title by whitespace.
 *
 * @param {string} input - e.g. "2025.05.27 万得软件及数据服务项目单一来源采购结果公示"
 * @returns {{date: string, title: string}|null} the date exactly as written and
 *   the trimmed title, or null when `input` is not a string or does not match
 */
function extractDateTimeAndTitle(input) {
  // Non-strings have no .match(); treat them like any other non-matching input.
  if (typeof input !== "string") return null;
  // Date format: YYYY.MM.DD or YYYY-MM-DD, then whitespace, then the title.
  const dateRegex = /^(\d{4}[.\-]\d{2}[.\-]\d{2})\s+(.+)$/;
  const match = input.trim().match(dateRegex);
  if (!match) return null;
  return {
    date: match[1],
    title: match[2].trim()
  };
}
export {
timestampToDate,
loopCall,
@ -279,6 +302,7 @@ export {
parseToGgDetailsParams,
addToMessageQueue,
md5,
sendQYWechatMessage
sendQYWechatMessage,
extractDateTimeAndTitle
// wechatPush
};