feat(picc): 增强爬虫功能,添加关键词过滤,优化日志输出

This commit is contained in:
huzhengrong 2025-10-23 15:11:29 +08:00
parent 55db569680
commit 3cce44351f
2 changed files with 31 additions and 50 deletions

58
picc.js
View File

@ -1,14 +1,16 @@
import axios from "axios"; import axios from "axios";
import fs from "fs"; import fs from "fs";
import path from "path"; import path from "path";
import { timestampToDate, loopCall } from "./utils.js"; import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
import config from "./config.js"; import config from "./config.js";
import { SQLiteMessageQueue } from "./sqlite.js"; import { SQLiteMessageQueue } from "./sqlite.js";
class PICC { class PICC {
constructor() { constructor(key,categoryId) {
this.key = key;
this.categoryId = categoryId;
this.info = []; this.info = [];
console.log("中国人民保险 爬虫启动..."); console.log(`中国人民保险-${key} 爬虫启动...`);
this.queue = new SQLiteMessageQueue(); this.queue = new SQLiteMessageQueue();
this.start(); this.start();
} }
@ -17,7 +19,7 @@ class PICC {
try { try {
await this.init(); await this.init();
} catch (err) { } catch (err) {
console.error("启动失败:", err); console.error(`启动失败:-${this.key}`, err);
} }
} }
async init() { async init() {
@ -30,7 +32,7 @@ class PICC {
} }
// 全量爬取 // 全量爬取
async fullFetch() { async fullFetch() {
console.log("开始全量爬取..."); console.log(`${this.key}-开始全量爬取...`);
try { try {
await loopCall(this.getInfo.bind(this), { await loopCall(this.getInfo.bind(this), {
time: config.fullFetchTime, time: config.fullFetchTime,
@ -46,7 +48,7 @@ class PICC {
}, },
complete: (result) => { complete: (result) => {
this.info.push(...result.info); this.info.push(...result.info);
console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`); console.log(`${this.key}-爬取完成,共获取 ${this.info.length} 条有效数据`);
try { try {
if (this.info.length > 0) { if (this.info.length > 0) {
this.queue.saveAnnouncements("中国人民保险", this.info); this.queue.saveAnnouncements("中国人民保险", this.info);
@ -54,20 +56,20 @@ class PICC {
this.queue.addMessage("中国人民保险", this.info); this.queue.addMessage("中国人民保险", this.info);
} }
} catch (error) { } catch (error) {
console.error("数据库操作失败:", error); console.error(`${this.key}-数据库操作失败:`, error);
} }
}, },
}); });
} catch (error) { } catch (error) {
console.error("全量爬取失败:", error); console.error(`${this.key}-全量爬取失败:`, error);
} }
console.log("开始增量爬取..."); console.log(`开始增量爬取...-${this.key}`);
this.increment(); this.increment();
} }
// 增量爬取 // 增量爬取
async increment() { async increment() {
console.log("开始增量爬取模式每5分钟检查一次新数据..."); console.log(`${this.key}-开始增量爬取模式每5分钟检查一次新数据...`);
try { try {
await loopCall(this.getInfo.bind(this), { await loopCall(this.getInfo.bind(this), {
time: config.incrementFetchTime, // 5分钟间隔 time: config.incrementFetchTime, // 5分钟间隔
@ -80,7 +82,7 @@ class PICC {
); );
// 存在新数据 // 存在新数据
if (newInfo.length > 0) { if (newInfo.length > 0) {
console.log(`发现 ${newInfo.length} 条新数据`); console.log(`${this.key}-发现 ${newInfo.length} 条新数据`);
// this.info.push(...newInfo); // this.info.push(...newInfo);
this.queue.saveAnnouncements("中国人民保险", newInfo); this.queue.saveAnnouncements("中国人民保险", newInfo);
// this.writeFile(this.info); // this.writeFile(this.info);
@ -93,25 +95,25 @@ class PICC {
return 1; return 1;
} }
} else { } else {
console.log("没有发现新数据,继续监控..."); console.log(`${this.key}-没有发现新数据,继续监控...`);
return 1; // 重新从第一页开始 return 1; // 重新从第一页开始
} }
} catch (error) { } catch (error) {
console.error("数据库操作失败:", error); console.error(`${this.key}-数据库操作失败:`, error);
} }
}, },
}); });
} catch (error) { } catch (error) {
console.error("增量爬取失败:", error); console.error(`${this.key}-增量爬取失败:`, error);
} }
} }
async getInfo(pagenumber = 1) { async getInfo(pagenumber = 1) {
let info = []; let info = [];
console.log(`正在获取第 ${pagenumber} 页数据...`); console.log(`${this.key}-正在获取第 ${pagenumber} 页数据...`);
let result = await this.getList(pagenumber); let result = await this.getList(pagenumber);
if (result[0]) { if (result[0]) {
// 出错, 记录错误日志 // 出错, 记录错误日志
console.error("获取页面数据失败:", result[0]); console.error(`${this.key}-获取页面数据失败:`, result[0]);
return { pages: 0, info: [] }; return { pages: 0, info: [] };
} else { } else {
let total = result[1].res.total; let total = result[1].res.total;
@ -126,7 +128,7 @@ class PICC {
); );
// 命中关键词 // 命中关键词
if ( if (
this.keywordsInclude(item.title) && keywordsInclude(item.title) &&
endTime && endTime &&
+new Date(endTime) >= Date.now() +new Date(endTime) >= Date.now()
) { ) {
@ -152,7 +154,7 @@ class PICC {
url: "https://ec.picc.com/cms/api/dynamicData/queryContentPage", url: "https://ec.picc.com/cms/api/dynamicData/queryContentPage",
data: { data: {
dto:{ dto:{
categoryId:"211,213,214,215,216,217", categoryId:this.categoryId,
city:"", city:"",
county:"", county:"",
purchaseMode:"", purchaseMode:"",
@ -184,7 +186,7 @@ class PICC {
}) })
.then((res) => { .then((res) => {
let result = res.data; let result = res.data;
console.log("then",JSON.stringify(result.res.rows, null, 2)) console.log(`${this.key}-then`,JSON.stringify(result.res.rows.map(item=>item.title), null, 2))
if (result.msg === "操作成功" && result.code === 0) { if (result.msg === "操作成功" && result.code === 0) {
return [null, result]; return [null, result];
} else { } else {
@ -192,23 +194,11 @@ class PICC {
} }
}) })
.catch((err) => { .catch((err) => {
console.log('catch', err) console.log(`${this.key}-catch`, err)
return [err, null]; return [err, null];
}); });
} }
keywordsInclude(name) {
let keywords = [
"保险",
"车险",
"非车险",
"科技",
"大模型",
"承保",
"第三方平台",
];
return keywords.some((keyword) => name.includes(keyword));
}
} }
// Start one crawler per channel; each gets its own label and category id list.
// An argument-less `new PICC()` must not be created here: it would run with an
// undefined key and categoryId.
new PICC("集中采购", "211,213,214,215,216,217");
new PICC("分散采购", "251,253,254,255,256,257");

View File

@ -131,22 +131,13 @@ async function loopCall(fn, options = {}) {
} }
/**
 * Report whether a piece of text contains at least one tracked keyword.
 *
 * @param {string} name - Text to scan, e.g. an announcement title.
 * @returns {boolean} `true` when `name` is a string containing any keyword;
 *   `false` otherwise, including for non-string input.
 */
function keywordsInclude(name) {
  // A missing or non-string title cannot match; without this guard
  // `name.includes` throws a TypeError for null/undefined.
  if (typeof name !== "string") {
    return false;
  }
  const keywords = [
    "保险",
    "车险",
    "非车险",
    "科技",
    "大模型",
    "承保",
    "第三方平台",
  ];
  return keywords.some((keyword) => name.includes(keyword));
}