208 lines
6.8 KiB
JavaScript
208 lines
6.8 KiB
JavaScript
import axios from "axios";
|
||
import fs from "fs";
|
||
import path from "path";
|
||
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
|
||
import config from "../config.js";
|
||
import { SQLiteMessageQueue } from "../sqlite.js";
|
||
|
||
// Spider name passed to every SQLiteMessageQueue call made by this class.
const PICC_SPIDER_NAME = "中国人民保险";

// Page size sent to the list API; also used to derive the page count from `total`,
// so the two must always agree.
const PICC_PAGE_SIZE = 10;

/**
 * Crawler for announcements served by the ec.picc.com `queryContentPage` API.
 *
 * On construction it checks the SQLite-backed queue: when announcements are
 * already stored for this spider it goes straight to incremental polling,
 * otherwise it performs a full fetch first and then switches to incremental
 * polling.
 *
 * NOTE(review): `loopCall`, `timestampToDate` and `keywordsInclude` come from
 * ../utils.js and are not visible here; comments about them describe only how
 * they are used in this class — confirm details against their definitions.
 */
class PICC {
  /**
   * @param {string} key - Label used as a prefix/suffix in log output.
   * @param {string} categoryId - Value sent as `dto.categoryId` in the list request.
   */
  constructor(key, categoryId) {
    this.key = key;
    this.categoryId = categoryId;
    // Accumulates matching announcements during the full fetch.
    this.info = [];
    console.log(`${PICC_SPIDER_NAME}-${key} 爬虫启动...`);
    this.queue = new SQLiteMessageQueue();
    // Fire-and-forget on purpose: a constructor cannot await, and start()
    // catches and logs its own failures so nothing is left unhandled.
    void this.start();
  }

  /**
   * Entry point: runs init() and logs any error instead of propagating it.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error(`启动失败:-${this.key}`, err);
    }
  }

  /**
   * Chooses the crawl mode: incremental when the queue already holds
   * announcements for this spider, full fetch otherwise.
   * @returns {Promise<void>}
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(PICC_SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full fetch: pages through the list via loopCall, collecting matches into
   * `this.info`, persists them once finished, then hands over to increment().
   * @returns {Promise<void>}
   */
  async fullFetch() {
    console.log(`${this.key}-开始全量爬取...`);
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        // Stop once the last page is reached or getInfo() saw an expired item.
        stopWhen: (pagenumber, result) => {
          return pagenumber >= result.pages || result.stopFlag;
        },
        // Keep this page's matches and ask for the following page.
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        // Keep the final page's matches, then persist everything collected.
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`${this.key}-爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(PICC_SPIDER_NAME, this.info);
              this.queue.addMessage(PICC_SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error(`${this.key}-数据库操作失败:`, error);
          }
        },
      });
    } catch (error) {
      console.error(`${this.key}-全量爬取失败:`, error);
    }
    console.log(`开始增量爬取...-${this.key}`);
    // Awaited so the promise is not left floating; increment() logs its own errors.
    await this.increment();
  }

  /**
   * Incremental fetch: repeatedly requests pages via loopCall and stores only
   * the announcements the queue reports as new.
   * @returns {Promise<void>}
   */
  async increment() {
    console.log(`${this.key}-开始增量爬取模式,每5分钟检查一次新数据...`);
    try {
      await loopCall(this.getInfo.bind(this), {
        // Interval comes from config (the original note says 5 minutes).
        time: config.incrementFetchTime,
        pagenumber: 1,
        // Returns the page number to request next.
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              PICC_SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log(`${this.key}-没有发现新数据,继续监控...`);
              // Nothing new: go back to the first page.
              return 1;
            }
            console.log(`${this.key}-发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(PICC_SPIDER_NAME, newInfo);
            this.queue.addMessage(PICC_SPIDER_NAME, newInfo);
            // Every item was new, so the next page may hold more new items;
            // otherwise already-known data was reached and we restart at page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error(`${this.key}-数据库操作失败:`, error);
            // Previously this branch fell through and returned undefined as the
            // next page number; restart explicitly from the first page instead.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${this.key}-增量爬取失败:`, error);
    }
  }

  /**
   * Fetches one page and keeps the rows whose title matches the keywords and
   * whose quote end time has not passed yet.
   *
   * @param {number} [pagenumber=1] - Page to request.
   * @returns {Promise<{pages: number, info: Array<object>, stopFlag: boolean}>}
   *   `pages` is derived from the reported total, `info` holds the matching
   *   rows, `stopFlag` is true when any row on the page has already expired.
   *   On a request failure the result is `{ pages: 0, info: [], stopFlag: false }`.
   */
  async getInfo(pagenumber = 1) {
    console.log(`${this.key}-正在获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber);
    if (error) {
      // Request failed: log it and report an empty page.
      console.error(`${this.key}-获取页面数据失败:`, error);
      return { pages: 0, info: [], stopFlag: false };
    }

    const { total, rows } = payload.res;
    const pages = Math.ceil(total / PICC_PAGE_SIZE);
    // Tolerate a missing/non-array `rows` instead of throwing on `.length`.
    const items = Array.isArray(rows) ? rows : [];
    const now = Date.now();
    const info = [];
    let stopFlag = false;

    for (const item of items) {
      const endTime = timestampToDate(
        new Date(item.quoteEndTime).getTime(),
        true
      );
      // NaN when there is no end time; NaN fails both comparisons below.
      const endMs = endTime ? +new Date(endTime) : NaN;

      // Keyword hit and still open.
      if (keywordsInclude(item.title) && endMs >= now) {
        info.push({
          id: item.url,
          name: item.title,
          // Prefer the quote begin time, falling back to the publish date.
          publishTime: timestampToDate(
            new Date(item.quoteBeginTime || item.publishDate).getTime(),
            true
          ),
          endTime: endTime,
          urls: `https://ec.picc.com/cms/default/webfile${item.url}`,
        });
      }

      // An end time earlier than now tells the full fetch to stop paging.
      if (endMs < now) {
        stopFlag = true;
      }
    }

    return { pages, info, stopFlag };
  }

  /**
   * Requests one page of the announcement list.
   *
   * Never rejects: resolves to an `[error, result]` pair where exactly one
   * side is non-null. `result` is the response body (`res.data`).
   *
   * @param {number} pagenumber - Page number sent as `pageNo`.
   * @returns {Promise<[unknown, null] | [null, object]>}
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: "https://ec.picc.com/cms/api/dynamicData/queryContentPage",
        data: {
          dto: {
            categoryId: this.categoryId,
            city: "",
            county: "",
            purchaseMode: "",
            siteId: "725",
          },
          pageNo: pagenumber,
          pageSize: PICC_PAGE_SIZE,
        },
        method: "post",
        // NOTE(review): these headers look like they were copied from a browser
        // session, including a hard-coded Cookie value — confirm it does not
        // expire, and that the HTTP client can decode every advertised encoding.
        headers: {
          'Accept': 'application/json, text/javascript, */*; q=0.01',
          'Accept-Encoding': 'gzip, deflate, br, zstd',
          'Accept-Language': 'zh-CN,zh;q=0.9',
          'Connection': 'keep-alive',
          'Content-Type': 'application/json; charset=UTF-8',
          'Cookie': 'G_rbec_47_11_8080=22685.52745.19855.0000',
          'Origin': 'https://ec.picc.com',
          'Referer': 'https://ec.picc.com/cms/default/webfile/ywgg1/index.html',
          'Sec-Fetch-Dest': 'empty',
          'Sec-Fetch-Mode': 'cors',
          'Sec-Fetch-Site': 'same-origin',
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
          'X-Requested-With': 'XMLHttpRequest',
          'Sec-Ch-Ua': '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
          'Sec-Ch-Ua-Mobile': '?0',
          'Sec-Ch-Ua-Platform': "macOS",
        },
      });

      const result = res.data;
      const accepted =
        Boolean(result) &&
        result.msg === "操作成功" &&
        result.code === 0 &&
        Boolean(result.res);
      if (!accepted) {
        // Validate before touching `result.res` so a rejected payload is
        // reported with the server's own code/msg rather than a TypeError.
        const code = result ? result.code : undefined;
        const msg = result ? result.msg : undefined;
        return [
          new Error(`PICC list API returned an unexpected response (code=${code}, msg=${msg})`),
          null,
        ];
      }

      const rows = Array.isArray(result.res.rows) ? result.res.rows : [];
      console.log(
        `${this.key}-then`,
        JSON.stringify(rows.map((item) => item.title), null, 2)
      );
      return [null, result];
    } catch (err) {
      console.log(`${this.key}-catch`, err);
      return [err, null];
    }
  }
}
|
||
|
||
// Start one crawler per channel. Each entry is [log label, categoryId string];
// the labels translate to "centralized procurement" and "decentralized
// procurement", and the second value is passed to PICC as its categoryId.
const piccChannels = [
  ["集中采购", "211,213,214,215,216,217"],
  ["分散采购", "251,253,254,255,256,257"],
];

for (const [channelKey, categoryIds] of piccChannels) {
  new PICC(channelKey, categoryIds);
}
|