insurance-spider/picc.js

291 lines
9.6 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from "axios";
import fs from "fs";
import path from "path";
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
import config from "./config.js";
import { SQLiteMessageQueue } from "./sqlite.js";
/**
 * Spider for procurement announcements published on ec.picc.com
 * (中国人民保险).
 *
 * Constructing an instance starts it immediately: when the queue already
 * holds announcements for this spider it goes straight to the daily
 * incremental check, otherwise it first crawls page by page (full fetch) and
 * then switches to the incremental check.
 */
class PICC {
  /** Spider name passed to every SQLiteMessageQueue call. */
  static SPIDER_NAME = "中国人民保险";

  /** Rows requested per page; the page count is derived from the same value. */
  static PAGE_SIZE = 10;

  /**
   * Log hints keyed by network error code. ECONNABORTED is included because
   * axios reports its own `timeout` expiry with that code by default.
   */
  static NETWORK_ERROR_HINTS = {
    ENOTFOUND: "DNS解析失败请检查网络连接或域名是否正确",
    ECONNREFUSED: "连接被拒绝,请检查服务器是否正常运行",
    ECONNRESET: "连接被重置,请稍后重试",
    ETIMEDOUT: "请求超时,请检查网络连接",
    ECONNABORTED: "请求超时,请检查网络连接",
  };

  /**
   * Returns the next moment the clock reads `hour`:00:00.000 in local time:
   * today if that moment has not passed yet, otherwise tomorrow.
   *
   * @param {Date} [now] - Reference time.
   * @param {number} [hour] - Hour of day (0-23) of the run.
   * @returns {Date} A new Date; `now` is not modified.
   */
  static nextRunAt(now = new Date(), hour = 9) {
    const nextRun = new Date(now.getTime());
    nextRun.setHours(hour, 0, 0, 0);
    if (now > nextRun) {
      nextRun.setDate(nextRun.getDate() + 1);
    }
    return nextRun;
  }

  /**
   * @param {string} key - Label used as the prefix of every log line.
   * @param {string} categoryId - Value sent as `dto.categoryId` in the list
   *   request.
   */
  constructor(key, categoryId) {
    this.key = key;
    this.categoryId = categoryId;
    this.info = [];
    this.dailyTimer = null;
    console.log(`中国人民保险-${key} 爬虫启动...`);
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this promise is safe to leave un-awaited.
    this.start();
  }

  /** Runs init() and logs, instead of propagating, any failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error(`启动失败:-${this.key}`, err);
    }
  }

  /**
   * Chooses the crawl mode: incremental when the queue already has
   * announcements stored for this spider, full fetch otherwise.
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(PICC.SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: drives getInfo() through loopCall, accumulating every page's
   * matches in this.info, then stores and publishes them and switches to the
   * incremental mode.
   */
  async fullFetch() {
    console.log(`${this.key}-开始全量爬取...`);
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        // Stop on the last page reported by the API or at the configured cap.
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        // NOTE(review): this assumes loopCall hands the final page to
        // `complete` only (not to `readyForNext` as well) — confirm in utils.js.
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`${this.key}-爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(PICC.SPIDER_NAME, this.info);
              this.queue.addMessage(PICC.SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error(`${this.key}-数据库操作失败:`, error);
          }
          // Once the full crawl is done, switch to the incremental mode.
          this.increment().catch((error) => {
            console.error(`${this.key}-增量爬取失败:`, error.message);
          });
        },
      });
    } catch (error) {
      console.error(`${this.key}-全量爬取失败:`, error);
    }
  }

  /**
   * Incremental mode: waits until the next 09:00 (local time), runs one
   * incremental check, then hands over to the 24-hour repeating schedule.
   */
  async increment() {
    console.log(`${this.key}-开始增量爬取模式每天9点检查一次新数据...`);
    const now = new Date();
    const nextRun = PICC.nextRunAt(now);
    const timeUntilNextRun = nextRun - now;
    console.log(`${this.key}-下次执行时间: ${nextRun.toString()}`);
    setTimeout(async () => {
      try {
        console.log("setTimeout-增量执行启动");
        // The probe result is informational only; the crawl runs either way.
        await this.testNetworkConnection();
        await this.executeIncrement();
      } catch (error) {
        console.error(`${this.key}-增量爬取失败:`, error.message);
      } finally {
        // Keep the daily schedule alive whether or not this run succeeded.
        this.scheduleDailyIncrement();
      }
    }, timeUntilNextRun);
  }

  /**
   * Probes https://ec.picc.com with a HEAD request that is aborted after
   * 5 seconds.
   *
   * @returns {Promise<boolean>} true when the request resolved, false when it
   *   failed or was aborted. Never rejects.
   */
  async testNetworkConnection() {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 5000);
    try {
      await fetch('https://ec.picc.com', {
        method: 'HEAD',
        signal: controller.signal,
        mode: 'no-cors',
      });
      console.log(`${this.key}-网络连接测试通过`);
      return true;
    } catch (error) {
      console.warn(`${this.key}-网络连接测试失败:`, error.message);
      // Deliberately non-fatal: the real requests report their own errors.
      return false;
    } finally {
      // Clear on every path so a failed probe does not leave a live timer.
      clearTimeout(timeoutId);
    }
  }

  /**
   * One incremental pass: fetches page 1, keeps only the announcements the
   * queue does not know yet, and stores and publishes those.
   */
  async executeIncrement() {
    console.log(`${this.key}-开始执行增量爬取...`);
    try {
      const result = await this.getInfo(1);
      // getInfo() signals a failed request through `error`, not by throwing.
      if (!result || result.error) {
        console.log(`${this.key}-获取数据失败:`, result ? result.error : "未知错误");
        return;
      }
      try {
        const newInfo = this.queue.filterNewAnnouncements(
          PICC.SPIDER_NAME,
          result.info
        );
        if (newInfo.length > 0) {
          console.log(`${this.key}-发现 ${newInfo.length} 条新数据`);
          this.queue.saveAnnouncements(PICC.SPIDER_NAME, newInfo);
          this.queue.addMessage(PICC.SPIDER_NAME, newInfo);
        } else {
          console.log(`${this.key}-没有发现新数据,继续监控...`);
        }
      } catch (error) {
        console.error(`${this.key}-数据库操作失败:`, error);
      }
    } catch (error) {
      console.error(`${this.key}-获取数据失败:`, error.message);
      this.logNetworkErrorHint(error);
    }
  }

  /**
   * Repeats executeIncrement() every 24 hours, counted from the moment this
   * method is called. Calling it again replaces the previous interval rather
   * than stacking a second one.
   */
  scheduleDailyIncrement() {
    if (this.dailyTimer) {
      clearInterval(this.dailyTimer);
    }
    this.dailyTimer = setInterval(async () => {
      try {
        await this.executeIncrement();
      } catch (error) {
        console.error(`${this.key}-定时增量爬取失败:`, error);
      }
    }, 24 * 60 * 60 * 1000);
    console.log(`${this.key}-已设置每天9点执行增量爬取`);
  }

  /**
   * Fetches one page and keeps the rows whose title matches the keyword
   * filter and whose end time is still in the future.
   *
   * @param {number} [pagenumber] - Page to fetch.
   * @returns {Promise<{pages: number, info: object[], error?: *}>} The page
   *   count and the matching rows. When the request failed, `pages` is 0,
   *   `info` is empty and `error` carries the failure.
   */
  async getInfo(pagenumber = 1) {
    console.log(`${this.key}-正在获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber);
    if (error) {
      console.error(`${this.key}-获取页面数据失败:`, error);
      return { pages: 0, info: [], error };
    }

    const { total, rows } = payload.res;
    const pages = Math.ceil(total / PICC.PAGE_SIZE);
    const info = [];
    for (const item of rows) {
      const endTime = timestampToDate(
        new Date(item.tenderFileSaleEndTime).getTime(),
        true
      );
      const isOpen = Boolean(endTime) && +new Date(endTime) >= Date.now();
      if (!keywordsInclude(item.title) || !isOpen) {
        continue;
      }
      info.push({
        id: item.url,
        name: item.title,
        publishTime: timestampToDate(
          new Date(item.tenderFileSaleBeginTime).getTime(),
          true
        ),
        endTime,
        urls: `https://ec.picc.com/cms/default/webfile${item.url}`,
      });
    }
    return { pages, info };
  }

  /**
   * Requests one page of the announcement list.
   *
   * @param {number} pagenumber - Page to request.
   * @returns {Promise<[*, object|null]>} `[null, body]` when the API reports
   *   success and `body.res.rows` is an array, `[error, null]` otherwise.
   *   Never rejects.
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: "https://ec.picc.com/cms/api/dynamicData/queryContentPage",
        data: {
          dto: {
            categoryId: this.categoryId,
            city: "",
            county: "",
            purchaseMode: "",
            siteId: "725",
          },
          pageNo: pagenumber,
          pageSize: PICC.PAGE_SIZE,
        },
        method: "post",
        // NOTE(review): the Cookie value is hard-coded and Accept-Encoding
        // advertises zstd — confirm both are still what the server expects.
        headers: {
          'Accept': 'application/json, text/javascript, */*; q=0.01',
          'Accept-Encoding': 'gzip, deflate, br, zstd',
          'Accept-Language': 'zh-CN,zh;q=0.9',
          'Connection': 'keep-alive',
          'Content-Type': 'application/json; charset=UTF-8',
          'Cookie': 'G_rbec_47_11_8080=22685.52745.19855.0000',
          'Origin': 'https://ec.picc.com',
          'Referer': 'https://ec.picc.com/cms/default/webfile/ywgg1/index.html',
          'Sec-Fetch-Dest': 'empty',
          'Sec-Fetch-Mode': 'cors',
          'Sec-Fetch-Site': 'same-origin',
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
          'X-Requested-With': 'XMLHttpRequest',
          'Sec-Ch-Ua': '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
          'Sec-Ch-Ua-Mobile': '?0',
          'Sec-Ch-Ua-Platform': "macOS",
        },
        timeout: 10000,
      });

      const result = res.data;
      const rows = result?.res?.rows;
      // Validate before touching `rows`, so a failure payload surfaces the
      // server's own code/message instead of a TypeError.
      if (result?.msg !== "操作成功" || result?.code !== 0 || !Array.isArray(rows)) {
        return [
          new Error(`接口返回异常: code=${result?.code}, msg=${result?.msg}`),
          null,
        ];
      }
      console.log(
        `${this.key}-then`,
        JSON.stringify(rows.map((item) => item.title), null, 2)
      );
      return [null, result];
    } catch (err) {
      console.log(`${this.key}-catch`, err.message);
      this.logNetworkErrorHint(err);
      return [err, null];
    }
  }

  /**
   * Logs a hint for the network error codes listed in NETWORK_ERROR_HINTS;
   * any other error is ignored.
   *
   * @param {{code?: string}} error - Error as thrown by the HTTP client.
   */
  logNetworkErrorHint(error) {
    const code = error?.code;
    if (typeof code === "string" && Object.hasOwn(PICC.NETWORK_ERROR_HINTS, code)) {
      console.log(`${this.key}-${PICC.NETWORK_ERROR_HINTS[code]}`);
    }
  }
}
// Start one spider per procurement channel, in this order. Each entry is
// [log label, category ids sent to the list API]; constructing a PICC
// instance starts it right away.
for (const [channel, categoryIds] of [
  ["集中采购", "211,213,214,215,216,217"],
  ["分散采购", "251,253,254,255,256,257"],
]) {
  new PICC(channel, categoryIds);
}