// insurance-spider/chery.js
//
// 252 lines
// 7.3 KiB
// JavaScript

import axios from "axios";
import fs from "fs";
import path from "path";
import {
timestampToDate,
loopCall,
keywordsInclude,
// addToMessageQueue,
} from "./utils.js";
import config from "./config.js";
import { SQLiteMessageQueue } from "./sqlite.js";
// import { messageQueue } from "./msgManager.js";
// import cheerio from "cheerio";
/**
 * Crawler for announcements published on the Chery bidding portal
 * (ebd.mychery.com).
 *
 * Every entry of `jsonMap` describes one announcement category. On start-up,
 * a category that already has stored announcements goes straight to
 * incremental polling; otherwise a full crawl runs first and hands over to
 * incremental polling once it completes.
 *
 * NOTE: constructing an instance starts crawling immediately.
 */
class Chery {
  constructor() {
    const listUrl =
      "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage";
    // `name` is the key under which announcements are stored in the queue,
    // `info` accumulates records during the full crawl, and `options` is
    // handed to getInfo()/getList() as the request configuration.
    this.jsonMap = [
      {
        name: "奇瑞采购公告",
        info: [],
        options: {
          name: "采购公告",
          url: listUrl,
          categoryId: "5035",
          siteId: "747",
        },
      },
      {
        name: "奇瑞寻源预告",
        info: [],
        options: {
          name: "寻源预告",
          url: listUrl,
          categoryId: "965901485789413376",
          siteId: "747",
        },
      },
      {
        name: "奇瑞变更公告",
        info: [],
        options: {
          name: "变更公告",
          url: listUrl,
          categoryId: "5032",
          siteId: "747",
        },
      },
    ];
    console.log("奇瑞 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /**
   * Runs init() and logs (rather than propagates) any failure, so the
   * un-awaited call made from the constructor can never reject.
   *
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode for every category: incremental polling when the
   * queue already holds announcements for it, a full crawl otherwise.
   *
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: pages through the category until `stopWhen` is satisfied,
   * stores and announces everything collected, then switches to
   * incremental polling.
   *
   * @param {{name: string, info: Array<object>, options: object}} props
   *   One entry of `jsonMap`.
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        // Stop on the reported last page or on the configured page limit.
        stopWhen: (pagenumber, result) => {
          return (
            pagenumber >= result.pages || pagenumber >= config.pageNumberLimit
          );
        },
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        // NOTE(review): both readyForNext and complete append result.info;
        // whether the final page can be appended twice depends on how
        // loopCall sequences these callbacks - confirm against utils.js.
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            this.queue.saveAnnouncements(props.name, props.info);
            this.queue.addMessage(props.name, props.info);
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling: repeatedly fetches pages and stores/announces the
   * records the queue does not know yet.
   *
   * The page number returned by `readyForNext` is:
   *   - `pagenumber + 1` when every record on the page was new (older pages
   *     may hold more unseen records),
   *   - `1` when the page held some or no new records, and
   *   - `1` when a database operation failed (previously `undefined` was
   *     returned here, leaving the next page number undefined).
   *
   * @param {{name: string, info: Array<object>, options: object}} props
   *   One entry of `jsonMap`.
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // Only keep paging while whole pages are unseen.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number so polling can go on.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetches one list page and converts its rows into announcement records.
   *
   * A row is kept only when it has a deadline, its title passes
   * `keywordsInclude`, and the deadline has not passed yet. Rows with
   * missing fields are skipped or get a `null` publish time instead of
   * aborting the whole page.
   *
   * @param {number} [pagenumber=1] Page to fetch.
   * @param {{name: string, url: string, categoryId: string, siteId: string}} config
   *   Category request options (this parameter shadows the imported
   *   `config` module inside the method).
   * @returns {Promise<{pages: number, info: Array<object>}>}
   *   `pages` is always 30; the page count of the response is not read.
   *   NOTE(review): presumably an upper bound for the full crawl - confirm.
   */
  async getInfo(pagenumber = 1, config) {
    const reportedPages = 30;
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: reportedPages, info: [] };
    }

    const rows = payload && payload.res ? payload.res.rows : undefined;
    if (!Array.isArray(rows)) {
      console.error(
        "获取页面数据失败:",
        new Error("response does not contain a res.rows array")
      );
      return { pages: reportedPages, info: [] };
    }

    const info = [];
    for (const row of rows) {
      const record = this.parseRow(row, config);
      if (record) {
        info.push(record);
      }
    }
    return { pages: reportedPages, info };
  }

  /**
   * Converts one raw list row into an announcement record.
   *
   * @param {object} item Raw row from the list API.
   * @param {{categoryId: string}} config Category request options.
   * @returns {?object} The record, or `null` when the row is filtered out.
   */
  parseRow(item, config) {
    if (!item) {
      return null;
    }

    let endTime;
    let publishTime;
    if (config.categoryId === "965901485789413376") {
      // For this category the deadline is read out of the announcement body
      // (`text`) and the publish time comes from `publishDate`.
      publishTime = this.normalizeTimestamp(item.publishDate);
      endTime = this.extractDeadlineTime(item.text);
    } else {
      endTime = this.normalizeTimestamp(item.signUpEndTime);
      publishTime = this.normalizeTimestamp(item.signUpBeginTime);
    }

    const isRelevant =
      endTime &&
      keywordsInclude(item.title) &&
      +new Date(endTime) >= Date.now();
    if (!isRelevant) {
      return null;
    }

    return {
      id: item.url,
      name: item.title,
      publishTime: publishTime,
      endTime: endTime,
      urls: `https://ebd.mychery.com/cms${item.url}`,
    };
  }

  /**
   * Turns a timestamp such as "2024-01-02T03:04:05.000" into
   * "2024-01-02 03:04:05".
   *
   * @param {*} value Raw timestamp from the API.
   * @returns {?string} The reformatted string, or `null` for non-strings.
   */
  normalizeTimestamp(value) {
    if (typeof value !== "string") {
      return null;
    }
    return value.replace("T", " ").split(".")[0];
  }

  /**
   * Requests one page (10 rows) of a category list.
   *
   * Never rejects: the outcome is reported as an `[error, result]` pair in
   * which exactly one element is non-null.
   *
   * @param {number} pagenumber Page to request.
   * @param {{url: string, categoryId: string, siteId: string}} config
   *   Category request options.
   * @returns {Promise<Array>} `[null, body]` when the body's `code` is 0,
   *   otherwise `[error, null]`.
   */
  async getList(pagenumber, config) {
    try {
      const res = await axios({
        url: config.url,
        method: "post",
        // Without a timeout a stalled connection would never settle.
        timeout: 30000,
        data: {
          dto: {
            bidType: "",
            categoryId: config.categoryId,
            city: "",
            county: "",
            province: "",
            purchaseMode: "",
            secondCompanyId: "",
            siteId: config.siteId,
          },
          pageNo: pagenumber,
          pageSize: "10",
        },
      });
      const result = res.data;
      if (result && result.code === 0) {
        return [null, result];
      }
      const code = result ? result.code : "missing body";
      return [new Error(`Unexpected response code: ${code}`), null];
    } catch (err) {
      return [err || new Error("Request failed"), null];
    }
  }

  /**
   * Extracts the sign-up deadline that follows the "预告报名截止时间" label
   * in an announcement body.
   *
   * The label may be followed by a half-width or full-width colon and
   * optional whitespace.
   *
   * @param {*} html Announcement body.
   * @returns {?string} The deadline as "YYYY-MM-DD HH:mm:ss", or `null` when
   *   `html` is not a string or holds no deadline.
   */
  extractDeadlineTime(html) {
    if (typeof html !== "string") {
      return null;
    }
    const regex =
      /预告报名截止时间[::]\s*(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/;
    const match = html.match(regex);
    return match ? match[1] : null;
  }
}
// Instantiate the crawler when this module is loaded; the constructor itself
// calls start(), which begins crawling every category listed in jsonMap.
new Chery();