252 lines
7.3 KiB
JavaScript
252 lines
7.3 KiB
JavaScript
import axios from "axios";
|
|
import fs from "fs";
|
|
import path from "path";
|
|
import {
|
|
timestampToDate,
|
|
loopCall,
|
|
keywordsInclude,
|
|
// addToMessageQueue,
|
|
} from "./utils.js";
|
|
import config from "./config.js";
|
|
import { SQLiteMessageQueue } from "./sqlite.js";
|
|
// import { messageQueue } from "./msgManager.js";
|
|
// import cheerio from "cheerio";
|
|
|
|
/**
 * Crawler for announcements published on the Chery e-bidding site
 * (ebd.mychery.com).
 *
 * Three feeds are tracked (see `jsonMap`). For each feed the crawler runs a
 * one-off full crawl when the queue holds no stored announcements for it, and
 * otherwise goes straight to incremental polling. Matching announcements are
 * persisted and announced through `SQLiteMessageQueue`.
 *
 * NOTE(review): `loopCall`, `keywordsInclude`, `config` and
 * `SQLiteMessageQueue` are defined in other modules; the callback contracts
 * described below are inferred from how they are used here — confirm against
 * ./utils.js and ./sqlite.js.
 */
class Chery {
  /**
   * Builds the feed table, opens the message queue and starts crawling.
   * Crawling is started as a side effect of construction.
   */
  constructor() {
    const listUrl =
      "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage";

    // One entry per feed. `name` is the key used with the queue, `info`
    // accumulates the results of the full crawl, `options` is handed to
    // getInfo/getList for every request.
    this.jsonMap = [
      {
        name: "奇瑞采购公告",
        info: [],
        options: {
          name: "采购公告",
          url: listUrl,
          categoryId: "5035",
          siteId: "747",
        },
      },
      {
        name: "奇瑞寻源预告",
        info: [],
        options: {
          name: "寻源预告",
          url: listUrl,
          categoryId: "965901485789413376",
          siteId: "747",
        },
      },
      {
        name: "奇瑞变更公告",
        info: [],
        options: {
          name: "变更公告",
          url: listUrl,
          categoryId: "5032",
          siteId: "747",
        },
      },
    ];

    console.log("奇瑞 爬虫启动...");
    this.queue = new SQLiteMessageQueue();

    // Not awaited on purpose: start() catches and logs its own errors, so the
    // returned promise never rejects.
    this.start();
  }

  /**
   * Runs init() and logs any error it throws instead of propagating it.
   *
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode for every feed: incremental when the queue already
   * holds announcements for the feed, full otherwise.
   *
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: walks the list pages from page 1, collecting matches into
   * `props.info`, then stores them, queues a message and switches the feed to
   * incremental polling.
   *
   * @param {{name: string, info: object[], options: object}} props
   *   One entry of `jsonMap`.
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        // Stop at the reported page count or at the configured page limit,
        // whichever is reached first.
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            this.queue.saveAnnouncements(props.name, props.info);
            this.queue.addMessage(props.name, props.info);
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          // Keep monitoring the feed even when persisting failed.
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental crawl: polls the feed and stores/announces only the
   * announcements the queue does not know yet.
   *
   * @param {{name: string, info: object[], options: object}} props
   *   One entry of `jsonMap`.
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        // Polling interval comes from config (the original note says 5 minutes).
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );

            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: start again from the first page
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);

            // A page made up entirely of new items means older pages may hold
            // more of them; a partly-known page means we have caught up.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a page number so a database error does not
            // leave the next page undefined; restart from the first page.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetches one list page and keeps the rows that have a deadline, match the
   * keyword filter and are not yet expired.
   *
   * Rows with missing or malformed fields are skipped (no usable deadline) or
   * kept with `publishTime: null` instead of failing the whole page.
   *
   * @param {number} [pagenumber=1] Page to fetch.
   * @param {{name: string, url: string, categoryId: string, siteId: string}} config
   *   Feed options. Note: this parameter shadows the imported `config` module
   *   inside this method.
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is a fixed 30
   *   (it is not read from the response); `info` holds
   *   `{id, name, publishTime, endTime, urls}` records.
   */
  async getInfo(pagenumber = 1, config) {
    const pages = 30;
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);

    const [error, payload] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages, info: [] };
    }

    const rows = payload && payload.res ? payload.res.rows : undefined;
    if (!Array.isArray(rows)) {
      console.error(
        "获取页面数据失败:",
        new Error("response does not contain a res.rows array")
      );
      return { pages, info: [] };
    }

    // This feed carries its deadline inside the announcement text rather than
    // in a dedicated field.
    const isSourcingPreview = config.categoryId === "965901485789413376";
    const info = [];

    for (const item of rows) {
      if (item === null || typeof item !== "object") {
        continue;
      }

      const publishTime = this.toDateTimeString(
        isSourcingPreview ? item.publishDate : item.signUpBeginTime
      );
      const endTime = isSourcingPreview
        ? this.extractDeadlineTime(item.text)
        : this.toDateTimeString(item.signUpEndTime);

      const isOpen =
        Boolean(endTime) && new Date(endTime).getTime() >= Date.now();
      if (isOpen && keywordsInclude(item.title)) {
        info.push({
          id: item.url,
          name: item.title,
          publishTime,
          endTime,
          urls: `https://ebd.mychery.com/cms${item.url}`,
        });
      }
    }

    return { pages, info };
  }

  /**
   * Requests one page of the announcement list.
   *
   * Never rejects: the outcome is reported as an `[error, result]` pair.
   *
   * @param {number} pagenumber Page to request.
   * @param {{url: string, categoryId: string, siteId: string}} config
   *   Feed options.
   * @returns {Promise<[Error|null, object|null]>} `[null, body]` when the
   *   response body has `code === 0`, otherwise `[error, null]`.
   */
  async getList(pagenumber, config) {
    try {
      const res = await axios({
        url: config.url,
        method: "post",
        data: {
          dto: {
            bidType: "",
            categoryId: config.categoryId,
            city: "",
            county: "",
            province: "",
            purchaseMode: "",
            secondCompanyId: "",
            siteId: config.siteId,
          },
          pageNo: pagenumber,
          pageSize: "10",
        },
      });

      const result = res.data;
      if (result.code === 0) {
        return [null, result];
      }
      // Report the rejected code instead of an anonymous "err" string.
      return [new Error(`unexpected response code: ${result.code}`), null];
    } catch (err) {
      return [err, null];
    }
  }

  /**
   * Converts an ISO-like timestamp ("2024-01-02T03:04:05.000") to
   * "2024-01-02 03:04:05" by replacing the "T" and dropping everything from
   * the first "." onwards.
   *
   * @param {*} value Raw field value from a list row.
   * @returns {string|null} The reformatted text, or null when `value` is not
   *   a string.
   */
  toDateTimeString(value) {
    if (typeof value !== "string") {
      return null;
    }
    return value.replace("T", " ").split(".")[0];
  }

  /**
   * Pulls the sign-up deadline out of an announcement body: the
   * "YYYY-MM-DD HH:MM:SS" value that directly follows the label
   * "预告报名截止时间:".
   *
   * @param {*} html Announcement text.
   * @returns {string|null} The deadline, or null when `html` is not a string
   *   or contains no such label.
   */
  extractDeadlineTime(html) {
    if (typeof html !== "string") {
      return null;
    }
    const regex = /预告报名截止时间:(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/;
    const match = html.match(regex);
    return match ? match[1] : null;
  }
}
|
|
|
|
// Instantiate the crawler when this module is loaded; the constructor itself
// calls start(), so crawling begins as a side effect of construction.
new Chery();
|