238 lines
7.1 KiB
JavaScript
238 lines
7.1 KiB
JavaScript
import axios from "axios";
|
||
import fs from "fs";
|
||
import path from "path";
|
||
import { timestampToDate, loopCall } from "./utils.js";
|
||
import config from "./config.js";
|
||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||
// import cheerio from "cheerio";
|
||
// import { messageQueue } from "./msgManager.js";
|
||
|
||
// Page size requested from the list endpoint; also used to derive the page count.
const GEELY_PAGE_SIZE = 20;
// Spider key under which announcements and messages are stored in the queue.
const GEELY_SPIDER_NAME = "吉利";

/**
 * Crawler for tender announcements published on glzb.geely.com.
 *
 * On start-up it asks the SQLite-backed queue whether announcements for this
 * spider already exist. If none exist it runs a full crawl first; afterwards
 * (or immediately, when data exists) it polls for new announcements.
 * Scheduling between pages/polls is delegated to `loopCall`.
 */
class GEELY {
  /**
   * Creates the queue and immediately kicks off the crawl.
   */
  constructor() {
    this.url = "https://glzb.geely.com/gpmp/notice/listnotice";
    // Announcements accumulated during a full crawl.
    this.info = [];
    console.log("GEELY 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Constructors cannot await; start() handles its own errors, so the
    // returned promise can never reject.
    this.start();
  }

  /**
   * Entry point: runs init() and logs (never rethrows) any start-up failure.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode: incremental when the queue already holds
   * announcements for this spider, otherwise a full crawl.
   * @returns {Promise<void>}
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(GEELY_SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walks the list pages via loopCall, stores everything that was
   * collected, then switches to incremental mode.
   * @returns {Promise<void>}
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    // Start from a clean slate so a repeated full crawl cannot duplicate entries.
    this.info = [];
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        // Stop on the last page reported by the server, or at the configured
        // page cap, whichever comes first.
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        // NOTE(review): both readyForNext and complete append result.info.
        // This is only correct if loopCall hands the final page to complete
        // alone — confirm against loopCall in ./utils.js.
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            this.queue.saveAnnouncements(GEELY_SPIDER_NAME, this.info);
            this.queue.addMessage(GEELY_SPIDER_NAME, this.info);
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited (as in init) so the promise is not left floating.
    await this.increment();
  }

  /**
   * Incremental crawl: repeatedly fetches list pages and stores/announces only
   * the announcements the queue reports as new.
   * @returns {Promise<void>}
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              GEELY_SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: go back to the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(GEELY_SPIDER_NAME, newInfo);
            this.queue.addMessage(GEELY_SPIDER_NAME, newInfo);
            // A page made up entirely of new items means older pages may hold
            // more unseen items; a partially known page means we caught up.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a DB error.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetches one list page and resolves attachment URLs for every announcement
   * that has not expired and was published within the last 30 days.
   * @param {number} [pagenumber=1] 1-based page to fetch.
   * @returns {Promise<{pages: number, info: Array<{id: *, name: *, publishTime: *, endTime: *, urls: Array}>}>}
   *   `pages` is the total page count derived from the server's `total`;
   *   `{ pages: 0, info: [] }` is returned when the list request fails.
   */
  async getInfo(pagenumber = 1) {
    // Local midnight today, as epoch milliseconds.
    const today = new Date().setHours(0, 0, 0, 0);
    const beforeOneMonth = today - 30 * 24 * 60 * 60 * 1000;
    console.log(`正在获取第 ${pagenumber} 页数据...`);

    const [listError, listResult] = await this.getList(pagenumber);
    if (listError) {
      console.error("获取页面数据失败:", listError);
      return { pages: 0, info: [] };
    }

    // Tolerate a payload without data/items/total instead of throwing.
    const data = listResult.data ?? {};
    const items = Array.isArray(data.items) ? data.items : [];
    const pages = Math.ceil((Number(data.total) || 0) / GEELY_PAGE_SIZE);

    const info = [];
    // Detail requests are issued one announcement at a time, as before.
    for (const item of items) {
      const isCurrent =
        item.endtime >= today && item.publishtime >= beforeOneMonth;
      if (!isCurrent) {
        continue;
      }
      console.log("处理项目:", item.pjtnoticeid, item.pjtnoticename);
      const [noticeError, urls] = await this.getNoticeUrl(item.pjtnoticeid);
      if (noticeError) {
        // Skip this announcement but keep processing the rest of the page.
        console.error("获取公告详情失败:", noticeError);
        continue;
      }
      info.push({
        id: item.pjtnoticeid,
        name: item.pjtnoticename,
        publishTime: timestampToDate(item.publishtime),
        endTime: timestampToDate(item.endtime),
        urls,
      });
    }
    return { pages, info };
  }

  /**
   * Requests one page of the announcement list.
   * @param {number} pagenumber 1-based page to request.
   * @returns {Promise<[*, null] | [null, object]>} `[error, null]` on failure,
   *   `[null, responseBody]` when the body's `code` is "success". Never rejects.
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: this.url,
        params: {
          pagesize: GEELY_PAGE_SIZE,
          pagenumber,
          publishstatus: 2,
          bidcategoryid: 1442,
          iflongpro: 0,
          _: Date.now(),
        },
        method: "get",
      });
      const result = res.data;
      if (result.code === "success") {
        return [null, result];
      }
      return [new Error(`公告列表接口返回非成功状态: ${result.code}`), null];
    } catch (err) {
      return [err, null];
    }
  }

  /**
   * Loads one announcement and resolves a preview URL for each attachment.
   * Attachments whose preview request fails are left out of the result.
   * @param {*} id Announcement id (`pjtnoticeid`).
   * @returns {Promise<[*, null] | [null, Array]>} `[error, null]` on failure,
   *   `[null, urls]` otherwise. Never rejects.
   */
  async getNoticeUrl(id) {
    try {
      const res = await axios({
        url: "https://glzb.geely.com/gpmp/notice/query",
        method: "get",
        // Let axios build (and encode) the query string.
        params: { _: Date.now(), pjtnoticeid: id },
      });
      const result = res.data;
      if (result.code !== "success") {
        return [new Error(`公告详情接口返回非成功状态: ${result.code}`), null];
      }

      // An announcement without attachments yields an empty URL list.
      const attachs = result.data?.attachs ?? [];
      const settled = await Promise.allSettled(
        attachs.map((item) =>
          axios({
            url: "https://glzb.geely.com/pub/file/info/preview",
            method: "get",
            params: {
              name: item.attachname,
              downloadUrl: item.downloadUrl,
              previewUrl: item.previewUrl,
              attachname: item.attachname,
              _: Date.now(),
            },
          })
        )
      );

      const urls = [];
      for (const outcome of settled) {
        if (outcome.status === "rejected") {
          console.warn("获取附件预览地址失败:", outcome.reason);
          continue;
        }
        // A malformed preview body only drops that attachment.
        const body = outcome.value?.data;
        if (body?.code === "success") {
          urls.push(body.data);
        }
      }
      return [null, urls];
    } catch (err) {
      console.log("err:", err);
      return [err, null];
    }
  }
}
|
||
|
||
// Instantiate the crawler when this module is loaded; the constructor itself
// starts the crawl, so the instance does not need to be kept.
new GEELY();
|