386 lines
11 KiB
JavaScript
386 lines
11 KiB
JavaScript
import axios from "axios";
|
|
import fs from "fs";
|
|
import path from "path";
|
|
import JSON5 from "json5";
|
|
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
|
import config from "./config.js";
|
|
import { SQLiteMessageQueue } from "./sqlite.js";
|
|
|
|
// Origin of the JiangHuai tender portal; reused for headers and detail links.
const JIANGHUAI_ORIGIN = "https://ahjhqc.youzhicai.com";

// Notice list page: fetched to obtain cookies and the CSRF token, and sent as Referer.
const JIANGHUAI_NOTICE_PAGE = `${JIANGHUAI_ORIGIN}/homeindex/noticeListNew.html?type=1`;

// Browser User-Agent presented on every request.
const JIANGHUAI_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36";

// Rows per page used to turn `total` into a page count.
// NOTE(review): assumes the list endpoint always returns 10 rows per page — confirm.
const JIANGHUAI_PAGE_SIZE = 10;

/**
 * Crawler for announcement lists on the JiangHuai tender portal.
 *
 * Each entry of `jsonMap` describes one list to watch and has the shape:
 *   {
 *     name: string,          // spider name, used as the key for queue calls
 *     info: Array<object>,   // accumulator filled during the full crawl
 *     options: {
 *       name: string,        // label used in log output
 *       url: string,         // list endpoint that receives the POST
 *       data: object,        // form fields; `pageIndex` is set per request
 *     },
 *   }
 *
 * Constructing an instance starts crawling immediately.
 */
class JiangHuai {
  /**
   * @param {Array<object>} jsonMap - list descriptors (see class comment).
   */
  constructor(jsonMap) {
    this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 });

    // Attach every cookie collected so far to each outgoing request.
    this.axiosInstance.interceptors.request.use((requestConfig) => {
      requestConfig.headers.Cookie = Array.from(this.cookiePair.entries())
        .map(([name, value]) => `${name}=${value}`)
        .join("; ");
      return requestConfig;
    });

    // Remember cookies handed out by the server on each successful response.
    this.axiosInstance.interceptors.response.use(
      (response) => {
        this.extractCookie(response.headers["set-cookie"]);
        return response;
      },
      (error) => Promise.reject(error)
    );

    this.cookiePair = new Map();
    this.csrfToken = "";
    this.jsonMap = jsonMap;

    console.log("江淮 爬虫启动...");
    this.queue = new SQLiteMessageQueue();

    // Not awaited: a constructor cannot be async, and start() catches its own errors.
    this.start();
  }

  /**
   * Runs init() and logs (rather than propagates) any failure.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode for every configured list: incremental when the
   * queue already holds announcements for that spider name, full otherwise.
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Loads the notice page to pick up cookies (via the response interceptor)
   * and the CSRF token from its `<meta name="csrf-token">` tag, then requests
   * `/?cache=1` with that token.
   * @returns {Promise<void>}
   * @throws rethrows any request or parsing error after logging it.
   */
  async initializeCookie() {
    try {
      const requestConfig = {
        headers: {
          "User-Agent": JIANGHUAI_USER_AGENT,
          Accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
          "Accept-Language": "zh-CN,zh;q=0.9",
          "Cache-Control": "no-cache",
          Pragma: "no-cache",
          "Sec-Fetch-Dest": "document",
          "Sec-Fetch-Mode": "navigate",
          "Sec-Fetch-Site": "none",
          "Upgrade-Insecure-Requests": "1",
        },
      };

      const homeResponse = await this.axiosInstance.get(
        JIANGHUAI_NOTICE_PAGE,
        requestConfig
      );

      // Extract the CSRF token; the previous token is kept when the tag is absent.
      const tokenMatch = homeResponse.data.match(
        /<meta name="csrf-token" content="([^"]+)"/
      );
      if (tokenMatch) {
        this.csrfToken = tokenMatch[1];
      }
      console.log(this.csrfToken);

      requestConfig.headers["X-Csrf-Token"] = this.csrfToken;
      // The response body is not used; any cookies it sets are captured by the
      // response interceptor.
      await this.axiosInstance.get(`${JIANGHUAI_ORIGIN}/?cache=1`, requestConfig);
    } catch (err) {
      console.log("err", err);
      throw err;
    }
  }

  /**
   * Stores the `name=value` pair of each Set-Cookie header line in the jar.
   *
   * @param {string[]|string|undefined|null} cookieArr - value of the
   *   `set-cookie` response header; absent on responses that set no cookies.
   */
  extractCookie(cookieArr) {
    // Most responses carry no Set-Cookie header; that must not fail the request.
    if (cookieArr == null) {
      return;
    }
    const cookieLines = Array.isArray(cookieArr) ? cookieArr : [cookieArr];
    for (const line of cookieLines) {
      // Only the first `;`-separated segment is the cookie itself; the rest are attributes.
      const pair = String(line).split(";")[0];
      // Split on the FIRST "=" only, so values containing "=" stay intact.
      const separatorIndex = pair.indexOf("=");
      if (separatorIndex <= 0) {
        continue; // no name or no "=": nothing usable to store
      }
      const key = pair.slice(0, separatorIndex).trim();
      const value = pair.slice(separatorIndex + 1).trim();
      this.cookiePair.set(key, value);
    }
  }

  /**
   * Full crawl: walks the list page by page, collecting matches into
   * `props.info`, then hands the collected items to the queue and switches to
   * incremental mode.
   *
   * NOTE(review): both `readyForNext` and `complete` append `result.info`;
   * this presumes loopCall calls `complete` instead of `readyForNext` for the
   * last page — confirm against loopCall.
   *
   * @param {object} props - one `jsonMap` entry.
   */
  loopFetchFull(props) {
    console.log("开始全量爬取");
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        // Stop at the last reported page or at the configured page limit.
        // A failed fetch reports `pages: 0`, which also stops the loop.
        stopWhen: (pagenumber, result) => {
          return (
            pagenumber >= result.pages || pagenumber >= config.pageNumberLimit
          );
        },
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          // Keep watching for new announcements even if persisting failed.
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental crawl: repeatedly fetches the list and forwards only the
   * announcements that `queue.filterNewAnnouncements` reports as new.
   *
   * @param {object} props - one `jsonMap` entry.
   */
  loopFetchIncrement(props) {
    console.log("开始增量爬取");
    try {
      loopCall(this.getInfo.bind(this), {
        // Configured polling interval (the original author noted 5 minutes).
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );

            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: restart from the first page
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);

            // Every item on this page was new, so the next page may hold more.
            // Otherwise already-known items were reached: restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Return a definite page instead of undefined.
            // NOTE(review): assumes loopCall uses the return value as the next
            // page number — confirm.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetches one list page and keeps the items whose title passes
   * `keywordsInclude` and whose `endTime` is set and not in the past.
   *
   * @param {number} [pagenumber=1] - page to request.
   * @param {object} config - the `options` object of a `jsonMap` entry.
   * @returns {Promise<{pages: number, info: Array<object>}>} page count derived
   *   from the reported total, plus the kept items; `{ pages: 0, info: [] }`
   *   when the page could not be fetched.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);

    const [fetchError, payload] = await this.getList(pagenumber, config);
    if (fetchError) {
      // Log the failure and report an empty page.
      console.error("获取页面数据失败: ", fetchError);
      return { pages: 0, info: [] };
    }

    const pages = Math.ceil(payload.total / JIANGHUAI_PAGE_SIZE);
    const info = [];

    for (const item of payload.list) {
      const isOpen = item.endTime && +new Date(item.endTime) >= Date.now();
      if (!(keywordsInclude(item.noticeTitle) && isOpen)) {
        continue;
      }
      console.log("处理项目:", item.noticeTitle);
      info.push({
        id: item.bulletinSID,
        name: item.noticeTitle,
        publishTime: new Date(item.startTime).toLocaleDateString(),
        endTime: new Date(item.endTime).toLocaleDateString(),
        urls: `${JIANGHUAI_ORIGIN}/${item.Url}`,
      });
    }

    return { pages, info };
  }

  /**
   * POSTs the list query for one page. If the first attempt throws (request
   * or parse failure), refreshes cookies and the CSRF token and retries once.
   *
   * @param {number} pagenumber - page index sent as `pageIndex`.
   * @param {{url: string, data: object}} config - endpoint and form fields.
   * @returns {Promise<[any, object|null]>} `[null, result]` when the response
   *   has a non-empty `list`; `["err", null]` when the list is empty or
   *   missing; `[error, null]` when the retry also throws.
   */
  async getList(pagenumber, config) {
    // Send a copy so the caller's shared options object is never mutated.
    const data = { ...config.data, pageIndex: pagenumber };

    const headers = {
      Accept: "text/plain, */*; q=0.01",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "no-cache",
      "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      Origin: JIANGHUAI_ORIGIN,
      Pragma: "no-cache",
      Priority: "u=1, i",
      Referer: JIANGHUAI_NOTICE_PAGE,
      "Sec-Ch-Ua":
        '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "empty",
      "Sec-Fetch-Mode": "cors",
      "Sec-Fetch-Site": "same-origin",
      "User-Agent": JIANGHUAI_USER_AGENT,
      "X-Requested-With": "XMLHttpRequest",
      "X-Csrf-Token": this.csrfToken,
    };

    // One request + decode step, shared by the first attempt and the retry.
    const requestPage = async () => {
      const response = await this.axiosInstance({
        url: config.url,
        data,
        method: "post",
        headers,
      });
      const body = response.data;
      // The HTTP client may already have decoded the body into an object;
      // only raw text needs (lenient) JSON5 parsing.
      const result = typeof body === "string" ? JSON5.parse(body) : body;
      if (result.list && result.list.length > 0) {
        return [null, result];
      }
      return ["err", null];
    };

    try {
      return await requestPage();
    } catch (err) {
      // Any failure is treated as a stale session.
      console.log("cookie不对");
      try {
        await this.initializeCookie();
        headers["X-Csrf-Token"] = this.csrfToken;
        return await requestPage();
      } catch (retryErr) {
        return [retryErr, null];
      }
    }
  }
}
|
|
|
|
/**
 * Builds the single list descriptor handed to one JiangHuai crawler.
 * The two watched lists differ only in their display name and in the
 * `ntype` form field; every other field is the same.
 */
const buildSpiderEntry = (name, ntype) => ({
  name,
  info: [],
  options: {
    name,
    url: "https://ahjhqc.youzhicai.com/domain/data-list-new",
    data: {
      pageIndex: 1,
      type: 1,
      companyId: "",
      title: "",
      ntype,
      start_time: "",
      end_time: "",
      child: "",
      tenderType: 3,
    },
  },
});

// Each list gets its own crawler instance; constructing one starts it.
new JiangHuai([buildSpiderEntry("江淮【招标公告】", 1)]);
new JiangHuai([buildSpiderEntry("江淮【变更/澄清公告】", "4,6")]);
|