insurance-spider/youzhicai.js

407 lines
12 KiB
JavaScript

import axios from "axios";
import fs from "fs";
import path from "path";
import JSON5 from "json5";
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
import config from "./config.js";
import { SQLiteMessageQueue } from "./sqlite.js";
import * as cheerio from "cheerio";
/**
 * Crawler for announcement listings on www.youzhicai.com.
 *
 * For every entry in `jsonMap` the crawler either performs a full crawl (when
 * the queue holds no announcements for that spider yet) or goes straight to
 * incremental polling. Matching announcements are persisted and published
 * through `SQLiteMessageQueue`.
 *
 * Constructing an instance starts crawling immediately.
 */
class YouZhiCai {
  /**
   * @param {Array<{name: string, info: Array<object>, options: {name: string, url: string, data: object}}>} jsonMap
   *   Spider definitions. `name` is the key used with the queue, `info`
   *   accumulates results of the full crawl, and `options` is handed to
   *   `getInfo` / `getList` for every page request.
   */
  constructor(jsonMap) {
    // Cookie jar shared by both interceptors below: name -> value.
    this.cookiePair = new Map();
    this.jsonMap = jsonMap;

    this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 });
    // Attach every known cookie to each outgoing request.
    this.axiosInstance.interceptors.request.use((requestConfig) => {
      requestConfig.headers.Cookie = Array.from(this.cookiePair.entries())
        .map(([name, value]) => `${name}=${value}`)
        .join("; ");
      return requestConfig;
    });
    // Remember cookies the server sets on successful responses.
    this.axiosInstance.interceptors.response.use(
      (response) => {
        this.extractCookie(response.headers["set-cookie"] || []);
        return response;
      },
      (error) => Promise.reject(error)
    );

    console.log("优质采 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited: start() handles and logs its own failures.
    this.start();
  }

  /**
   * Entry point invoked by the constructor; logs instead of throwing.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Choose the crawl mode for every configured spider: incremental when the
   * queue already holds announcements for it, otherwise a full crawl.
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Internal helper: browser-like headers sent with every request to the
   * site. A fresh object is returned on each call so callers may modify it.
   * @returns {Object<string, string>}
   */
  buildRequestHeaders() {
    return {
      Accept: "text/plain, */*; q=0.01",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "no-cache",
      "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      Origin: "https://www.youzhicai.com",
      Pragma: "no-cache",
      Priority: "u=1, i",
      Referer: "https://www.youzhicai.com/s/1_1_0_0_.html",
      "Sec-Ch-Ua":
        '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "empty",
      "Sec-Fetch-Mode": "cors",
      "Sec-Fetch-Site": "same-origin",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest",
    };
  }

  /**
   * Request the listing page so the server can issue fresh cookies. The
   * response body is not used; cookies are captured by the response
   * interceptor installed in the constructor.
   * @returns {Promise<void>}
   * @throws Rethrows the request error after logging it.
   */
  async initializeCookie() {
    try {
      await this.axiosInstance.get("https://www.youzhicai.com/s/1_1_0_0_.html", {
        headers: this.buildRequestHeaders(),
      });
    } catch (err) {
      console.log("err", err);
      throw err;
    }
  }

  /**
   * Store the name/value pair of each Set-Cookie header line in the jar.
   * Attributes after the first ";" (Path, HttpOnly, ...) are ignored.
   * @param {string[]} cookieArr Raw Set-Cookie header values.
   */
  extractCookie(cookieArr) {
    for (const cookie of cookieArr) {
      const pair = cookie.split(";")[0];
      // Split on the FIRST "=" only: the value itself may contain "="
      // (for example base64 padding) and must not be truncated.
      const separatorIndex = pair.indexOf("=");
      if (separatorIndex <= 0) {
        continue; // no "=" or empty name: nothing usable to store
      }
      const key = pair.slice(0, separatorIndex).trim();
      if (!key) {
        continue;
      }
      this.cookiePair.set(key, pair.slice(separatorIndex + 1).trim());
    }
  }

  /**
   * Full crawl: walk the listing pages, collect matching announcements into
   * `props.info`, persist them, then switch to incremental polling.
   * NOTE(review): the callback contract (stopWhen / readyForNext / complete)
   * belongs to `loopCall` in ./utils.js, which is not visible here.
   * @param {{name: string, info: Array<object>, options: object}} props
   */
  loopFetchFull(props) {
    console.log("开始全量爬取");
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        // Stop at the last page reported by getInfo or at the configured cap.
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling: repeatedly fetch pages and persist/publish only the
   * announcements the queue has not seen yet.
   * @param {{name: string, info: Array<object>, options: object}} props
   */
  loopFetchIncrement(props) {
    console.log("开始增量爬取");
    try {
      loopCall(this.getInfo.bind(this), {
        // Polling interval comes from config (the original note says 5 minutes).
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // restart from the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // Every item was new: the next page may hold more new items.
            // Otherwise known items were reached, so restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number, even after a failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Internal helper: derive the number of crawlable pages from the total
   * count text shown on the listing page (15 items per page). The result is
   * capped at 2 because, per the original author's note, later pages require
   * a captcha.
   * @param {string} totalText Text content of the total-count element.
   * @returns {number} 0 when no number can be read, otherwise 1 or 2.
   */
  computePageCount(totalText) {
    const total = Number.parseInt(String(totalText).replace(/\D/g, ""), 10);
    if (Number.isNaN(total)) {
      // Previously a non-numeric text produced NaN pages, which can never
      // satisfy the `pagenumber >= result.pages` stop condition.
      return 0;
    }
    return Math.min(Math.ceil(total / 15), 2);
  }

  /**
   * Fetch one listing page and extract the announcements whose title passes
   * `keywordsInclude`.
   * @param {number} [pagenumber=1] 1-based page index.
   * @param {{name: string, url: string, data: object}} config Spider options.
   * @returns {Promise<{pages: number, info: Array<{id: string, name: string, publishTime: string, endTime: string, urls: string}>}>}
   *   `{ pages: 0, info: [] }` when the page could not be fetched.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, html] = await this.getList(pagenumber, config);
    if (error) {
      // The failure is logged and reported to the caller as an empty page.
      console.error("获取页面数据失败: ", error);
      return { pages: 0, info: [] };
    }

    const $ = cheerio.load(html);
    const pages = this.computePageCount(
      $("#recommendMsg .info-num-value").text()
    );
    const info = [];
    $(".project-li").each((index, element) => {
      const link = $(element).find(".project-name0");
      const id = link.attr("href");
      const name = link.attr("title");
      const publishTime = $(element).find(".pub-value0").text();
      const leftDay = $(element).find(".left-day .emOrange:eq(0)").text();
      // End date = publish date + remaining days shown on the page.
      const endTime = new Date(
        +new Date(publishTime) + leftDay * 24 * 60 * 60 * 1000
      ).toLocaleDateString();
      const urls = "https://www.youzhicai.com" + id;
      if (keywordsInclude(name)) {
        console.log("处理项目:", name, publishTime, endTime);
        info.push({ id, name, publishTime, endTime, urls });
      }
    });
    return { pages, info };
  }

  /**
   * POST the search form for one page. When the request fails, the cookies
   * are refreshed via `initializeCookie` and the request is retried once.
   * @param {number} pagenumber 1-based page index sent as `PageIndex`.
   * @param {{url: string, data: object}} config Spider options; `data` is
   *   the form payload template and is not modified.
   * @returns {Promise<[null, string] | [Error, null]>} `[error, body]` tuple.
   */
  async getList(pagenumber, config) {
    // Work on a copy so the caller-owned template is never mutated.
    const data = { ...config.data, PageIndex: pagenumber };
    const token = this.cookiePair.get("__RequestVerificationToken");
    if (token) {
      data.__RequestVerificationToken = token;
    }
    const headers = this.buildRequestHeaders();
    const send = () =>
      this.axiosInstance({ url: config.url, data, method: "post", headers });

    try {
      const response = await send();
      return [null, response.data];
    } catch (err) {
      console.log("cookie不对");
      try {
        await this.initializeCookie();
        data.__RequestVerificationToken = this.cookiePair.get(
          "__RequestVerificationToken"
        );
        const retryResponse = await send();
        return [null, retryResponse.data];
      } catch (retryErr) {
        return [retryErr, null];
      }
    }
  }
}
/**
 * Build the spider definition for one announcement category.
 *
 * All categories share the same search form; only the display name and the
 * `MsNoticeType` value differ. A fresh object is returned on every call so
 * each crawler instance owns its own `info` list and form payload.
 *
 * @param {string} name Spider name, used both as the queue key and in logs.
 * @param {number} noticeType Value sent as `MsNoticeType` in the search form.
 * @returns {{name: string, info: Array<object>, options: {name: string, url: string, data: object}}}
 */
function createSpiderEntry(name, noticeType) {
  return {
    name,
    info: [],
    options: {
      name,
      url: "https://www.youzhicai.com/s/1_1_0_0_.html",
      // Key order matches the original payload; the duplicated `MsProvince`
      // key has been removed (both occurrences held the same empty string).
      data: {
        MsProvince: "",
        MsCity: "",
        MsStartDate: "",
        MsEndDate: "",
        AutoOr: 0,
        BackOr: 0,
        NoticeTitle: "",
        searchAccuracy: "precise",
        matchType: "precise",
        TenderType: "",
        MsBidderType: 1,
        MsNoticeType: noticeType,
        MsPublishType: 0,
        MsSingUpType: 1,
        MsSort: 2,
        PageIndex: 1,
        PageSize: 15,
        AgencyId: "",
        SecondSearch: "",
        SecondSearchType: "",
        TotalSize: 10000,
        SearchRange: 3,
        year: "",
        key1: "",
        key2: "",
        key3: "",
      },
    },
  };
}

// One crawler instance per announcement category; each starts on construction.
new YouZhiCai([createSpiderEntry("优质采【招标公告】", 1)]);
new YouZhiCai([createSpiderEntry("优质采【澄清/变更公告】", 5)]);
new YouZhiCai([createSpiderEntry("优质采【招标项目计划】", 7)]);