/**
 * Crawler for the listing pages of https://www.youzhicai.com.
 *
 * NOTE(review): this file is whitespace-collapsed. The physical line breaks are
 * arbitrary (one of them falls inside the "User-Agent" string literal), and every
 * `//` comment now swallows the rest of its physical line, so the file has to be
 * re-split before it can run.
 * NOTE(review): text is missing between `homeResponse.data.match(` and
 * `{ return ( pagenumber >= result.pages`. The end of initializeCookie(), the
 * definition of extractCookie() (called by the response interceptor) and the head
 * of loopFetchFull() (called by init()) are not present in this text. Restore them
 * from version control rather than re-inventing them.
 *
 * class YouZhiCai
 *
 * constructor(jsonMap)
 *   - Creates an axios instance (timeout 30000 ms, maxRedirects 5).
 *   - Request interceptor: joins the entries of this.cookiePair as `name=value`
 *     pairs separated by "; " and writes them to the Cookie request header.
 *   - Response interceptor: passes the response's `set-cookie` header (or an empty
 *     array) to this.extractCookie(); errors are rejected unchanged.
 *   - Stores jsonMap, creates a SQLiteMessageQueue, then calls this.start()
 *     without awaiting it.
 *
 * start()
 *   Awaits init() and logs any error it throws.
 *
 * init()
 *   For each item of this.jsonMap, reads queue.getAnnouncementsBySpider(item.name);
 *   calls loopFetchIncrement(item) when that list is non-empty, otherwise
 *   loopFetchFull(item). Neither call is awaited.
 *
 * initializeCookie()
 *   Sends a GET to https://www.youzhicai.com/s/1_1_0_0_.html with browser-like
 *   headers. The remainder of the method is not visible (see note above).
 *
 * loopFetchFull(props)  -- only the tail is visible
 *   - A predicate returns true once pagenumber >= result.pages or
 *     pagenumber >= config.pageNumberLimit.
 *   - readyForNext appends result.info to props.info and returns pagenumber + 1.
 *   - complete appends result.info to props.info, and when props.info is
 *     non-empty calls queue.saveAnnouncements() and queue.addMessage(); it then
 *     calls loopFetchIncrement(props).
 *
 * loopFetchIncrement(props)
 *   Calls loopCall() with getInfo bound to this, time config.incrementFetchTime,
 *   pagenumber 1 and additional props.options. Its readyForNext callback filters
 *   result.info through queue.filterNewAnnouncements(props.name, ...); new entries
 *   are saved and enqueued. It returns pagenumber + 1 when every entry on the page
 *   was new, otherwise 1.
 *   NOTE(review): when the inner catch fires the callback returns undefined --
 *   confirm how loopCall treats that.
 *
 * getInfo(pagenumber = 1, config)
 *   Calls getList(); on error logs it and returns { pages: 0, info: [] }.
 *   Otherwise loads the HTML with cheerio, reads the text of
 *   "#recommendMsg .info-num-value" as the total, computes
 *   pages = Math.ceil(total / 15) capped at 2, and walks every ".project-li":
 *   id = href and name = title of ".project-name0", publishTime = text of
 *   ".pub-value0", endTime = publishTime plus the ".left-day .emOrange:eq(0)"
 *   text in days, formatted with toLocaleDateString(). Entries are kept only
 *   when keywordsInclude(name) is truthy. Returns { pages, info }.
 *   NOTE(review): total and leftDay are scraped text used directly in arithmetic,
 *   and toLocaleDateString() output depends on the runtime locale -- verify.
 *
 * getList(pagenumber, config)
 *   Sets PageIndex (and __RequestVerificationToken when that cookie exists) on
 *   config.data itself, then POSTs it to config.url. Returns [null, response.data]
 *   on success. On any error it calls initializeCookie(), refreshes the token and
 *   retries once, returning [retryErr, null] if the retry also fails.
 *   NOTE(review): this mutates the caller's config.data object, and every request
 *   failure is treated as a cookie problem -- confirm both are intended.
 *
 * NOTE(review): the parameter named `config` in the request interceptor, getInfo()
 * and getList() shadows the imported `config` module inside those functions.
 * NOTE(review): fs, path, JSON5 and timestampToDate are not referenced in the
 * visible text; the missing region may use them.
 *
 * Module level: three YouZhiCai instances are created, differing in `name` and in
 * MsNoticeType (1, 5 and 7).
 * NOTE(review): each `data` literal lists the key MsProvince twice (both "").
 */
import axios from "axios"; import fs from "fs"; import path from "path"; import JSON5 from "json5"; import { timestampToDate, loopCall, keywordsInclude } from "./utils.js"; import config from "./config.js"; import { SQLiteMessageQueue } from "./sqlite.js"; import * as cheerio from "cheerio"; class YouZhiCai { constructor(jsonMap) { this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 }); this.axiosInstance.interceptors.request.use((config) => { // add the cookies to the request headers const cookieString = Array.from(this.cookiePair.entries()) .map(([name, value]) => `${name}=${value}`) .join("; "); config.headers.Cookie = cookieString; return config; }); this.axiosInstance.interceptors.response.use( (response) => { // update the cookies used for the request headers let cookieArr = response.headers["set-cookie"] || []; this.extractCookie(cookieArr); return response; }, (error) => { return Promise.reject(error); } ); this.cookiePair = new Map(); // this.csrfToken = ""; this.jsonMap = jsonMap; console.log("优质采 爬虫启动..."); this.queue = new SQLiteMessageQueue(); this.start(); } async start() { try { await this.init(); } catch (err) { console.error("启动失败:", err); } } async init() { for (let item of this.jsonMap) { let announcements = this.queue.getAnnouncementsBySpider(item.name); if (announcements.length > 0) { this.loopFetchIncrement(item); } else { this.loopFetchFull(item); } } } async initializeCookie() { try { let headers = { headers: { Accept: "text/plain, */*; q=0.01", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", Origin: "https://www.youzhicai.com", Pragma: "no-cache", Priority: "u=1, i", Referer: "https://www.youzhicai.com/s/1_1_0_0_.html", "Sec-Ch-Ua": '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": '"macOS"', "Sec-Fetch-Dest": "empty", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Site": "same-origin", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 
10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", "X-Requested-With": "XMLHttpRequest", }, }; const homeResponse = await this.axiosInstance.get( "https://www.youzhicai.com/s/1_1_0_0_.html", headers ); // // extract the csrf-token // let tokenMatch = homeResponse.data.match( // / { return ( pagenumber >= result.pages || pagenumber >= config.pageNumberLimit ); }, readyForNext: (pagenumber, result) => { props.info.push(...result.info); return pagenumber + 1; }, complete: (result) => { props.info.push(...result.info); console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); try { if (props.info.length > 0) { this.queue.saveAnnouncements(props.name, props.info); this.queue.addMessage(props.name, props.info); } } catch (error) { console.error("数据库操作失败:", error); } this.loopFetchIncrement(props); }, }); } catch (error) { console.error(`${props.options.name}全量爬取失败:`, error); } } loopFetchIncrement(props) { console.log("开始增量爬取"); try { loopCall(this.getInfo.bind(this), { time: config.incrementFetchTime, // 5-minute interval pagenumber: 1, additional: props.options, readyForNext: (pagenumber, result) => { try { let newInfo = this.queue.filterNewAnnouncements( props.name, result.info ); // new data exists if (newInfo.length > 0) { console.log(`发现 ${newInfo.length} 条新数据`); // props.info.push(...newInfo); this.queue.saveAnnouncements(props.name, newInfo); // this.writeFile(props); this.queue.addMessage(props.name, newInfo); // all entries are new, continue to the next page if (newInfo.length === result.info.length) { return pagenumber + 1; } else { // some entries are duplicates, restart from the first page return 1; } } else { console.log("没有发现新数据,继续监控..."); return 1; // restart from the first page } } catch (error) { console.error("数据库操作失败:", error); } }, }); } catch (error) { console.error(`${props.options.name}增量爬取失败:`, error); } } async getInfo(pagenumber = 1, config) { let info = []; console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); let result = await this.getList(pagenumber, config); if (result[0]) { // failed, log the error console.error("获取页面数据失败: ", result[0]); return { 
pages: 0, info: [] }; } else { // later pages all require a captcha // let pages = 2; let html = result[1]; const $ = cheerio.load(html); let total = $("#recommendMsg .info-num-value").text(); let pages = Math.ceil(total / 15); if (pages > 2) { pages = 2; } $(".project-li").each((index, element) => { let id = $(element).find(".project-name0").attr("href"); let name = $(element).find(".project-name0").attr("title"); let publishTime = $(element).find(".pub-value0").text(); let leftDay = $(element).find(".left-day .emOrange:eq(0)").text(); let endTime = new Date( +new Date(publishTime) + leftDay * 24 * 60 * 60 * 1000 ).toLocaleDateString(); // console.log(endTime); let urls = "https://www.youzhicai.com" + id; if (keywordsInclude(name)) { console.log("处理项目:", name, publishTime, endTime); info.push({ id: id, name: name, publishTime: publishTime, endTime: endTime, urls: urls, }); } }); return { pages, info }; } } async getList(pagenumber, config) { let data = config.data; data.PageIndex = pagenumber; if (this.cookiePair.get("__RequestVerificationToken")) { data.__RequestVerificationToken = this.cookiePair.get( "__RequestVerificationToken" ); } let headers = { Accept: "text/plain, */*; q=0.01", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", Origin: "https://www.youzhicai.com", Pragma: "no-cache", Priority: "u=1, i", Referer: "https://www.youzhicai.com/s/1_1_0_0_.html", "Sec-Ch-Ua": '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": '"macOS"', "Sec-Fetch-Dest": "empty", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Site": "same-origin", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", "X-Requested-With": "XMLHttpRequest", }; try { const response = await this.axiosInstance({ url: config.url, data, method: "post", headers, }); let result = response.data; return [null, 
result]; } catch (err) { console.log("cookie不对"); try { await this.initializeCookie(); data.__RequestVerificationToken = this.cookiePair.get( "__RequestVerificationToken" ); const retryResponse = await this.axiosInstance({ url: config.url, data, method: "post", headers, }); // console.log(retryResponse.data); let result = retryResponse.data; return [null, result]; } catch (retryErr) { return [retryErr, null]; } } } } new YouZhiCai([ { name: "优质采【招标公告】", info: [], options: { name: "优质采【招标公告】", url: "https://www.youzhicai.com/s/1_1_0_0_.html", data: { MsProvince: "", MsCity: "", MsStartDate: "", MsEndDate: "", AutoOr: 0, BackOr: 0, NoticeTitle: "", searchAccuracy: "precise", matchType: "precise", TenderType: "", MsBidderType: 1, MsNoticeType: 1, MsPublishType: 0, MsSingUpType: 1, MsSort: 2, MsProvince: "", PageIndex: 1, PageSize: 15, AgencyId: "", SecondSearch: "", SecondSearchType: "", TotalSize: 10000, SearchRange: 3, year: "", key1: "", key2: "", key3: "", }, }, }, ]); new YouZhiCai([ { name: "优质采【澄清/变更公告】", info: [], options: { name: "优质采【澄清/变更公告】", url: "https://www.youzhicai.com/s/1_1_0_0_.html", data: { MsProvince: "", MsCity: "", MsStartDate: "", MsEndDate: "", AutoOr: 0, BackOr: 0, NoticeTitle: "", searchAccuracy: "precise", matchType: "precise", TenderType: "", MsBidderType: 1, MsNoticeType: 5, MsPublishType: 0, MsSingUpType: 1, MsSort: 2, MsProvince: "", PageIndex: 1, PageSize: 15, AgencyId: "", SecondSearch: "", SecondSearchType: "", TotalSize: 10000, SearchRange: 3, year: "", key1: "", key2: "", key3: "", }, }, }, ]); new YouZhiCai([ { name: "优质采【招标项目计划】", info: [], options: { name: "优质采【招标项目计划】", url: "https://www.youzhicai.com/s/1_1_0_0_.html", data: { MsProvince: "", MsCity: "", MsStartDate: "", MsEndDate: "", AutoOr: 0, BackOr: 0, NoticeTitle: "", searchAccuracy: "precise", matchType: "precise", TenderType: "", MsBidderType: 1, MsNoticeType: 7, MsPublishType: 0, MsSingUpType: 1, MsSort: 2, MsProvince: "", PageIndex: 1, PageSize: 15, AgencyId: "", 
SecondSearch: "", SecondSearchType: "", TotalSize: 10000, SearchRange: 3, year: "", key1: "", key2: "", key3: "", }, }, }, ]);