From 9800d231d12e241d3163389bb92d6b9a6ff42531 Mon Sep 17 00:00:00 2001 From: huzhengrong Date: Fri, 24 Oct 2025 14:59:50 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=A6=84=20refactor(service):=20=E5=88=A0?= =?UTF-8?q?=E9=99=A4=E4=B8=8D=E9=9C=80=E8=A6=81=E7=9A=84spider?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- service/jianghuai.js | 385 ------------------------------------------- 1 file changed, 385 deletions(-) delete mode 100644 service/jianghuai.js diff --git a/service/jianghuai.js b/service/jianghuai.js deleted file mode 100644 index 5085735..0000000 --- a/service/jianghuai.js +++ /dev/null @@ -1,385 +0,0 @@ -import axios from "axios"; -import fs from "fs"; -import path from "path"; -import JSON5 from "json5"; -import { timestampToDate, loopCall, keywordsInclude } from "../utils.js"; -import config from "../config.js"; -import { SQLiteMessageQueue } from "../sqlite.js"; - -class JiangHuai { - constructor(jsonMap) { - this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 }); - this.axiosInstance.interceptors.request.use((config) => { - // 添加cookie到请求头 - const cookieString = Array.from(this.cookiePair.entries()) - .map(([name, value]) => `${name}=${value}`) - .join("; "); - config.headers.Cookie = cookieString; - return config; - }); - this.axiosInstance.interceptors.response.use( - (response) => { - // 更新cookie到请求头 - let cookieArr = response.headers["set-cookie"]; - this.extractCookie(cookieArr); - return response; - }, - (error) => { - return Promise.reject(error); - } - ); - this.cookiePair = new Map(); - this.csrfToken = ""; - this.jsonMap = jsonMap; - // [ - // { - // name: "江淮【招标公告】", - // info: [], - // options: { - // name: "江淮【招标公告】", - // url: "https://ahjhqc.youzhicai.com/domain/data-list-new", - // data: { - // pageIndex: 1, - // type: 1, - // companyId: "", - // title: "", - // ntype: 1, - // start_time: "", - // end_time: "", - // child: "", - // tenderType: 3, - // }, - // }, - // }, - // { - // name: "江淮【变更/澄清公告】", - // info: [], - // options: { - // name: "江淮【变更/澄清公告】", - // url: "https://ahjhqc.youzhicai.com/domain/data-list-new", - // data: { - // pageIndex: 1, - // type: 1, - // companyId: "", - // title: "", - // ntype: "4,6", - // start_time: "", - // end_time: "", - // child: "", - // tenderType: 3, - // }, - // }, - // }, - // ]; - console.log("江淮 爬虫启动..."); - this.queue = new SQLiteMessageQueue(); - this.start(); - } - - async start() { - try { - await this.init(); - } catch (err) { - console.error("启动失败:", err); - } - } - async init() { - for (let item of this.jsonMap) { - let announcements = this.queue.getAnnouncementsBySpider(item.name); - if (announcements.length > 0) { - this.loopFetchIncrement(item); - } else { - this.loopFetchFull(item); - } - } - } - async initializeCookie() { - try { - let headers = { - headers: { - "User-Agent": - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", - Accept: - "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", - "Accept-Language": "zh-CN,zh;q=0.9", - "Cache-Control": "no-cache", - Pragma: "no-cache", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Upgrade-Insecure-Requests": "1", - }, - }; - const homeResponse = await this.axiosInstance.get( - "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1", - headers - ); - // 提取csrf-token - let tokenMatch = homeResponse.data.match( - / { - return ( - pagenumber >= result.pages || pagenumber >= config.pageNumberLimit - ); - }, - readyForNext: (pagenumber, result) => { - props.info.push(...result.info); - return pagenumber + 1; - }, - complete: (result) => { - props.info.push(...result.info); - console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); - try { - if (props.info.length > 0) { - this.queue.saveAnnouncements(props.name, props.info); - this.queue.addMessage(props.name, props.info); - } - } catch (error) { - console.error("数据库操作失败:", error); - } - this.loopFetchIncrement(props); - }, - }); - } catch (error) { - console.error(`${props.options.name}全量爬取失败:`, error); - } - } - loopFetchIncrement(props) { - console.log("开始增量爬取"); - try { - loopCall(this.getInfo.bind(this), { - time: config.incrementFetchTime, // 5分钟间隔 - pagenumber: 1, - additional: props.options, - readyForNext: (pagenumber, result) => { - try { - let newInfo = this.queue.filterNewAnnouncements( - props.name, - result.info - ); - // 存在新数据 - if (newInfo.length > 0) { - console.log(`发现 ${newInfo.length} 条新数据`); - // props.info.push(...newInfo); - this.queue.saveAnnouncements(props.name, newInfo); - // this.writeFile(props); - this.queue.addMessage(props.name, newInfo); - // 全是新数据,继续下一页 - if (newInfo.length === result.info.length) { - return pagenumber + 1; - } else { - // 有部分重复数据,重新从第一页开始 - return 1; - } - } else { - console.log("没有发现新数据,继续监控..."); - return 1; // 重新从第一页开始 - } - } catch (error) { - console.error("数据库操作失败:", error); - } - }, - }); - } catch (error) { - console.error(`${props.options.name}增量爬取失败:`, error); - } - } - async getInfo(pagenumber = 1, config) { - let info = []; - console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); - let result = await this.getList(pagenumber, config); - if (result[0]) { - // 出错, 记录错误日志 - console.error("获取页面数据失败: ", result[0]); - return { pages: 0, info: [] }; - } else { - // 公开寻源 - let arr = result[1].list; - let total = result[1].total; - let pages = Math.ceil(total / 10); - - for (let i = 0; i < arr.length; i++) { - let item = arr[i]; - let endTime, publishTime; - publishTime = new Date(item.startTime).toLocaleDateString(); - endTime = new Date(item.endTime).toLocaleDateString(); - // 命中关键词 - if ( - keywordsInclude(item.noticeTitle) && - item.endTime && - +new Date(item.endTime) >= Date.now() - ) { - console.log("处理项目:", item.noticeTitle); - info.push({ - id: item.bulletinSID, - name: item.noticeTitle, - publishTime: publishTime, - endTime: endTime, - urls: `https://ahjhqc.youzhicai.com/${item.Url}`, - }); - } - } - return { pages, info }; - } - } - async getList(pagenumber, config) { - let data = config.data; - data.pageIndex = pagenumber; - let headers = { - Accept: "text/plain, */*; q=0.01", - "Accept-Language": "zh-CN,zh;q=0.9", - "Cache-Control": "no-cache", - "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", - Origin: "https://ahjhqc.youzhicai.com", - Pragma: "no-cache", - Priority: "u=1, i", - Referer: - "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1", - "Sec-Ch-Ua": - '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"', - "Sec-Ch-Ua-Mobile": "?0", - "Sec-Ch-Ua-Platform": '"macOS"', - "Sec-Fetch-Dest": "empty", - "Sec-Fetch-Mode": "cors", - "Sec-Fetch-Site": "same-origin", - "User-Agent": - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36", - "X-Requested-With": "XMLHttpRequest", - "X-Csrf-Token": this.csrfToken, - }; - try { - const response = await this.axiosInstance({ - url: config.url, - data, - method: "post", - headers, - }); - let result = JSON5.parse(response.data); - if (result.list && result.list.length > 0) { - return [null, result]; - } else { - return ["err", null]; - } - } catch (err) { - console.log("cookie不对"); - try { - await this.initializeCookie(); - headers["X-Csrf-Token"] = this.csrfToken; - const retryResponse = await this.axiosInstance({ - url: config.url, - data, - method: "post", - headers, - }); - // console.log(retryResponse.data); - let result = JSON5.parse(retryResponse.data); - if (result.list && result.list.length > 0) { - return [null, result]; - } else { - return ["err", null]; - } - } catch (retryErr) { - return [retryErr, null]; - } - } - } - // 分页获取数据 - // getList(pagenumber, config) { - // let data = config.data; - // data.pageIndex = pagenumber; - // return axios({ - // url: config.url, - // data: data, - // method: "post", - // headers: { - // "Content-Type": "application/x-www-form-urlencoded", - // }, - // }) - // .then((res) => { - // let result = res.data; - // if (result.list && result.list.length > 0) { - // return [null, result]; - // } else { - // return ["err", null]; - // } - // }) - // .catch((err) => { - // return [err, null]; - // }); - // } -} - -new JiangHuai([ - { - name: "江淮【招标公告】", - info: [], - options: { - name: "江淮【招标公告】", - url: "https://ahjhqc.youzhicai.com/domain/data-list-new", - data: { - pageIndex: 1, - type: 1, - companyId: "", - title: "", - ntype: 1, - start_time: "", - end_time: "", - child: "", - tenderType: 3, - }, - }, - }, -]); -new JiangHuai([ - { - name: "江淮【变更/澄清公告】", - info: [], - options: { - name: "江淮【变更/澄清公告】", - url: "https://ahjhqc.youzhicai.com/domain/data-list-new", - data: { - pageIndex: 1, - type: 1, - companyId: "", - title: "", - ntype: "4,6", - start_time: "", - end_time: "", - child: "", - tenderType: 3, - }, - }, - }, -]);