import axios from "axios";
import fs from "fs";
import path from "path";
import {
  timestampToDate,
  loopCall,
  keywordsInclude,
  getYiqiNoticeUrl,
  parseToGgDetailsParams,
} from "./utils.js";
import config from "./config.js";
import * as cheerio from "cheerio";
import { SQLiteMessageQueue } from "./sqlite.js";

// Key under which this spider's announcements and messages are stored in the queue.
const SPIDER_NAME = "蔚来";

// Listing page; the notices are read from the JSON inside its `#__NEXT_DATA__` element.
const TENDER_NOTICES_URL = "https://www.nio.cn/partnership/tender-notices";

/**
 * Crawler for the NIO tender-notice listing.
 *
 * On start it checks whether the queue already holds announcements for this
 * spider: if so it goes straight to incremental monitoring, otherwise it runs
 * a full crawl first and then switches to incremental monitoring.
 *
 * NOTE(review): `loopCall` lives in ./utils.js and is not visible here; the
 * callback semantics described below (`stopWhen`, `readyForNext`, `complete`)
 * are inferred from how they are used in this file — confirm against utils.js.
 */
class NIO {
  /**
   * Creates the queue handle and immediately kicks off the crawl.
   * The crawl runs asynchronously; `start()` handles its own errors, so the
   * promise is intentionally not awaited here.
   */
  constructor() {
    this.info = [];
    console.log("蔚来 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /**
   * Entry point: runs `init()` and logs any failure instead of rejecting.
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Chooses the crawl mode based on whether announcements for this spider
   * are already stored.
   * @returns {Promise<void>}
   */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walks pages until the last page or the configured page limit,
   * accumulates the matching notices in `this.info`, persists them, and then
   * hands over to incremental monitoring.
   * @returns {Promise<void>}
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(SPIDER_NAME, this.info);
              this.queue.addMessage(SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }

    console.log("开始增量爬取...");
    // Awaited (as in init()) so the promise is not left floating.
    await this.increment();
  }

  /**
   * Incremental monitoring: repeatedly fetches the listing, stores only the
   * notices the queue has not seen yet, and publishes them as a message.
   * @returns {Promise<void>}
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        // Polling interval comes from config.
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              SPIDER_NAME,
              result.info
            );

            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              // Nothing new: start over from the first page.
              return 1;
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(SPIDER_NAME, newInfo);
            this.queue.addMessage(SPIDER_NAME, newInfo);

            // Whole page was new -> look at the next page as well;
            // partially known page -> restart from the first page.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number; previously this path
            // returned undefined.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetches one listing page and extracts the notices that are still open
   * (due time not in the past) and whose title passes `keywordsInclude`.
   *
   * Fetch and parse failures are logged and reported as an empty page
   * (`{ pages: 0, info: [] }`) rather than thrown.
   *
   * @param {number} [pagenumber=1] Page to fetch.
   * @returns {Promise<{pages: number, info: Array<{id: *, name: *, publishTime: *, endTime: *, urls: *}>}>}
   *   `pages` is fixed at 1 on success, so a full crawl stops after the first page.
   */
  async getInfo(pagenumber = 1) {
    console.log(`正在获取第 ${pagenumber} 页数据...`);

    const [fetchError, html] = await this.getHtml(pagenumber);
    if (fetchError) {
      console.error("获取页面数据失败:", fetchError);
      return { pages: 0, info: [] };
    }

    let notices;
    try {
      const $ = cheerio.load(html);
      const payload = JSON.parse($("#__NEXT_DATA__").text());
      notices = payload.props.pageProps.tenderNotices;
      if (!Array.isArray(notices)) {
        throw new TypeError("tenderNotices is not an array");
      }
    } catch (error) {
      // Same outcome as a failed fetch, so one malformed response does not
      // escape into loopCall.
      console.error("解析页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    const info = [];
    for (const item of notices) {
      const name = item.title;
      const endTime = item.dueTime;

      const isOpen = endTime && +new Date(endTime) >= Date.now();
      if (!isOpen || !keywordsInclude(name)) {
        continue;
      }

      // A notice without attached documents keeps `urls` undefined instead of
      // throwing and discarding the whole page.
      const firstDocument = Array.isArray(item.documents)
        ? item.documents[0]
        : undefined;

      info.push({
        id: item.id,
        name,
        publishTime: item.publishDate,
        endTime,
        urls: firstDocument ? firstDocument.url : undefined,
      });
    }

    return { pages: 1, info };
  }

  /**
   * Downloads the listing page.
   *
   * NOTE(review): `pagenumber` is accepted but not sent with the request, so
   * every call fetches the same URL — confirm whether the site paginates.
   *
   * @param {number} pagenumber Requested page (currently unused).
   * @returns {Promise<[Error|null, *]>} `[null, body]` on success, `[err, null]` on failure.
   */
  async getHtml(pagenumber) {
    try {
      const res = await axios({
        url: TENDER_NOTICES_URL,
        method: "get",
      });
      return [null, res.data];
    } catch (err) {
      return [err, null];
    }
  }
}

new NIO();