diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs index 3fbbb54..6bd6c4c 100644 --- a/ecosystem.config.cjs +++ b/ecosystem.config.cjs @@ -49,22 +49,6 @@ module.exports = { log_file: "./logs/cpic-combined.log", time: true, }, - // {//中国人寿保险爬虫 - // name: "china-life-spider", - // script: "./service/china-life.js", - // instances: 1, - // autorestart: true, - // watch: false, - // max_memory_restart: "300M", - // env: { - // NODE_ENV: "production", - // SPIDER_NAME: "china-life", - // }, - // error_file: "./logs/china-life-error.log", - // out_file: "./logs/china-life-out.log", - // log_file: "./logs/china-life-combined.log", - // time: true, - // }, {//中华保险爬虫 name: "cic-spider", script: "./service/cic.js", diff --git a/service/china-life.js b/service/china-life.js deleted file mode 100644 index a4fa4f7..0000000 --- a/service/china-life.js +++ /dev/null @@ -1,197 +0,0 @@ -import axios from "axios"; -import fs from "fs"; -import path from "path"; -import { timestampToDate, loopCall, keywordsInclude } from "../utils.js"; -import config from "../config.js"; -import { SQLiteMessageQueue } from "../sqlite.js"; -import puppeteer from 'puppeteer'; - -class DF { - constructor() { - this.jsonMap = [ - { - name: "中国人寿保险", - info: [], - options: { - name: "中国人寿保险", - url: "https://cpmsx.e-chinalife.com/xycms/#/procurementAnncmnt", - homeIndex: "", - }, - }, - ]; - console.log("中国人寿保险 爬虫启动..."); - this.queue = new SQLiteMessageQueue(); - this.start(); - } - - async start() { - try { - await this.init(); - } catch (err) { - console.error("启动失败:", err); - } - } - async init() { - for (let item of this.jsonMap) { - let announcements = this.queue.getAnnouncementsBySpider(item.name); - if (announcements.length > 0) { - this.loopFetchIncrement(item); - } else { - this.loopFetchFull(item); - } - } - } - // 全量爬取 - loopFetchFull(props) { - try { - loopCall(this.getInfo.bind(this), { - time: config.fullFetchTime, - pagenumber: 1, - additional: props.options, - stopWhen: (pagenumber, result) => { - return ( - pagenumber >= result.pages || pagenumber >= config.pageNumberLimit - ); - }, - readyForNext: (pagenumber, result) => { - props.info.push(...result.info); - return pagenumber + 1; - }, - complete: (result) => { - props.info.push(...result.info); - console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`); - try { - if (props.info.length > 0) { - this.queue.saveAnnouncements(props.name, props.info); - // this.writeFile(props); - this.queue.addMessage(props.name, props.info); - } - } catch (error) { - console.error("数据库操作失败:", error); - } - this.loopFetchIncrement(props); - }, - }); - } catch (error) { - console.error(`${props.options.name}全量爬取失败:`, error); - } - } - loopFetchIncrement(props) { - try { - loopCall(this.getInfo.bind(this), { - time: config.incrementFetchTime, // 5分钟间隔 - pagenumber: 1, - additional: props.options, - readyForNext: (pagenumber, result) => { - try { - let newInfo = this.queue.filterNewAnnouncements( - props.name, - result.info - ); - // 存在新数据 - if (newInfo.length > 0) { - console.log(`发现 ${newInfo.length} 条新数据`); - // props.info.push(...newInfo); - this.queue.saveAnnouncements(props.name, newInfo); - // this.writeFile(props); - this.queue.addMessage(props.name, newInfo); - // 全是新数据,继续下一页 - if (newInfo.length === result.info.length) { - return pagenumber + 1; - } else { - // 有部分重复数据,重新从第一页开始 - return 1; - } - } else { - console.log("没有发现新数据,继续监控..."); - return 1; // 重新从第一页开始 - } - } catch (error) { - console.error("数据库操作失败:", error); - } - }, - }); - } catch (error) { - console.error(`${props.options.name}增量爬取失败:`, error); - } - } - async getInfo(pagenumber = 1, config) { - let info = []; - console.log(`${config.name}--获取第 ${pagenumber} 页数据...`); - let result = await this.getList(pagenumber, config); - if (result[0]) { - // 出错, 记录错误日志 - console.error("获取页面数据失败:", result[0].status); - return { pages: 0, info: [] }; - } else { - // 列表没有结束时间字段,默认5页 - let pages = 5; - let html = result[1]; - let browser = result[2]; - try{ - - const listData = await html.$$eval('.cardPadding', items => { - return items.map(item => { - console.log("item:", item); - let id = item.querySelector('a').innerHTML; - let name = item.querySelector('a').innerHTML; - let publishTime = item.querySelector('.releaseDate').innerHTML; - let endTime = null; - let urls = item.querySelector('a').getAttribute('href'); - - // if ( - // keywordsInclude(name) - // ) { - console.log("处理项目:", id, name); - info.push({ - id: id, - name: name, - publishTime: publishTime, - endTime: endTime, - urls: urls, - }); - // } - }); - }); - } catch (error) { - console.log("getInfo失败:", error); - } - - await browser.close(); - return { pages, info }; - } - } - // 分页获取数据 - async getList(pagenumber, config) { - let url = config.url; - try{ - const browser = await puppeteer.launch(); - const page = await browser.newPage(); - await page.goto(url) - console.log("页面加载完成-page",page); - // if (pagenumber != 1) { - // await page.locator('.el-pagination__jump input').fill(pagenumber) - // await page.keyboard.press('Enter'); - // await page.waitForResponse('https://cpmsx.e-chinalife.com/ocps/ocps-anncmnt-backend/microapp/web/exp/outer/getCmsContentList'); - // } - return [null,page,browser] - }catch (error) { - console.log("getList失败:", error); - return [error, null]; - } - - // return axios({ - // url: url, - // method: "get", - // }) - // .then((res) => { - // let result = res.data; - // return [null, result]; - // }) - // .catch((err) => { - // return [err, null]; - // }); - } -} - -new DF();