// China Pacific Insurance (太平洋保险, CPIC) announcement crawler
|
||
import axios from "axios";
|
||
import fs from "fs";
|
||
import path from "path";
|
||
import { timestampToDate, loopCall, keywordsInclude } from "../utils.js";
|
||
import config from "../config.js";
|
||
import { SQLiteMessageQueue } from "../sqlite.js";
|
||
import * as cheerio from "cheerio";
|
||
|
||
/**
 * Crawler for the China Pacific Insurance (CPIC) announcement list.
 *
 * On construction it builds its single-site configuration, opens the SQLite
 * backed queue and starts crawling. For every configured site it runs a full
 * crawl when the queue holds no announcements for that spider yet, otherwise
 * it goes straight to incremental polling.
 *
 * NOTE(review): `loopCall`, `config`, `keywordsInclude` and
 * `SQLiteMessageQueue` come from project modules not visible here; the
 * descriptions of how they are used below are limited to what this class
 * passes to them and reads back from them.
 */
class DF {
  constructor() {
    // One entry per crawled site: `info` accumulates the items collected by the
    // full crawl, `options` is handed to getInfo/getList as their `config`.
    this.jsonMap = [
      {
        name: "太平洋保险",
        info: [],
        options: {
          name: "太平洋保险",
          url: "https://www.cpic.com.cn/aboutUs/gsdt/zxgg/",
          homeIndex: "index.shtml",
        },
      },
    ];
    console.log("太平洋保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Intentionally not awaited: start() handles and logs its own failures.
    this.start();
  }

  /**
   * Resolve an anchor's href into an absolute URL string.
   *
   * @param {string|undefined} href - Raw `href` attribute value.
   * @param {string} [base] - URL the href is relative to (the list page).
   * @returns {string|null} Absolute URL, or null when href is missing/invalid.
   */
  static resolveLink(href, base = "https://www.cpic.com.cn") {
    if (typeof href !== "string" || href.trim() === "") {
      return null;
    }
    try {
      // The URL constructor leaves absolute hrefs untouched and resolves
      // root-relative and page-relative hrefs against `base`.
      return new URL(href, base).href;
    } catch {
      return null;
    }
  }

  /**
   * Entry point: runs init() and logs (rather than propagates) any failure.
   *
   * @returns {Promise<void>}
   */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Choose the crawl mode for every configured site.
   *
   * @returns {Promise<void>}
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl: walk the list pages from page 1, collect every matching item
   * into `props.info`, persist them once finished, then switch to
   * incremental polling.
   *
   * @param {{name: string, info: object[], options: object}} props - jsonMap entry.
   */
  loopFetchFull(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        // Stop at the last page reported by getInfo or at the configured cap.
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          // Keep watching for new items even if persisting failed.
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental crawl: repeatedly fetch pages and persist only the items the
   * queue does not know yet.
   *
   * The `readyForNext` callback returns the next page to fetch: the following
   * page while a page consists solely of new items, otherwise page 1.
   *
   * @param {{name: string, info: object[], options: object}} props - jsonMap entry.
   */
  loopFetchIncrement(props) {
    try {
      loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1; // nothing new: restart from the first page
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // A page made up entirely of new items means older pages may hold
            // more unseen items; a partially known page means we caught up.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number so a database error does
            // not leave the next page undefined.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetch one list page and extract the announcements that pass the keyword
   * filter and link to an `.shtml` detail page.
   *
   * @param {number} [pagenumber=1] - 1-based list page number.
   * @param {{name: string, url: string, homeIndex: string}} config - Site
   *   options (this parameter shadows the imported `config` module here).
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 and
   *   `info` is empty when the request failed.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [err, html] = await this.getList(pagenumber, config);
    if (err) {
      // Network-level failures carry no HTTP status; fall back to the error
      // code or message so the log line is never just "undefined".
      const reason =
        err.response?.status ?? err.status ?? err.code ?? err.message;
      console.error("获取页面数据失败:", reason);
      return { pages: 0, info: [] };
    }

    // The list exposes no end-time field, so the page count defaults to 5.
    const pages = 5;
    const info = [];
    const $ = cheerio.load(html);
    $(".newsListSimple li").each((_index, element) => {
      const link = $(element).find("a");
      const id = link.attr("href");
      const name = link.text();
      // Resolve against the list URL so absolute and relative hrefs both work.
      const urls = DF.resolveLink(id, config.url);
      if (urls !== null && keywordsInclude(name) && urls.includes("shtml")) {
        console.log("处理项目:", id, name);
        info.push({
          id,
          name,
          publishTime: null,
          endTime: null,
          urls,
        });
      }
    });
    return { pages, info };
  }

  /**
   * Download the raw HTML of one list page.
   *
   * Page 1 lives at `config.url + config.homeIndex`; every other page at
   * `config.url + "index_<n>.shtml"`.
   *
   * @param {number} pagenumber - 1-based list page number.
   * @param {{url: string, homeIndex: string}} config - Site options.
   * @returns {Promise<[Error|null, string|null]>} `[null, html]` on success,
   *   `[error, null]` on failure; this method never rejects.
   */
  async getList(pagenumber, config) {
    const pageFile =
      pagenumber === 1 ? config.homeIndex : `index_${pagenumber}.shtml`;
    try {
      const res = await axios({
        url: config.url + pageFile,
        method: "get",
      });
      return [null, res.data];
    } catch (err) {
      // Guarantee a truthy error slot so callers cannot mistake a failure
      // for a success.
      return [err ?? new Error("request failed"), null];
    }
  }
}
|
||
|
||
// Instantiate the crawler; the constructor itself kicks off crawling via start().
new DF();
|