初始化
This commit is contained in:
commit
12ee63b814
|
|
@ -0,0 +1,109 @@
|
|||
# Node.js
|
||||
node_modules/
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
pnpm-debug.log*
|
||||
package-lock.json
|
||||
yarn.lock
|
||||
pnpm-lock.yaml
|
||||
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
*.log.*
|
||||
log/
|
||||
pids
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
*.db
|
||||
|
||||
# dotenv environment variables
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
|
||||
# Editor directories and files
|
||||
.idea/
|
||||
.vscode/
|
||||
*.sublime-workspace
|
||||
*.sublime-project
|
||||
|
||||
# Build output
|
||||
dist/
|
||||
build/
|
||||
out/
|
||||
coverage/
|
||||
.nyc_output/
|
||||
|
||||
# Optional npm cache directory
|
||||
.npm/
|
||||
|
||||
# Optional eslint cache
|
||||
.eslintcache
|
||||
|
||||
# Optional REPL history
|
||||
.node_repl_history
|
||||
|
||||
# Mac system files
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Test coverage
|
||||
coverage/
|
||||
|
||||
# TypeScript cache
|
||||
*.tsbuildinfo
|
||||
|
||||
# Optional: local data
|
||||
*.local
|
||||
|
||||
# Optional: debug
|
||||
debug.log
|
||||
|
||||
# Optional: next.js
|
||||
.next/
|
||||
|
||||
# Optional: Nuxt.js
|
||||
.nuxt/
|
||||
|
||||
# Optional: SvelteKit
|
||||
.svelte-kit/
|
||||
|
||||
# Optional: vuepress
|
||||
.vuepress/dist
|
||||
|
||||
# Optional: Storybook
|
||||
.storybook-out/
|
||||
|
||||
# Optional: Parcel
|
||||
.cache/
|
||||
|
||||
# Optional: output of 'npm pack'
|
||||
*.tgz
|
||||
|
||||
# Optional: PM2 logs and pids
|
||||
pids/
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
pm2.log
|
||||
|
||||
# Optional: dotenv
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
|
||||
# Optional: jest
|
||||
jest.config.js
|
||||
jest.config.ts
|
||||
|
||||
# Optional: cypress
|
||||
cypress/videos/
|
||||
cypress/screenshots/
|
||||
|
|
@ -0,0 +1,173 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
|
||||
// Spider key under which announcements are stored and messages are queued.
const BYD_SPIDER_NAME = "比亚迪";
// Page size sent to the list API; the page count is derived from the same value.
const BYD_PAGE_SIZE = 10;

/**
 * Crawler for BYD tender announcements (spcn.byd.com).
 *
 * On start it asks the queue whether announcements for this spider already
 * exist: if so it goes straight to incremental polling, otherwise it runs a
 * bounded full crawl first and then switches to incremental polling.
 *
 * Paging is driven by `loopCall` from ./utils.js, which is not visible here;
 * the callbacks below assume it calls `readyForNext` to obtain the next page
 * number and `complete` with the last page once `stopWhen` returns true —
 * TODO confirm against utils.js.
 */
class BYD {
  constructor() {
    // Matching announcements collected during the full crawl.
    this.info = [];
    console.log("比亚迪 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this call cannot reject.
    this.start();
  }

  /** Entry point: runs init() and logs (rather than throws) any failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Chooses incremental or full crawl depending on whether data already exists. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(BYD_SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walks pages from 1 until the API's page count or
   * config.pageNumberLimit is reached, persists the collected items, then
   * hands over to incremental polling.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            // Nothing matched: skip the write and avoid queueing an empty message.
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(BYD_SPIDER_NAME, this.info);
              this.queue.addMessage(BYD_SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited so the promise is not left floating; increment() logs its own errors.
    await this.increment();
  }

  /**
   * Incremental polling: fetches page 1 at config.incrementFetchTime
   * intervals, stores and queues only unseen announcements, and keeps paging
   * forward only while an entire page is new.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              BYD_SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(BYD_SPIDER_NAME, newInfo);
            this.queue.addMessage(BYD_SPIDER_NAME, newInfo);
            // A fully new page means older pages may hold unseen items too;
            // any overlap means we have caught up, so restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number, even after a DB failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetches one list page and keeps the records whose title matches the
   * keyword filter and whose sign-up deadline has not passed.
   *
   * @param {number} [pagenumber=1] 1-based page index.
   * @returns {Promise<{pages: number, info: Array<object>}>} Page count derived
   *   from the API total, and the matching items. On request failure returns
   *   `{ pages: 0, info: [] }`.
   */
  async getInfo(pagenumber = 1) {
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    const { total, records } = payload.data ?? {};
    const pages = Math.ceil(total / BYD_PAGE_SIZE);
    // The API may send null instead of an empty list; treat it as "no rows".
    const rows = Array.isArray(records) ? records : [];

    const info = [];
    for (const item of rows) {
      const endTime = timestampToDate(
        new Date(item.signUpEndTime).getTime(),
        true
      );
      const isOpen = Boolean(endTime) && +new Date(endTime) >= Date.now();
      if (keywordsInclude(item.title) && isOpen) {
        info.push({
          id: item.sourcingId,
          name: item.title,
          publishTime: timestampToDate(
            new Date(item.tenderNoticePublishTime).getTime(),
            true
          ),
          endTime,
          urls: `https://spcn.byd.com/#/tender-detail?sourcingId=${item.sourcingId}`,
        });
      }
    }
    return { pages, info };
  }

  /**
   * Requests one page of tender announcements.
   *
   * @param {number} pagenumber 1-based page index.
   * @returns {Promise<[unknown, object|null]>} `[null, body]` on success,
   *   `[error, null]` on transport failure or a non-success API answer.
   *   Never rejects.
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: "https://spcn.byd.com/api/srm-sou-sp/supplier/supplier/getTenderAnnouncementInfo",
        data: {
          pageNo: pagenumber,
          pageSize: BYD_PAGE_SIZE,
        },
        method: "post",
      });
      const result = res.data;
      if (result.msg === "成功" && result.code === "000000") {
        return [null, result];
      }
      // Keep the API's own code/message so the log explains the rejection.
      return [
        new Error(
          `BYD list API rejected the request (code=${result.code}, msg=${result.msg})`
        ),
        null,
      ];
    } catch (err) {
      return [err, null];
    }
  }
}
|
||||
|
||||
new BYD();
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import {
|
||||
timestampToDate,
|
||||
loopCall,
|
||||
keywordsInclude,
|
||||
// addToMessageQueue,
|
||||
} from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
// import { messageQueue } from "./msgManager.js";
|
||||
// import cheerio from "cheerio";
|
||||
|
||||
// Spider key under which announcements are stored and messages are queued.
const CHANGAN_SPIDER_NAME = "长安";

/**
 * Crawler for Changan non-production sourcing notices (portal.changan.com.cn).
 *
 * Runs a bounded full crawl when the queue holds no announcements for this
 * spider, otherwise goes straight to incremental polling.
 *
 * Paging is driven by `loopCall` from ./utils.js, which is not visible here;
 * the callbacks assume it uses the value returned by `readyForNext` as the
 * next page number and calls `complete` with the last page — TODO confirm.
 */
class ChangAn {
  constructor() {
    // Matching notices collected during the full crawl.
    this.info = [];
    console.log("长安 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this call cannot reject.
    this.start();
  }

  /** Entry point: runs init() and logs (rather than throws) any failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Chooses incremental or full crawl depending on whether data already exists. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(
      CHANGAN_SPIDER_NAME
    );
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walks pages from 1 until the API's page count or
   * config.pageNumberLimit is reached, persists the collected items, then
   * switches to incremental polling.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            // Nothing matched: skip the write and avoid queueing an empty message.
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(CHANGAN_SPIDER_NAME, this.info);
              this.queue.addMessage(CHANGAN_SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited so the promise is not left floating; increment() logs its own errors.
    await this.increment();
  }

  /**
   * Incremental polling: fetches page 1 at config.incrementFetchTime
   * intervals, stores and queues only unseen notices, and keeps paging
   * forward only while an entire page is new.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              CHANGAN_SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(CHANGAN_SPIDER_NAME, newInfo);
            this.queue.addMessage(CHANGAN_SPIDER_NAME, newInfo);
            // A fully new page means older pages may hold unseen items too;
            // any overlap means we have caught up, so restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number, even after a DB failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetches one list page and keeps the records whose project name matches
   * the keyword filter.
   *
   * @param {number} [pagenumber=1] 1-based page index.
   * @returns {Promise<{pages: number, info: Array<object>}>} Page count as
   *   reported by the API, and the matching items. On request failure returns
   *   `{ pages: 0, info: [] }`.
   */
  async getInfo(pagenumber = 1) {
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    const { pages, records } = payload.result ?? {};
    // The API may send null instead of an empty list; treat it as "no rows".
    const rows = Array.isArray(records) ? records : [];

    const info = [];
    for (const item of rows) {
      if (!keywordsInclude(item.projectName)) {
        continue;
      }
      console.log("处理项目:", item.id, item.projectName);
      info.push({
        id: item.id,
        name: item.projectName,
        publishTime: item.startTime,
        endTime: item.endTime,
        urls: `https://portal.changan.com.cn/noProdNoticeInfo?_t=${Date.now()}&id=${item.id}`,
      });
    }
    return { pages, info };
  }

  /**
   * Requests one page of sourcing notices.
   *
   * @param {number} pagenumber 1-based page index.
   * @returns {Promise<[unknown, object|null]>} `[null, body]` on success,
   *   `[error, null]` on transport failure or when the API reports
   *   `success: false`. Never rejects.
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: "https://portal.changan.com.cn/backend_8086/changan_platform/api/nonPdcSourceNoticeCt/listSourceNoticePageBySupplier",
        params: {
          _t: Date.now(),
          pageNo: pagenumber,
          pageSize: 20,
        },
        method: "get",
      });
      const result = res.data;
      if (result.success) {
        return [null, result];
      }
      // Surface whatever the API said instead of an opaque "err" string.
      return [
        new Error(
          `Changan list API reported failure: ${JSON.stringify(result)}`
        ),
        null,
      ];
    } catch (err) {
      return [err, null];
    }
  }
}
|
||||
|
||||
new ChangAn();
|
||||
|
|
@ -0,0 +1,251 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import {
|
||||
timestampToDate,
|
||||
loopCall,
|
||||
keywordsInclude,
|
||||
// addToMessageQueue,
|
||||
} from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
// import { messageQueue } from "./msgManager.js";
|
||||
// import cheerio from "cheerio";
|
||||
|
||||
const CHERY_LIST_URL =
  "https://ebd.mychery.com/cms/api/dynamicData/queryContentPage";
const CHERY_SITE_ID = "747";
// Category whose rows carry publishDate/text instead of signUp* time fields.
const CHERY_SOURCING_PREVIEW_CATEGORY_ID = "965901485789413376";
// getInfo always reports this page count; the response's own paging is not read.
const CHERY_REPORTED_PAGES = 30;

/**
 * Converts an ISO-like timestamp ("2024-01-01T08:00:00.000") to
 * "2024-01-01 08:00:00". Returns null for missing or non-string values so a
 * single incomplete row cannot abort the whole page.
 */
function cheryToDateTime(value) {
  if (typeof value !== "string") {
    return null;
  }
  return value.replace("T", " ").split(".")[0];
}

/**
 * Crawler for Chery procurement notices (ebd.mychery.com). Three categories
 * are crawled independently; each has its own spider name in the queue.
 *
 * Paging is driven by `loopCall` from ./utils.js, which is not visible here;
 * the callbacks assume it passes `additional` as the second argument of the
 * fetch function, uses the value returned by `readyForNext` as the next page
 * number, and calls `complete` with the last page — TODO confirm.
 */
class Chery {
  constructor() {
    this.jsonMap = [
      {
        name: "奇瑞采购公告",
        info: [],
        options: {
          name: "采购公告",
          url: CHERY_LIST_URL,
          categoryId: "5035",
          siteId: CHERY_SITE_ID,
        },
      },
      {
        name: "奇瑞寻源预告",
        info: [],
        options: {
          name: "寻源预告",
          url: CHERY_LIST_URL,
          categoryId: CHERY_SOURCING_PREVIEW_CATEGORY_ID,
          siteId: CHERY_SITE_ID,
        },
      },
      {
        name: "奇瑞变更公告",
        info: [],
        options: {
          name: "变更公告",
          url: CHERY_LIST_URL,
          categoryId: "5032",
          siteId: CHERY_SITE_ID,
        },
      },
    ];
    console.log("奇瑞 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this call cannot reject.
    this.start();
  }

  /** Entry point: runs init() and logs (rather than throws) any failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Starts one crawl loop per category: incremental when the queue already
   * holds announcements for it, full otherwise.
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      // Deliberately not awaited so the categories run side by side; both
      // loop methods log their own failures and never reject.
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl of one category, bounded by the reported page count and
   * config.pageNumberLimit; afterwards switches to incremental polling.
   *
   * @param {{name: string, info: Array<object>, options: object}} props
   *   One entry of this.jsonMap.
   */
  async loopFetchFull(props) {
    try {
      // Awaited so an asynchronous failure inside loopCall reaches this catch.
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            // Nothing matched: skip the write and avoid queueing an empty message.
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling of one category at config.incrementFetchTime
   * intervals; stores and queues only unseen announcements.
   *
   * @param {{name: string, info: Array<object>, options: object}} props
   *   One entry of this.jsonMap.
   */
  async loopFetchIncrement(props) {
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // A fully new page means older pages may hold unseen items too;
            // any overlap means we have caught up, so restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number, even after a DB failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`奇瑞${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetches one list page of a category and keeps rows that match the
   * keyword filter and whose deadline has not passed.
   *
   * @param {number} [pagenumber=1] 1-based page index.
   * @param {{name: string, url: string, categoryId: string, siteId: string}} options
   *   Category settings (the `options` object of a jsonMap entry).
   * @returns {Promise<{pages: number, info: Array<object>}>} `pages` is always
   *   30, including on request failure (where `info` is empty).
   */
  async getInfo(pagenumber = 1, options) {
    console.log(`${options.name}--获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber, options);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: CHERY_REPORTED_PAGES, info: [] };
    }

    const pages = CHERY_REPORTED_PAGES;
    const rows = Array.isArray(payload.res?.rows) ? payload.res.rows : [];
    const isPreview = options.categoryId === CHERY_SOURCING_PREVIEW_CATEGORY_ID;

    const info = [];
    for (const item of rows) {
      let endTime;
      let publishTime;
      if (isPreview) {
        // Preview rows have no signUp* fields; the deadline is embedded in the text.
        publishTime = cheryToDateTime(item.publishDate);
        endTime = this.extractDeadlineTime(item.text);
      } else {
        endTime = cheryToDateTime(item.signUpEndTime);
        publishTime = cheryToDateTime(item.signUpBeginTime);
      }

      if (
        endTime &&
        keywordsInclude(item.title) &&
        +new Date(endTime) >= Date.now()
      ) {
        info.push({
          id: item.url,
          name: item.title,
          publishTime,
          endTime,
          urls: `https://ebd.mychery.com/cms` + item.url,
        });
      }
    }
    return { pages, info };
  }

  /**
   * Requests one page of a category.
   *
   * @param {number} pagenumber 1-based page index.
   * @param {{url: string, categoryId: string, siteId: string}} options
   * @returns {Promise<[unknown, object|null]>} `[null, body]` when the API
   *   answers with `code === 0`, `[error, null]` otherwise. Never rejects.
   */
  async getList(pagenumber, options) {
    try {
      const res = await axios({
        url: options.url,
        data: {
          dto: {
            bidType: "",
            categoryId: options.categoryId,
            city: "",
            county: "",
            province: "",
            purchaseMode: "",
            secondCompanyId: "",
            siteId: options.siteId,
          },
          pageNo: pagenumber,
          pageSize: "10",
        },
        method: "post",
      });
      const result = res.data;
      if (result.code === 0) {
        return [null, result];
      }
      // Keep the API's own code so the log explains the rejection.
      return [
        new Error(`Chery list API rejected the request (code=${result.code})`),
        null,
      ];
    } catch (err) {
      return [err, null];
    }
  }

  /**
   * Pulls the sign-up deadline out of a preview notice body.
   *
   * @param {string} html Notice text; anything that is not a string yields null.
   * @returns {string|null} "YYYY-MM-DD HH:mm:ss" following the
   *   "预告报名截止时间:" label, or null when absent.
   */
  extractDeadlineTime(html) {
    if (typeof html !== "string") {
      return null;
    }
    const regex = /预告报名截止时间:(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/;
    const match = html.match(regex);
    return match ? match[1] : null;
  }
}
|
||||
|
||||
new Chery();
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
// Shared crawler settings, imported by the spider modules as `config`.
export default {
|
||||
// Upper bound on the page number a full crawl walks to (the spiders'
// stopWhen callbacks stop once pagenumber >= this value).
|
||||
pageNumberLimit: 3,
|
||||
// Passed as `time` to loopCall during a full crawl — presumably the delay
// between page requests in milliseconds; confirm against utils.js.
fullFetchTime: 2000,
|
||||
// Passed as `time` to loopCall during incremental polling; the spiders
// describe it as a 5-minute interval.
incrementFetchTime: 5 * 60 * 1000,
|
||||
};
|
||||
|
|
@ -0,0 +1,187 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
import * as cheerio from "cheerio";
|
||||
|
||||
// Site origin that the relative links scraped from the list pages are appended to.
const DF_ORIGIN = "https://etp.dfmc.com.cn";
// Pages from the sixth onward require a captcha, so only five are reported.
const DF_REACHABLE_PAGES = 5;

/**
 * Crawler for Dongfeng trade notices (etp.dfmc.com.cn). The site serves
 * static HTML list pages, parsed here with cheerio. Two sections are crawled
 * independently; each has its own spider name in the queue.
 *
 * Paging is driven by `loopCall` from ./utils.js, which is not visible here;
 * the callbacks assume it passes `additional` as the second argument of the
 * fetch function, uses the value returned by `readyForNext` as the next page
 * number, and calls `complete` with the last page — TODO confirm.
 */
class DF {
  constructor() {
    this.jsonMap = [
      {
        name: "东风【招标采购】",
        info: [],
        options: {
          name: "东风【招标采购】",
          url: "https://etp.dfmc.com.cn/jyxx/004001/",
          homeIndex: "trade_info_new.html",
        },
      },
      {
        name: "东风【非招标采购】",
        info: [],
        options: {
          name: "东风【非招标采购】",
          url: "https://etp.dfmc.com.cn/jyxx/004002/",
          homeIndex: "trade_info_newf.html",
        },
      },
    ];
    console.log("东风 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this call cannot reject.
    this.start();
  }

  /** Entry point: runs init() and logs (rather than throws) any failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Starts one crawl loop per section: incremental when the queue already
   * holds announcements for it, full otherwise.
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      // Deliberately not awaited so the sections run side by side; both
      // loop methods log their own failures and never reject.
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl of one section, bounded by the reachable page count and
   * config.pageNumberLimit; afterwards switches to incremental polling.
   *
   * @param {{name: string, info: Array<object>, options: object}} props
   *   One entry of this.jsonMap.
   */
  async loopFetchFull(props) {
    try {
      // Awaited so an asynchronous failure inside loopCall reaches this catch.
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling of one section at config.incrementFetchTime
   * intervals; stores and queues only unseen announcements.
   *
   * @param {{name: string, info: Array<object>, options: object}} props
   *   One entry of this.jsonMap.
   */
  async loopFetchIncrement(props) {
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // A fully new page means older pages may hold unseen items too;
            // any overlap means we have caught up, so restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number, even after a DB failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Downloads one list page and scrapes the table rows whose deadline has
   * not passed and whose title matches the keyword filter.
   *
   * @param {number} [pagenumber=1] 1-based page index.
   * @param {{name: string, url: string, homeIndex: string}} options
   *   Section settings (the `options` object of a jsonMap entry).
   * @returns {Promise<{pages: number, info: Array<object>}>} `pages` is 5 on
   *   success and 0 on request failure.
   */
  async getInfo(pagenumber = 1, options) {
    console.log(`${options.name}--获取第 ${pagenumber} 页数据...`);
    const [error, html] = await this.getList(pagenumber, options);
    if (error) {
      // Prefer the HTTP status; fall back to the message for network errors
      // that never received a response.
      console.error(
        "获取页面数据失败:",
        error.response?.status ?? error.status ?? error.message
      );
      return { pages: 0, info: [] };
    }

    const pages = DF_REACHABLE_PAGES;
    const info = [];
    const $ = cheerio.load(html);
    $(".public-table tbody tr").each((index, element) => {
      const row = $(element);
      const link = row.find("a");
      const id = row.find("td:nth-child(3)").text();
      const name = link.text();
      const publishTime = row.find("td:nth-child(6)").text();
      const endTime = row.find("td:nth-child(5)").text();
      const urls = DF_ORIGIN + link.attr("href");
      if (
        endTime &&
        +new Date(endTime) >= Date.now() &&
        keywordsInclude(name)
      ) {
        console.log("处理项目:", id, name);
        info.push({ id, name, publishTime, endTime, urls });
      }
    });
    return { pages, info };
  }

  /**
   * Downloads the HTML of one list page. Page 1 lives at the section's
   * `homeIndex` file; later pages are `<n>.html`.
   *
   * @param {number} pagenumber 1-based page index.
   * @param {{url: string, homeIndex: string}} options
   * @returns {Promise<[unknown, string|null]>} `[null, html]` on success,
   *   `[error, null]` on failure. Never rejects.
   */
  async getList(pagenumber, options) {
    const page = pagenumber === 1 ? options.homeIndex : `${pagenumber}.html`;
    try {
      const res = await axios({
        url: options.url + page,
        method: "get",
      });
      return [null, res.data];
    } catch (err) {
      return [err, null];
    }
  }
}
|
||||
|
||||
new DF();
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
// Process list: one entry per long-running script, each with its own restart
// policy, environment and log files. The field names look like a PM2
// ecosystem file — confirm against how this file is launched.
// NOTE(review): this file uses CommonJS `module.exports` while the spider
// modules use ES `import`; if package.json declares "type": "module" this
// file probably needs a `.cjs` extension — verify.
module.exports = {
|
||||
apps: [
|
||||
// Message-queue manager (started first)
|
||||
{
|
||||
name: "msg-manager",
|
||||
script: "msgManager.js",
|
||||
instances: 1,
|
||||
autorestart: true,
|
||||
watch: false,
|
||||
max_memory_restart: "200M",
|
||||
env: {
|
||||
NODE_ENV: "production",
|
||||
SERVICE_NAME: "msg-manager",
|
||||
},
|
||||
error_file: "./logs/msg-manager-error.log",
|
||||
out_file: "./logs/msg-manager-out.log",
|
||||
log_file: "./logs/msg-manager-combined.log",
|
||||
time: true,
|
||||
},
|
||||
// PICC spider process (script picc.js)
{
|
||||
name: "picc-spider",
|
||||
script: "picc.js",
|
||||
instances: 1,
|
||||
autorestart: true,
|
||||
watch: false,
|
||||
max_memory_restart: "300M",
|
||||
env: {
|
||||
NODE_ENV: "production",
|
||||
SPIDER_NAME: "picc",
|
||||
},
|
||||
error_file: "./logs/picc-error.log",
|
||||
out_file: "./logs/picc-out.log",
|
||||
log_file: "./logs/picc-combined.log",
|
||||
time: true,
|
||||
},
|
||||
],
|
||||
};
|
||||
|
|
@ -0,0 +1,237 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { timestampToDate, loopCall } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
// import cheerio from "cheerio";
|
||||
// import { messageQueue } from "./msgManager.js";
|
||||
|
||||
// Spider key under which announcements are stored and messages are queued.
const GEELY_SPIDER_NAME = "吉利";
// Page size sent to the list API; the page count is derived from the same value.
const GEELY_PAGE_SIZE = 20;

/**
 * Crawler for Geely tender notices (glzb.geely.com).
 *
 * Runs a bounded full crawl when the queue holds no announcements for this
 * spider, otherwise goes straight to incremental polling. For every notice
 * kept, the preview URLs of its attachments are resolved as well.
 *
 * Paging is driven by `loopCall` from ./utils.js, which is not visible here;
 * the callbacks assume it uses the value returned by `readyForNext` as the
 * next page number and calls `complete` with the last page — TODO confirm.
 */
class GEELY {
  constructor() {
    this.url = "https://glzb.geely.com/gpmp/notice/listnotice";
    // Notices collected during the full crawl.
    this.info = [];
    console.log("GEELY 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // start() catches its own errors, so this call cannot reject.
    this.start();
  }

  /** Entry point: runs init() and logs (rather than throws) any failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Chooses incremental or full crawl depending on whether data already exists. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(GEELY_SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walks pages from 1 until the API's page count or
   * config.pageNumberLimit is reached, persists the collected items, then
   * switches to incremental polling.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            // Nothing matched: skip the write and avoid queueing an empty message.
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(GEELY_SPIDER_NAME, this.info);
              this.queue.addMessage(GEELY_SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited so the promise is not left floating; increment() logs its own errors.
    await this.increment();
  }

  /**
   * Incremental polling: fetches page 1 at config.incrementFetchTime
   * intervals, stores and queues only unseen notices, and keeps paging
   * forward only while an entire page is new.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              GEELY_SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(GEELY_SPIDER_NAME, newInfo);
            this.queue.addMessage(GEELY_SPIDER_NAME, newInfo);
            // A fully new page means older pages may hold unseen items too;
            // any overlap means we have caught up, so restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number, even after a DB failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetches one list page and keeps notices that end today or later and were
   * published within the last 30 days, resolving attachment preview URLs for
   * each. Notices whose detail lookup fails are logged and dropped.
   *
   * @param {number} [pagenumber=1] 1-based page index.
   * @returns {Promise<{pages: number, info: Array<object>}>} Page count derived
   *   from the API total, and the kept items. On request failure returns
   *   `{ pages: 0, info: [] }`.
   */
  async getInfo(pagenumber = 1) {
    // Local midnight as a millisecond timestamp; endtime/publishtime are
    // compared against it numerically.
    const today = new Date().setHours(0, 0, 0, 0);
    const oneMonthAgo = today - 30 * 24 * 60 * 60 * 1000;

    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [error, payload] = await this.getList(pagenumber);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    const { total, items } = payload.data ?? {};
    const pages = Math.ceil(total / GEELY_PAGE_SIZE);
    // The API may send null instead of an empty list; treat it as "no rows".
    const rows = Array.isArray(items) ? items : [];

    const info = [];
    for (const item of rows) {
      if (!(item.endtime >= today && item.publishtime >= oneMonthAgo)) {
        continue;
      }
      console.log("处理项目:", item.pjtnoticeid, item.pjtnoticename);
      // Resolved one notice at a time, in list order.
      const [detailError, urls] = await this.getNoticeUrl(item.pjtnoticeid);
      if (detailError) {
        console.error("获取公告详情失败:", detailError);
        continue;
      }
      info.push({
        id: item.pjtnoticeid,
        name: item.pjtnoticename,
        publishTime: timestampToDate(item.publishtime),
        endTime: timestampToDate(item.endtime),
        urls,
      });
    }
    return { pages, info };
  }

  /**
   * Requests one page of published notices.
   *
   * @param {number} pagenumber 1-based page index.
   * @returns {Promise<[unknown, object|null]>} `[null, body]` when the API
   *   answers with `code === "success"`, `[error, null]` otherwise.
   *   Never rejects.
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: this.url,
        params: {
          pagesize: GEELY_PAGE_SIZE,
          pagenumber: pagenumber,
          publishstatus: 2,
          bidcategoryid: 1442,
          iflongpro: 0,
          _: Date.now(),
        },
        method: "get",
      });
      const result = res.data;
      if (result.code === "success") {
        return [null, result];
      }
      // Keep the API's own code so the log explains the rejection.
      return [
        new Error(`Geely list API rejected the request (code=${result.code})`),
        null,
      ];
    } catch (err) {
      return [err, null];
    }
  }

  /**
   * Loads a notice's detail and resolves a preview URL for each attachment.
   * Attachments whose preview request fails or is rejected are skipped.
   *
   * @param {string|number} id The notice's `pjtnoticeid`.
   * @returns {Promise<[unknown, Array<string>|null]>} `[null, urls]` on
   *   success, `[error, null]` when the detail request fails. Never rejects.
   */
  async getNoticeUrl(id) {
    try {
      const res = await axios({
        url: "https://glzb.geely.com/gpmp/notice/query",
        method: "get",
        // Sent as params so axios handles the encoding of the id.
        params: { _: Date.now(), pjtnoticeid: id },
      });
      const result = res.data;
      if (result.code !== "success") {
        return [
          new Error(
            `Geely notice API rejected the request (code=${result.code})`
          ),
          null,
        ];
      }

      const previews = [];
      for (const item of result.data.attachs) {
        previews.push(
          axios({
            url: `https://glzb.geely.com/pub/file/info/preview`,
            method: "get",
            params: {
              name: item.attachname,
              downloadUrl: item.downloadUrl,
              previewUrl: item.previewUrl,
              attachname: item.attachname,
              _: Date.now(),
            },
          })
        );
      }

      // allSettled: one broken attachment must not discard the others.
      const settled = await Promise.allSettled(previews);
      const urls = [];
      for (const outcome of settled) {
        if (
          outcome.status === "fulfilled" &&
          outcome.value.data?.code === "success"
        ) {
          urls.push(outcome.value.data.data);
        }
      }
      return [null, urls];
    } catch (err) {
      console.error("err:", err);
      return [err, null];
    }
  }
}
|
||||
|
||||
new GEELY();
|
||||
|
|
@ -0,0 +1,234 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
|
||||
/**
 * Crawler for two listings on srm.gwm.cn ("长城公开寻源" and "长城招募公示大厅").
 *
 * For each listing it either runs a full crawl (nothing stored yet) or goes
 * straight to incremental polling. Matching announcements are stored through
 * `SQLiteMessageQueue` and queued as notification messages.
 *
 * Constructing an instance starts crawling immediately.
 */
class GreatWall {
  constructor() {
    // One entry per listing. `options` is handed to getInfo/getList through
    // loopCall's `additional` field; `info` accumulates full-crawl results.
    this.jsonMap = [
      {
        name: "长城公开寻源",
        info: [],
        options: {
          name: "长城公开寻源",
          url: "https://srm.gwm.cn/cloud-srm/api-sou/sou-firstPage/souReqlistPage",
        },
      },
      {
        name: "长城招募公示大厅",
        info: [],
        options: {
          name: "长城招募公示大厅",
          url: "https://srm.gwm.cn/cloud-srm/api-sou/api-ql/Recruit/visitList",
          // Request-body template; getList copies it and sets the page number.
          data: {
            type: "Recruit",
            lang: "zh-cn",
            query: { "*": {} },
            payload: {
              filter: {},
              page: { sort: "lastUpdateDate desc", pageNum: 1, pageSize: 8 },
            },
            action: "visitList",
            tree: true,
          },
        },
      },
    ];
    console.log("长城 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /** Run init() and log (rather than propagate) any startup error. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Pick the crawl mode per listing: incremental when announcements are
   * already stored for it, full otherwise. The loops are started without
   * being awaited so the listings run side by side.
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Full crawl of one listing, followed by incremental polling.
   *
   * @param {{name: string, info: Array, options: Object}} props - jsonMap entry.
   * @returns {Promise<void>} Settles when loopCall settles; never rejects.
   */
  async loopFetchFull(props) {
    try {
      // Awaited so that a rejection from loopCall (it is awaited elsewhere in
      // this project, so it may be asynchronous) lands in the catch below.
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          // Deliberately not returned/awaited: the polling loop handles its
          // own errors and must not block the caller of `complete`.
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling of one listing: store and queue only announcements
   * the database does not know yet.
   *
   * @param {{name: string, options: Object}} props - jsonMap entry.
   * @returns {Promise<void>} Settles when loopCall settles; never rejects.
   */
  async loopFetchIncrement(props) {
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // A page made only of new items means older pages may hold more;
            // otherwise known data was reached, so restart from page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetch one page and keep the rows whose title passes keywordsInclude().
   *
   * @param {number} [pagenumber=1] - Page to fetch.
   * @param {{name: string, url: string, data?: Object}} config - Listing
   *   options; the presence of `data` selects the "招募公示大厅" response shape.
   * @returns {Promise<{pages: number, info: Array<Object>}>} `pages` is 0 and
   *   `info` empty when the page could not be fetched.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, body] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    // The two endpoints differ only in field names and detail-page URL.
    const isRecruit = Boolean(config.data);
    const rows = (isRecruit ? body.data.records : body.data.list) ?? [];
    const pages = isRecruit ? body.data.pageCount : body.data.pages;

    const info = [];
    for (const row of rows) {
      const entry = isRecruit
        ? {
            id: row.recruitId,
            name: row.title,
            publishTime: row.publishTime,
            endTime: row.deadlineTime,
            urls: `https://srm.gwm.cn/#/portalBidding/vendorBiddingDetail?id=${row.recruitId}`,
          }
        : {
            id: row.reqHeadId,
            name: row.projectName,
            publishTime: row.releaseDate,
            endTime: row.publicEndTime,
            urls: `https://srm.gwm.cn/#/portal?id=${row.reqHeadId}`,
          };
      if (keywordsInclude(entry.name)) {
        info.push(entry);
      }
    }
    return { pages, info };
  }

  /**
   * POST one page request. Never rejects.
   *
   * @param {number} pagenumber - Page to request.
   * @param {{url: string, data?: Object}} config - Listing options.
   * @returns {Promise<Array>} `[null, body]` when the body's `code` is 0/"0",
   *   `["err", null]` for any other code, `[error, null]` on a thrown error.
   */
  getList(pagenumber, config) {
    // Copy the template instead of writing the page number into the shared
    // options object.
    const data = config.data
      ? {
          ...config.data,
          payload: {
            ...config.data.payload,
            page: { ...config.data.payload.page, pageNum: pagenumber },
          },
        }
      : { pageNum: pagenumber, pageSize: 8 };

    return axios({ url: config.url, data, method: "post" })
      .then((res) => {
        const result = res.data;
        // String() accepts both the numeric and the string form of the code.
        return String(result.code) === "0" ? [null, result] : ["err", null];
      })
      .catch((err) => [err, null]);
  }
}
|
||||
|
||||
new GreatWall();
|
||||
|
|
@ -0,0 +1,385 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import JSON5 from "json5";
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
|
||||
/**
 * Crawler for the JAC (江淮) tender site ahjhqc.youzhicai.com.
 *
 * Keeps its own cookie jar and CSRF token: a request interceptor sends the
 * stored cookies, a response interceptor records `Set-Cookie` headers, and
 * getList() refreshes cookie + token once when a list request fails.
 *
 * Constructing an instance starts crawling immediately.
 */
class JiangHuai {
  /**
   * @param {Array<Object>} jsonMap - Listings to crawl. Each entry looks like
   *   `{ name, info: [], options: { name, url, data } }`, where `data` is the
   *   form payload (`pageIndex`, `type`, `ntype`, `tenderType`, ...).
   */
  constructor(jsonMap) {
    // State first, so the interceptors below never see it undefined.
    this.cookiePair = new Map();
    this.csrfToken = "";
    this.jsonMap = jsonMap;

    this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 });
    this.axiosInstance.interceptors.request.use((requestConfig) => {
      // Send every cookie collected so far.
      requestConfig.headers.Cookie = Array.from(this.cookiePair.entries())
        .map(([name, value]) => `${name}=${value}`)
        .join("; ");
      return requestConfig;
    });
    this.axiosInstance.interceptors.response.use(
      (response) => {
        // Remember cookies the server sets (the header may be absent).
        this.extractCookie(response.headers["set-cookie"]);
        return response;
      },
      (error) => Promise.reject(error)
    );

    console.log("江淮 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /** Run init() and log (rather than propagate) any startup error. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /**
   * Pick the crawl mode per listing: incremental when announcements are
   * already stored for it, full otherwise. Loops are started, not awaited.
   */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Load the notice-list page to obtain session cookies (through the response
   * interceptor) and the CSRF token from its `<meta name="csrf-token">` tag,
   * then request `/?cache=1` with that token.
   *
   * @throws Re-throws any request or parsing error after logging it.
   */
  async initializeCookie() {
    try {
      const requestConfig = {
        headers: {
          "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
          Accept:
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
          "Accept-Language": "zh-CN,zh;q=0.9",
          "Cache-Control": "no-cache",
          Pragma: "no-cache",
          "Sec-Fetch-Dest": "document",
          "Sec-Fetch-Mode": "navigate",
          "Sec-Fetch-Site": "none",
          "Upgrade-Insecure-Requests": "1",
        },
      };
      const homeResponse = await this.axiosInstance.get(
        "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1",
        requestConfig
      );
      const tokenMatch = homeResponse.data.match(
        /<meta name="csrf-token" content="([^"]+)"/
      );
      // When the tag is missing the previous token is kept.
      if (tokenMatch) {
        this.csrfToken = tokenMatch[1];
      }
      console.log(this.csrfToken);
      requestConfig.headers["X-Csrf-Token"] = this.csrfToken;
      // The body is unused; presumably the request only matters for the
      // cookies it sets - TODO confirm.
      await this.axiosInstance.get(
        "https://ahjhqc.youzhicai.com/?cache=1",
        requestConfig
      );
    } catch (err) {
      console.log("err", err);
      throw err;
    }
  }

  /**
   * Store the `name=value` pair of each `Set-Cookie` header line.
   *
   * @param {string[]|undefined} cookieArr - Value of the `set-cookie` response
   *   header; anything that is not an array is ignored.
   */
  extractCookie(cookieArr) {
    // Responses that set no cookie have no such header.
    if (!Array.isArray(cookieArr)) {
      return;
    }
    for (const cookie of cookieArr) {
      const pair = cookie.split(";")[0];
      // Split on the first "=" only: values may contain "=" themselves.
      const separator = pair.indexOf("=");
      if (separator <= 0) {
        continue;
      }
      this.cookiePair.set(pair.slice(0, separator), pair.slice(separator + 1));
    }
  }

  /**
   * Full crawl of one listing, followed by incremental polling.
   *
   * @param {{name: string, info: Array, options: Object}} props - jsonMap entry.
   * @returns {Promise<void>} Settles when loopCall settles; never rejects.
   */
  async loopFetchFull(props) {
    console.log("开始全量爬取");
    try {
      // Awaited so that a rejection from loopCall lands in the catch below.
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          // Not returned/awaited: the polling loop handles its own errors.
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental polling of one listing: store and queue only announcements
   * the database does not know yet.
   *
   * @param {{name: string, options: Object}} props - jsonMap entry.
   * @returns {Promise<void>} Settles when loopCall settles; never rejects.
   */
  async loopFetchIncrement(props) {
    console.log("开始增量爬取");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);
            // All new -> look at the next page; otherwise restart at page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Fetch one page and keep notices that pass keywordsInclude() and whose
   * `endTime` is set and not in the past.
   *
   * @param {number} [pagenumber=1] - Page to fetch.
   * @param {{name: string, url: string, data: Object}} config - Listing options.
   * @returns {Promise<{pages: number, info: Array<Object>}>} `pages` is 0 and
   *   `info` empty when the page could not be fetched.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [error, body] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败: ", error);
      return { pages: 0, info: [] };
    }

    // Page count is derived from the total assuming 10 rows per page.
    const rowsPerPage = 10;
    const pages = Math.ceil(body.total / rowsPerPage);

    const info = [];
    for (const item of body.list) {
      const stillOpen =
        item.endTime && +new Date(item.endTime) >= Date.now();
      if (!keywordsInclude(item.noticeTitle) || !stillOpen) {
        continue;
      }
      console.log("处理项目:", item.noticeTitle);
      info.push({
        id: item.bulletinSID,
        name: item.noticeTitle,
        publishTime: new Date(item.startTime).toLocaleDateString(),
        endTime: new Date(item.endTime).toLocaleDateString(),
        urls: `https://ahjhqc.youzhicai.com/${item.Url}`,
      });
    }
    return { pages, info };
  }

  /**
   * POST one list request; on any failure refresh cookie + CSRF token and
   * retry exactly once. Never rejects.
   *
   * @param {number} pagenumber - Page to request (sent as `pageIndex`).
   * @param {{url: string, data: Object}} config - Listing options.
   * @returns {Promise<Array>} `[null, body]` when the parsed body has a
   *   non-empty `list`, `["err", null]` when it does not, `[error, null]`
   *   when the retry fails as well.
   */
  async getList(pagenumber, config) {
    // Copy the payload rather than writing the page into the shared options.
    const data = { ...config.data, pageIndex: pagenumber };
    const headers = {
      Accept: "text/plain, */*; q=0.01",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "no-cache",
      "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      Origin: "https://ahjhqc.youzhicai.com",
      Pragma: "no-cache",
      Priority: "u=1, i",
      Referer:
        "https://ahjhqc.youzhicai.com/homeindex/noticeListNew.html?type=1",
      "Sec-Ch-Ua":
        '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "empty",
      "Sec-Fetch-Mode": "cors",
      "Sec-Fetch-Site": "same-origin",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest",
      "X-Csrf-Token": this.csrfToken,
    };

    try {
      return await this.requestList(config.url, data, headers);
    } catch (err) {
      // Any failure (HTTP error or unparsable body) is treated as a stale
      // session: refresh it and try once more.
      console.log("cookie不对");
      try {
        await this.initializeCookie();
        headers["X-Csrf-Token"] = this.csrfToken;
        return await this.requestList(config.url, data, headers);
      } catch (retryErr) {
        return [retryErr, null];
      }
    }
  }

  /**
   * Single list request + parse, shared by both attempts of getList().
   *
   * @returns {Promise<Array>} `[null, body]` or `["err", null]`.
   * @throws When the request fails or the body cannot be parsed by JSON5.
   */
  async requestList(url, data, headers) {
    const response = await this.axiosInstance({
      url,
      data,
      method: "post",
      headers,
    });
    const result = JSON5.parse(response.data);
    return result.list && result.list.length > 0
      ? [null, result]
      : ["err", null];
  }
}
|
||||
|
||||
new JiangHuai([
|
||||
{
|
||||
name: "江淮【招标公告】",
|
||||
info: [],
|
||||
options: {
|
||||
name: "江淮【招标公告】",
|
||||
url: "https://ahjhqc.youzhicai.com/domain/data-list-new",
|
||||
data: {
|
||||
pageIndex: 1,
|
||||
type: 1,
|
||||
companyId: "",
|
||||
title: "",
|
||||
ntype: 1,
|
||||
start_time: "",
|
||||
end_time: "",
|
||||
child: "",
|
||||
tenderType: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
]);
|
||||
new JiangHuai([
|
||||
{
|
||||
name: "江淮【变更/澄清公告】",
|
||||
info: [],
|
||||
options: {
|
||||
name: "江淮【变更/澄清公告】",
|
||||
url: "https://ahjhqc.youzhicai.com/domain/data-list-new",
|
||||
data: {
|
||||
pageIndex: 1,
|
||||
type: 1,
|
||||
companyId: "",
|
||||
title: "",
|
||||
ntype: "4,6",
|
||||
start_time: "",
|
||||
end_time: "",
|
||||
child: "",
|
||||
tenderType: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
]);
|
||||
|
|
@ -0,0 +1,193 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
// import cheerio from "cheerio";
|
||||
|
||||
/**
 * Crawler for the Leapmotor (零跑) SRM sourcing list.
 *
 * Runs a full crawl when nothing is stored yet for this spider, then polls
 * incrementally. Matching announcements are stored through
 * `SQLiteMessageQueue` and queued as notification messages.
 *
 * Constructing an instance starts crawling immediately.
 */
class LeapMotor {
  constructor() {
    this.url =
      "https://lpsrm.leapmotor.com/cloud-srm/api-inq/inq-anon/reqhead/listPage";
    this.info = [];
    console.log("零跑 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    this.start();
  }

  /** Key under which this crawler's rows and messages are stored. */
  get spiderName() {
    return "零跑";
  }

  /** Run init() and log (rather than propagate) any startup error. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Incremental mode when announcements are already stored, full otherwise. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(this.spiderName);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walk the pages until the last one (or `config.pageNumberLimit`),
   * persist everything that matched, then switch to incremental polling.
   * Errors are logged, never thrown.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            // Nothing matched: do not queue an empty notification.
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(this.spiderName, this.info);
              this.queue.addMessage(this.spiderName, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    await this.increment();
  }

  /**
   * Incremental polling: store and queue only announcements the database
   * does not know yet. Errors are logged, never thrown.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              this.spiderName,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }
            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(this.spiderName, newInfo);
            this.queue.addMessage(this.spiderName, newInfo);
            // All new -> look at the next page; otherwise restart at page 1.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a failure.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetch one page; for every row whose title passes keywordsInclude(), look
   * up its notice link. Rows whose link lookup fails are logged and skipped.
   *
   * @param {number} [pagenumber=1] - Page to fetch.
   * @returns {Promise<{pages: number, info: Array<Object>}>} `pages` is 0 and
   *   `info` empty when the page could not be fetched.
   */
  async getInfo(pagenumber = 1) {
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [error, body] = await this.getList(pagenumber);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages: 0, info: [] };
    }

    const pages = body.data.pages;
    const info = [];
    for (const item of body.data.list) {
      // `souReqTitile` (sic) is the field name used in the API response.
      if (!keywordsInclude(item.souReqTitile)) {
        continue;
      }
      console.log("处理项目:", item.reqHeadId, item.souReqTitile);
      // Detail lookups run one at a time, in list order.
      const [noticeError, noticeLink] = await this.getNoticeUrl(item.reqHeadId);
      if (noticeError) {
        console.error("获取公告链接失败:", noticeError);
        continue;
      }
      info.push({
        id: item.reqHeadId,
        name: item.souReqTitile,
        publishTime: item.publishTime,
        endTime: item.expirationTime,
        urls: noticeLink,
      });
    }
    return { pages, info };
  }

  /**
   * POST one page request. Never rejects.
   *
   * @param {number} pagenumber - Page to request.
   * @returns {Promise<Array>} `[null, body]` when the body's `code` is "0",
   *   `["err", null]` for any other code, `[error, null]` on a thrown error.
   */
  getList(pagenumber) {
    return axios({
      url: this.url,
      data: { pageNum: pagenumber, pageSize: 8 },
      method: "post",
    })
      .then((res) => {
        const result = res.data;
        return result.code === "0" ? [null, result] : ["err", null];
      })
      .catch((err) => [err, null]);
  }

  /**
   * Fetch the detail of one sourcing request and return its `extNoticeLink`.
   * Never rejects.
   *
   * @param {string|number} id - `reqHeadId` of the request.
   * @returns {Promise<Array>} `[null, link]`, `["err", null]` or `[error, null]`.
   */
  getNoticeUrl(id) {
    return axios({
      url: `https://lpsrm.leapmotor.com/cloud-srm/api-inq/inq-anon/pj/reqhead/get?id=${id}`,
      method: "get",
    })
      .then((res) => {
        const result = res.data;
        return result.code === "0"
          ? [null, result.data.extNoticeLink]
          : ["err", null];
      })
      .catch((err) => {
        console.log("err:", err);
        return [err, null];
      });
  }
}
|
||||
|
||||
new LeapMotor();
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
import nodemailer from "nodemailer";
|
||||
import path from "path";
|
||||
|
||||
/**
 * Thin wrapper around a nodemailer transport.
 */
class EmailSender {
  /**
   * @param {Object} config - Options passed unchanged to
   *   `nodemailer.createTransport`. `config.auth.user`, when present, becomes
   *   the default sender address.
   */
  constructor(config) {
    this.transporter = nodemailer.createTransport(config);
    // `auth` is optional, so do not crash when it is missing.
    this.defaultFrom = config.auth?.user;
  }

  /**
   * Send one message.
   *
   * @param {Object} options - `to`, `subject` and any of `from`, `cc`, `bcc`,
   *   `text`, `html`, `attachments`. `from` defaults to the configured user.
   * @returns {Promise<{success: true, messageId: string}>}
   * @throws Re-throws the transport error after logging it.
   */
  async sendEmail(options) {
    try {
      const mailOptions = {
        from: options.from || this.defaultFrom,
        to: options.to,
        cc: options.cc,
        bcc: options.bcc,
        subject: options.subject,
        text: options.text,
        html: options.html,
        attachments: options.attachments || [],
      };
      const info = await this.transporter.sendMail(mailOptions);
      console.log(`邮件发送成功: ${options.to} - ${info.messageId}`);
      return { success: true, messageId: info.messageId };
    } catch (error) {
      console.error(`邮件发送失败: ${options.to} -`, error.message);
      throw error;
    }
  }

  /**
   * Send an HTML message without attachments.
   *
   * @param {string} to - Recipient address.
   * @param {string} subject - Subject line.
   * @param {string} content - HTML body.
   */
  async sendBasicEmail(to, subject, content) {
    return await this.sendEmail({ to, subject, html: content });
  }

  /**
   * Send an HTML message with at most one file attached.
   *
   * @param {string} to - Recipient address.
   * @param {string} subject - Subject line.
   * @param {string} content - HTML body.
   * @param {string} [attachmentPath] - File to attach; its base name is used
   *   as the attachment file name. Omit to send without attachment.
   */
  async sendEmailWithAttachments(to, subject, content, attachmentPath) {
    const attachments = attachmentPath
      ? [{ filename: path.basename(attachmentPath), path: attachmentPath }]
      : [];
    return await this.sendEmail({ to, subject, html: content, attachments });
  }

  /**
   * Send the same HTML message to several recipients, one after another.
   * A failure for one recipient is recorded and does not stop the others.
   *
   * @param {Iterable<string>} recipients - Recipient addresses.
   * @param {string} subject - Subject line.
   * @param {string} content - HTML body.
   * @param {number} [delayMs=1000] - Pause between two consecutive sends.
   * @returns {Promise<Array<Object>>} One entry per recipient:
   *   `{recipient, success: true, result}` or `{recipient, success: false, error}`.
   */
  async sendBulkEmail(recipients, subject, content, delayMs = 1000) {
    const results = [];
    let isFirst = true;
    for (const recipient of recipients) {
      // Pause only between sends, not after the last one.
      if (!isFirst) {
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
      isFirst = false;
      try {
        const result = await this.sendEmail({
          to: recipient,
          subject,
          html: content,
        });
        results.push({ recipient, success: true, result });
      } catch (error) {
        results.push({ recipient, success: false, error: error.message });
      }
    }
    return results;
  }

  /**
   * Verify the transport configuration.
   *
   * @returns {Promise<boolean>} `true` when `transporter.verify()` succeeds,
   *   `false` (after logging) when it throws.
   */
  async testConnection() {
    try {
      await this.transporter.verify();
      console.log("邮件服务器连接成功");
      return true;
    } catch (error) {
      console.error("邮件服务器连接失败:", error);
      return false;
    }
  }
}
|
||||
|
||||
// async function example() {
|
||||
// let emailSender = new EmailSender({
|
||||
// host: "smtp.exmail.qq.com",
|
||||
// port: 465,
|
||||
// secure: true,
|
||||
// auth: {
|
||||
// user: "jiqiren@axbbaoxian.com",
|
||||
// pass: "Am13579q",
|
||||
// },
|
||||
// });
|
||||
// const isConnected = await emailSender.testConnection();
|
||||
// if (!isConnected) {
|
||||
// console.log("邮件服务器连接失败");
|
||||
// return;
|
||||
// }
|
||||
// emailSender.sendBasicEmail(
|
||||
// "cpw@axbbaoxian.com",
|
||||
// "测试邮件",
|
||||
// "这是测试邮件内容"
|
||||
// );
|
||||
// }
|
||||
|
||||
// example().catch((err) => {
|
||||
// console.error("程序错误:", err);
|
||||
// });
|
||||
export { EmailSender };
|
||||
|
|
@ -0,0 +1,212 @@
|
|||
// msgQueue.js - 基于事件的消息队列
|
||||
import { EventEmitter } from "events";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { EmailSender } from "./mailer.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
import { md5 } from "./utils.js";
|
||||
import axios from "axios";
|
||||
|
||||
/**
 * Hourly notifier: reads pending messages from `SQLiteMessageQueue`, merges
 * them per spider into HTML tables and mails the result to `this.recipients`.
 *
 * Constructing an instance starts the hourly timer.
 */
class MessageQueue extends EventEmitter {
  constructor() {
    super();
    this.queue = new SQLiteMessageQueue();
    this.processing = false;
    this.emailSender = new EmailSender({
      host: "smtp.exmail.qq.com",
      port: 465,
      secure: true,
      auth: {
        // FIXME(security): credentials must not live in source control. The
        // environment variables take precedence; the literals are kept only
        // so existing deployments keep working - rotate and remove them.
        user: process.env.SMTP_USER ?? "jiqiren@axbbaoxian.com",
        pass: process.env.SMTP_PASS ?? "Am13579q",
      },
    });
    this.recipients = ["huzhengrong@axbbaoxian.com"];

    this.startProcessor();
  }

  /**
   * Start the timer that processes pending messages once per hour. A tick is
   * skipped while a previous batch is still being processed.
   */
  async startProcessor() {
    setInterval(async () => {
      console.log("开始处理队列");
      try {
        const pendingMessages = this.queue.getPendingMessages();
        if (!this.processing && pendingMessages.length > 0) {
          await this.processQueue(pendingMessages);
        }
      } catch (error) {
        console.error(`❌ 获取待处理消息失败:`, error);
      }
    }, 60 * 60 * 1000);
  }

  /**
   * Mark every pending message as sent (or failed), then mail one HTML table
   * per spider to all recipients.
   *
   * NOTE(review): statuses are written before the e-mail goes out, so a
   * delivery failure does not put messages back to pending.
   *
   * @param {Array<Object>} pendingMessages - Rows with `id`, `spider_name`
   *   and `data` (array of announcement objects).
   */
  async processQueue(pendingMessages) {
    this.processing = true;
    try {
      const msgMap = {};
      for (const message of pendingMessages) {
        try {
          console.log(`📧 处理消息: ${message.spider_name}`);
          if (!Array.isArray(message.data)) {
            throw new TypeError("message.data is not an array");
          }
          // Merge into a fresh array so no message's own data is modified.
          msgMap[message.spider_name] = [
            ...(msgMap[message.spider_name] ?? []),
            ...message.data,
          ];

          message.status = "sent";
          message.sent_at = new Date().toISOString();
          this.queue.updateMessageStatus(
            message.id,
            message.status,
            message.sent_at
          );
        } catch (error) {
          console.error(`❌ 消息处理失败: ${message.id}`, error);
          message.status = "failed";
          message.error_message = error.message;
          this.queue.updateMessageStatus(
            message.id,
            message.status,
            null,
            message.error_message
          );
        }
      }

      let html = "";
      for (const spiderName in msgMap) {
        html += this.generateTable(spiderName, msgMap[spiderName]);
      }
      // Every message failed: there is nothing to report.
      if (!html) {
        return;
      }

      try {
        // Awaited so that `processing` stays set until delivery is over and
        // a rejection is actually caught here.
        const results = await this.emailSender.sendBulkEmail(
          this.recipients,
          "招标项目最新公告",
          html
        );
        for (const entry of results ?? []) {
          if (entry && entry.success === false) {
            console.error(`❌ 通知发送失败: ${entry.recipient} - ${entry.error}`);
          }
        }
      } catch (error) {
        console.error(`❌ 通知发送失败: ${error}`);
      }
    } finally {
      // Never leave the flag stuck, or all later ticks would be skipped.
      this.processing = false;
    }
  }

  /**
   * Escape a value for use in HTML text or a double-quoted attribute.
   *
   * @param {*} value - Converted with String() first.
   * @returns {string}
   */
  escapeHtml(value) {
    return String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");
  }

  /**
   * Render the announcements of one spider as an HTML section with a table.
   * All announcement fields come from crawled sites and are HTML-escaped.
   *
   * @param {string} spiderName - Section title.
   * @param {Array<Object>} data - Items with `name`, `publishTime`, `endTime`, `urls`.
   * @returns {string} HTML fragment.
   */
  generateTable(spiderName, data) {
    const headerCell = (label, size) =>
      `<th style="border: 1px solid #ddd; padding: 12px 8px; text-align: left; font-weight: bold; ${size}">${label}</th>`;

    let tableHtml = `
<div style="margin: 30px 0; font-family: Arial, sans-serif;">
<h2 style="color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; margin-bottom: 20px;">
🕷️ ${this.escapeHtml(spiderName)} (${data.length} 条新增)
</h2>

<div style="overflow-x: auto; box-shadow: 0 2px 8px rgba(0,0,0,0.1); border-radius: 8px; margin-bottom: 20px;">
<table style="width: 100%; border-collapse: collapse; background: white; min-width: 800px;">
<thead>
<tr style="background: linear-gradient(135deg, #3498db 0%, #2980b9 100%); color: white;">
${headerCell("序号", "width: 50px;")}
${headerCell("项目名称", "min-width: 300px;")}
${headerCell("发布时间", "width: 140px;")}
${headerCell("截止时间", "width: 140px;")}
${headerCell("查看详情", "width: 100px;")}
</tr>
</thead>
<tbody>
`;

    data.forEach((item, index) => {
      // Zebra striping.
      const rowColor = index % 2 === 0 ? "#f8f9fa" : "white";
      const urls = this.formatUrls(item.urls);

      tableHtml += `
<tr style="background-color: ${rowColor}; border-bottom: 1px solid #eee;">
<td style="border: 1px solid #ddd; padding: 10px 8px; text-align: center; font-weight: bold; color: #666;">
${index + 1}
</td>
<td style="border: 1px solid #ddd; padding: 10px 8px; line-height: 1.4;">
<div style="font-weight: 500; color: #2c3e50; margin-bottom: 4px;">
${this.escapeHtml(item.name)}
</div>

</td>
<td style="border: 1px solid #ddd; padding: 10px 8px; color: #495057;">
${this.escapeHtml(item.publishTime)}
</td>
<td style="border: 1px solid #ddd; padding: 10px 8px; color: #495057;">
<div>${this.escapeHtml(item.endTime)}</div>
</td>
<td style="border: 1px solid #ddd; padding: 10px 8px; text-align: center;">
${urls}
</td>
</tr>
`;
    });

    tableHtml += `
</tbody>
</table>
</div>
</div>
`;

    return tableHtml;
  }

  /**
   * Build the request signature for "/common/message/push": MD5 of
   * uri + timestamp + secret, three fixed slices of the digest concatenated
   * and upper-cased.
   *
   * @param {string|number} timestamp - Timestamp included in the signed string.
   * @returns {string} Upper-case signature.
   */
  getSign(timestamp) {
    const secret = "cpwyyds";
    const uri = "/common/message/push";
    const digest = md5(uri + timestamp + secret);
    const sign =
      digest.substring(5, 13) +
      digest.substring(29, 31) +
      digest.substring(18, 27);
    return sign.toUpperCase();
  }

  /**
   * Render the "details" cell: one link, several numbered links, or a
   * placeholder. URLs are escaped before being placed in `href`.
   *
   * @param {string|string[]|null|undefined} urls - Link(s) of an announcement.
   * @returns {string} HTML fragment.
   */
  formatUrls(urls) {
    const noLink = '<span style="color: #6c757d;">无链接</span>';
    const singleLink = (url) =>
      `<a href="${this.escapeHtml(url)}" target="_blank" style="color: #007bff; text-decoration: none; padding: 6px 12px; background-color: #e3f2fd; border-radius: 4px; font-size: 12px; border: 1px solid #90caf9; display: inline-block;">📄 查看</a>`;

    if (!urls) {
      return noLink;
    }

    if (Array.isArray(urls)) {
      if (urls.length === 0) {
        return noLink;
      }
      if (urls.length === 1) {
        return singleLink(urls[0]);
      }
      const links = urls
        .map(
          (url, index) =>
            `<a href="${this.escapeHtml(url)}" target="_blank" style="color: #007bff; text-decoration: none; padding: 4px 8px; background-color: #e3f2fd; border-radius: 3px; font-size: 11px; margin: 2px; display: inline-block; border: 1px solid #90caf9;">📄 链接${
              index + 1
            }</a>`
        )
        .join("");
      return `<div style="line-height: 1.6;">${links}</div>`;
    }

    if (typeof urls === "string") {
      return singleLink(urls);
    }

    return '<span style="color: #6c757d;">链接格式错误</span>';
  }
}
|
||||
|
||||
// Single MessageQueue instance created when this module is first loaded and
// exposed to importers as the named export `messageQueue`.
export const messageQueue = new MessageQueue();
|
||||
|
||||
// export default MessageQueue;
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import {
|
||||
timestampToDate,
|
||||
loopCall,
|
||||
keywordsInclude,
|
||||
getYiqiNoticeUrl,
|
||||
parseToGgDetailsParams,
|
||||
} from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import * as cheerio from "cheerio";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
|
||||
/**
 * Spider for NIO ("蔚来") tender notices published at
 * https://www.nio.cn/partnership/tender-notices.
 *
 * Constructing an instance opens the SQLite-backed queue and starts crawling:
 * a full fetch when the database holds no NIO announcements yet, otherwise the
 * incremental loop. Notices that are still open and match `keywordsInclude`
 * are saved through the queue and enqueued as messages.
 *
 * Paging is driven by `loopCall` from ./utils.js; its exact scheduling is not
 * visible here — presumably it calls `getInfo` every `time` ms and uses the
 * callbacks below to pick the next page (TODO confirm against utils.js).
 */
class NIO {
  /** Name under which this spider's rows are stored and queued. */
  static SPIDER_NAME = "蔚来";

  /** Give up on a page request after this many milliseconds. */
  static REQUEST_TIMEOUT_MS = 30000;

  constructor() {
    this.info = [];
    console.log("蔚来 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose: start() catches and logs its own failures.
    this.start();
  }

  /** Run init() and log (rather than propagate) any startup failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Choose incremental mode when NIO rows already exist, full fetch otherwise. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(NIO.SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: collect every page into `this.info`, persist and enqueue the
   * result once, then switch to incremental mode.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(NIO.SPIDER_NAME, this.info);
              this.queue.addMessage(NIO.SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited so a failure surfaces in start() instead of as an unhandled rejection.
    await this.increment();
  }

  /**
   * Incremental crawl: on every round keep only announcements that are not in
   * the database yet, then save and enqueue them.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              NIO.SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(NIO.SPIDER_NAME, newInfo);
            this.queue.addMessage(NIO.SPIDER_NAME, newInfo);

            // A page made up entirely of new rows may be followed by more new
            // rows; a partially known page means we caught up, so restart.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a DB error.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetch the notice page and extract the announcements worth reporting.
   *
   * A notice is kept when it has a due time that is not in the past and its
   * title passes `keywordsInclude`. Fetch or parse problems are logged and
   * reported as `{ pages: 0, info: [] }` instead of being thrown, so one bad
   * response cannot end the polling loop.
   *
   * @param {number} [pagenumber=1] - Page requested by the loop (used for logging).
   * @returns {Promise<{pages: number, info: Array<{id: *, name: string, publishTime: *, endTime: *, urls: string}>}>}
   */
  async getInfo(pagenumber = 1) {
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [fetchError, html] = await this.getHtml(pagenumber);
    if (fetchError) {
      console.error("获取页面数据失败:", fetchError);
      return { pages: 0, info: [] };
    }

    let notices;
    try {
      // The list is read from the JSON embedded in the #__NEXT_DATA__ element.
      const $ = cheerio.load(html);
      notices = JSON.parse($("#__NEXT_DATA__").text())?.props?.pageProps
        ?.tenderNotices;
    } catch (parseError) {
      console.error("解析页面数据失败:", parseError);
      return { pages: 0, info: [] };
    }
    if (!Array.isArray(notices)) {
      console.error(
        "解析页面数据失败:",
        new Error("tenderNotices is missing from __NEXT_DATA__")
      );
      return { pages: 0, info: [] };
    }

    const info = [];
    for (const item of notices) {
      const endTime = item.dueTime;
      const name = item.title;
      const stillOpen = endTime && +new Date(endTime) >= Date.now();
      if (stillOpen && keywordsInclude(name)) {
        info.push({
          id: item.id,
          name,
          publishTime: item.publishDate,
          endTime,
          // A notice without attachments gets an empty link instead of throwing.
          urls: item.documents?.[0]?.url ?? "",
        });
      }
    }

    // The whole list comes from one page, so the page count is fixed at 1.
    return { pages: 1, info };
  }

  /**
   * Download the tender-notice page.
   *
   * @param {number} pagenumber - Accepted for a uniform spider interface; the
   *   request itself does not use it.
   * @returns {Promise<[Error|null, string|null]>} `[error, html]` tuple; exactly
   *   one side is non-null.
   */
  async getHtml(pagenumber) {
    try {
      const res = await axios({
        url: "https://www.nio.cn/partnership/tender-notices",
        method: "get",
        // Without a timeout a stalled connection would block the loop forever.
        timeout: NIO.REQUEST_TIMEOUT_MS,
      });
      return [null, res.data];
    } catch (err) {
      return [err, null];
    }
  }
}
|
||||
|
||||
// Start the NIO spider: the constructor opens the queue and calls start().
new NIO();
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"name": "net-spider",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1",
|
||||
"start": "pm2 start ecosystem.config.cjs",
|
||||
"stop": "pm2 stop all",
|
||||
"stats": "node stats.js",
|
||||
"restart": "pm2 restart all"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"axios": "^1.12.2",
|
||||
"better-sqlite3": "^12.4.1",
|
||||
"cheerio": "^1.1.2",
|
||||
"json5": "^2.2.3",
|
||||
"nodemailer": "^7.0.6"
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,214 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { timestampToDate, loopCall } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
|
||||
/**
 * Spider for PICC ("中国人民保险") procurement announcements served by
 * https://ec.picc.com/cms/api/dynamicData/queryContentPage.
 *
 * Constructing an instance opens the SQLite-backed queue and starts crawling:
 * a full fetch when the database holds no PICC announcements yet, otherwise
 * the incremental loop. Announcements whose title matches one of the keywords
 * and whose sale end time is not in the past are saved and enqueued.
 *
 * Paging is driven by `loopCall` from ./utils.js; its exact scheduling is not
 * visible here — presumably it calls `getInfo` every `time` ms and uses the
 * callbacks below to pick the next page (TODO confirm against utils.js).
 */
class PICC {
  /** Name under which this spider's rows are stored and queued. */
  static SPIDER_NAME = "中国人民保险";

  /** Rows requested per page; also used to derive the page count. */
  static PAGE_SIZE = 10;

  /** Give up on a list request after this many milliseconds. */
  static REQUEST_TIMEOUT_MS = 30000;

  /** Title keywords; an announcement is kept when its title contains any of them. */
  static KEYWORDS = [
    "保险",
    "车险",
    "非车险",
    "科技",
    "大模型",
    "承保",
    "第三方平台",
  ];

  constructor() {
    this.info = [];
    console.log("中国人民保险 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose: start() catches and logs its own failures.
    this.start();
  }

  /** Run init() and log (rather than propagate) any startup failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Choose incremental mode when PICC rows already exist, full fetch otherwise. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider(PICC.SPIDER_NAME);
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: collect every page into `this.info`, persist and enqueue the
   * result once, then switch to incremental mode.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            if (this.info.length > 0) {
              this.queue.saveAnnouncements(PICC.SPIDER_NAME, this.info);
              this.queue.addMessage(PICC.SPIDER_NAME, this.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited so a failure surfaces in start() instead of as an unhandled rejection.
    await this.increment();
  }

  /**
   * Incremental crawl: on every round keep only announcements that are not in
   * the database yet, then save and enqueue them.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              PICC.SPIDER_NAME,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(PICC.SPIDER_NAME, newInfo);
            this.queue.addMessage(PICC.SPIDER_NAME, newInfo);

            // A page made up entirely of new rows may be followed by more new
            // rows; a partially known page means we caught up, so restart.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a DB error.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetch one list page and extract the announcements worth reporting.
   *
   * Request failures and payloads without `res.rows` are logged and reported
   * as `{ pages: 0, info: [] }` instead of being thrown, so one bad response
   * cannot end the polling loop.
   *
   * @param {number} [pagenumber=1] - 1-based page to request.
   * @returns {Promise<{pages: number, info: Array<{id: *, name: string, publishTime: *, endTime: *, urls: string}>}>}
   */
  async getInfo(pagenumber = 1) {
    console.log(`正在获取第 ${pagenumber} 页数据...`);
    const [fetchError, payload] = await this.getList(pagenumber);
    if (fetchError) {
      console.error("获取页面数据失败:", fetchError);
      return { pages: 0, info: [] };
    }

    const rows = payload?.res?.rows;
    if (!Array.isArray(rows)) {
      console.error(
        "获取页面数据失败:",
        new Error("res.rows is missing from the response")
      );
      return { pages: 0, info: [] };
    }

    const pages = Math.ceil(payload.res.total / PICC.PAGE_SIZE);
    const info = [];
    for (const item of rows) {
      const endTime = timestampToDate(
        new Date(item.tenderFileSaleEndTime).getTime(),
        true
      );
      const stillOpen = endTime && +new Date(endTime) >= Date.now();
      if (this.keywordsInclude(item.title) && stillOpen) {
        info.push({
          id: item.sourcingId,
          name: item.title,
          publishTime: timestampToDate(
            new Date(item.tenderFileSaleBeginTime).getTime(),
            true
          ),
          endTime,
          urls: `https://ec.picc.com/cms/default/webfile${item.url}`,
        });
      }
    }
    return { pages, info };
  }

  /**
   * Request one page of announcements from the PICC content API.
   *
   * @param {number} pagenumber - 1-based page to request.
   * @returns {Promise<[Error|null, object|null]>} `[error, body]` tuple. The
   *   body is returned only when the API answers with code 0 and message
   *   "操作成功"; any other answer is reported as an Error.
   */
  async getList(pagenumber) {
    try {
      const res = await axios({
        url: "https://ec.picc.com/cms/api/dynamicData/queryContentPage",
        data: {
          dto: {
            categoryId: "211,213,214,215,216,217",
            city: "",
            county: "",
            purchaseMode: "",
            siteId: "725",
          },
          pageNo: pagenumber,
          pageSize: PICC.PAGE_SIZE,
        },
        method: "post",
        // Without a timeout a stalled connection would block the loop forever.
        timeout: PICC.REQUEST_TIMEOUT_MS,
        headers: {
          Accept: "application/json, text/javascript, */*; q=0.01",
          // zstd is deliberately not offered: axios' Node adapter does not
          // decode it, so a zstd-compressed answer would arrive as garbage.
          "Accept-Encoding": "gzip, deflate, br",
          "Accept-Language": "zh-CN,zh;q=0.9",
          Connection: "keep-alive",
          "Content-Type": "application/json; charset=UTF-8",
          // NOTE(review): fixed cookie value captured from a browser session —
          // confirm the server keeps accepting it.
          Cookie: "G_rbec_47_11_8080=22685.52745.19855.0000",
          Host: "ec.picc.com",
          Origin: "https://ec.picc.com",
          Referer: "https://ec.picc.com/cms/default/webfile/ywgg1/index.html",
          "Sec-Fetch-Dest": "empty",
          "Sec-Fetch-Mode": "cors",
          "Sec-Fetch-Site": "same-origin",
          "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36",
          "X-Requested-With": "XMLHttpRequest",
          "Sec-Ch-Ua":
            '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
          "Sec-Ch-Ua-Mobile": "?0",
          "Sec-Ch-Ua-Platform": "macOS",
        },
      });

      const result = res.data;
      if (result?.msg === "操作成功" && result?.code === 0) {
        return [null, result];
      }
      // Keep the API's own code/message so the caller's log is actionable.
      return [
        new Error(`接口返回异常: code=${result?.code}, msg=${result?.msg}`),
        null,
      ];
    } catch (err) {
      return [err, null];
    }
  }

  /**
   * Tell whether a title contains at least one keyword of interest.
   *
   * @param {string} name - Announcement title.
   * @returns {boolean} `false` for non-string input.
   */
  keywordsInclude(name) {
    if (typeof name !== "string") {
      return false;
    }
    return PICC.KEYWORDS.some((keyword) => name.includes(keyword));
  }
}
|
||||
|
||||
// Start the PICC spider: the constructor opens the queue and calls start().
new PICC();
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
# 查看指定爬虫详细信息
|
||||
|
||||
pm2 show chery-spider
|
||||
|
||||
# 查看指定爬虫状态
|
||||
|
||||
pm2 list | grep chery-spider
|
||||
|
||||
# 实时监控指定爬虫
|
||||
|
||||
pm2 monit chery-spider
|
||||
|
||||
# 停止指定爬虫(不删除)
|
||||
|
||||
pm2 stop chery-spider
|
||||
|
||||
# 彻底删除爬虫进程
|
||||
|
||||
pm2 delete chery-spider
|
||||
|
||||
# 停止并删除
|
||||
|
||||
pm2 stop chery-spider && pm2 delete chery-spider
|
||||
|
||||
# 查看指定爬虫的实时日志
|
||||
|
||||
pm2 logs chery-spider
|
||||
|
||||
# 查看最近 100 行日志
|
||||
|
||||
pm2 logs chery-spider --lines 100
|
||||
|
||||
# 只查看标准输出日志
|
||||
|
||||
pm2 logs chery-spider --out
|
||||
|
||||
# 只查看错误日志
|
||||
|
||||
pm2 logs chery-spider --err
|
||||
|
||||
# 查看带时间戳的日志(--timestamp 只为每行日志加上时间前缀,不会按时间段筛选)
|
||||
|
||||
pm2 logs chery-spider --timestamp
|
||||
|
||||
# 清空日志
|
||||
|
||||
pm2 flush chery-spider
|
||||
|
|
@ -0,0 +1,320 @@
|
|||
import Database from "better-sqlite3";
|
||||
import fs from "fs";
|
||||
// import { wechatPush } from "./utils.js";
|
||||
|
||||
/**
 * SQLite-backed store shared by the spiders (file `spider_data.db`, resolved
 * against the current working directory).
 *
 * Two tables are managed:
 * - `announcements`: one row per crawled announcement, keyed by `id` alone, so
 *   ids are treated as unique across all spiders.
 * - `messages`: batches of announcements waiting to be pushed, with a
 *   `status` of 'pending', 'failed' or 'sent'.
 *
 * Creating an instance also registers process-level handlers that close the
 * database and exit on SIGINT, SIGTERM, uncaught exceptions and unhandled
 * promise rejections.
 */
class SQLiteMessageQueue {
  constructor() {
    this.db = new Database("spider_data.db");
    this.init();
    this.setupGracefulShutdown();
  }

  /**
   * Convert a value into something the prepared statements accept:
   * `undefined` becomes NULL and arrays are stored as their JSON text.
   *
   * @param {*} value - Field taken from a crawled announcement.
   * @returns {*} The value to bind.
   */
  static toBindable(value) {
    if (value === undefined) {
      return null;
    }
    return Array.isArray(value) ? JSON.stringify(value) : value;
  }

  /**
   * Turn raw `messages` rows into objects whose `data` column is parsed JSON.
   *
   * @param {object[]} rows - Rows as returned by the database.
   * @returns {object[]} Rows with `data` replaced by the parsed value.
   */
  static parseMessageRows(rows) {
    return rows.map((row) => ({
      ...row,
      data: JSON.parse(row.data),
    }));
  }

  /** Create tables and indexes if needed and prepare the reusable statements. */
  init() {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS announcements (
        id TEXT PRIMARY KEY,
        spider_name TEXT NOT NULL,
        name TEXT NOT NULL,
        publish_time TEXT,
        end_time TEXT,
        urls TEXT,
        created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
        updated_at TEXT
      )
    `);

    this.db.exec(`
      CREATE TABLE IF NOT EXISTS messages (
        id TEXT PRIMARY KEY,
        spider_name TEXT NOT NULL,
        data TEXT NOT NULL,
        timestamp TEXT NOT NULL,
        status TEXT DEFAULT 'pending',
        sent_at TEXT,
        error_message TEXT
      )
    `);

    this.db.exec(`
      CREATE INDEX IF NOT EXISTS idx_announcements_spider ON announcements(spider_name);
      CREATE INDEX IF NOT EXISTS idx_announcements_time ON announcements(publish_time);
      CREATE INDEX IF NOT EXISTS idx_announcements_created ON announcements(created_at);
      CREATE INDEX IF NOT EXISTS idx_status ON messages(status);
      CREATE INDEX IF NOT EXISTS idx_spider_status ON messages(spider_name, status);
      CREATE INDEX IF NOT EXISTS idx_timestamp ON messages(timestamp);
    `);

    this.insertAnnouncementStmt = this.db.prepare(`
      INSERT OR REPLACE INTO announcements
      (id, spider_name, name, publish_time, end_time, urls, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    `);

    this.getAnnouncementStmt = this.db.prepare(`
      SELECT * FROM announcements WHERE id = ?
    `);

    this.getAnnouncementsBySpiderStmt = this.db.prepare(`
      SELECT * FROM announcements WHERE spider_name = ?
      ORDER BY created_at DESC
    `);

    this.checkAnnouncementExistsStmt = this.db.prepare(`
      SELECT COUNT(*) as count FROM announcements WHERE id = ?
    `);

    this.insertStmt = this.db.prepare(`
      INSERT INTO messages (id, spider_name, data, timestamp, status)
      VALUES (?, ?, ?, ?, ?)
    `);

    this.getPendingStmt = this.db.prepare(`
      SELECT * FROM messages WHERE status = 'pending'
      ORDER BY timestamp ASC
    `);

    this.getFailedStmt = this.db.prepare(`
      SELECT * FROM messages WHERE status = 'failed'
      ORDER BY timestamp ASC
    `);

    this.updateStatusStmt = this.db.prepare(`
      UPDATE messages
      SET status = ?, sent_at = ?, error_message = ?
      WHERE id = ?
    `);
  }

  /**
   * Insert or replace one announcement.
   *
   * When a row with the same id already exists its `created_at` is kept and
   * only `updated_at` moves forward. Missing fields are stored as NULL and an
   * array of urls is stored as JSON text.
   *
   * @param {string} spiderName - Spider that produced the announcement.
   * @param {{id: *, name: string, publishTime?: *, endTime?: *, urls?: *}} announcement
   * @returns {boolean} `true` when no row with this id existed before.
   */
  saveAnnouncement(spiderName, announcement) {
    const now = new Date().toISOString();
    // One lookup answers both "is it new?" and "what was created_at?".
    const existing = this.getAnnouncement(announcement.id);
    const isNew = existing == null;
    const { toBindable } = SQLiteMessageQueue;

    this.insertAnnouncementStmt.run(
      announcement.id,
      spiderName,
      toBindable(announcement.name),
      toBindable(announcement.publishTime),
      toBindable(announcement.endTime),
      toBindable(announcement.urls),
      existing?.created_at || now,
      now
    );

    return isNew;
  }

  /**
   * Save a batch of announcements inside one transaction.
   *
   * @param {string} spiderName - Spider that produced the announcements.
   * @param {object[]} announcements - Announcements to save.
   * @returns {object[]} The announcements that were not in the database before.
   */
  saveAnnouncements(spiderName, announcements) {
    const newAnnouncements = [];

    const saveMany = this.db.transaction((batch) => {
      for (const announcement of batch) {
        if (this.saveAnnouncement(spiderName, announcement)) {
          newAnnouncements.push(announcement);
        }
      }
    });
    saveMany(announcements);

    console.log(`💾 ${spiderName}: 保存 ${announcements.length} 条公告`);
    return newAnnouncements;
  }

  /**
   * Tell whether an announcement with this id is stored (for any spider).
   *
   * @param {*} announcementId - Announcement id.
   * @returns {boolean}
   */
  isAnnouncementExists(announcementId) {
    return this.checkAnnouncementExistsStmt.get(announcementId).count > 0;
  }

  /**
   * Fetch one announcement row by id.
   *
   * @param {*} id - Announcement id.
   * @returns {object|undefined} The row, if any.
   */
  getAnnouncement(id) {
    return this.getAnnouncementStmt.get(id);
  }

  /**
   * Fetch every announcement of one spider, newest `created_at` first.
   *
   * @param {string} spiderName - Spider name.
   * @returns {object[]}
   */
  getAnnouncementsBySpider(spiderName) {
    return this.getAnnouncementsBySpiderStmt.all(spiderName);
  }

  /**
   * Delete every announcement of one spider.
   *
   * @param {string} spiderName - Spider name.
   * @returns {number} Number of deleted rows.
   */
  deleteAnnouncementsBySpider(spiderName) {
    const { changes } = this.db
      .prepare(`DELETE FROM announcements WHERE spider_name = ?`)
      .run(spiderName);
    console.log(`🗑️ 删除 ${spiderName} 的公告,共删除 ${changes} 条`);
    return changes;
  }

  /**
   * Keep only the announcements whose id is not stored yet.
   *
   * @param {string} spiderName - Accepted for symmetry with the other methods;
   *   the lookup itself is by id only.
   * @param {object[]} announcements - Candidates.
   * @returns {object[]} Candidates that are not in the database.
   */
  filterNewAnnouncements(spiderName, announcements) {
    return announcements.filter(
      (announcement) => !this.isAnnouncementExists(announcement.id)
    );
  }

  /**
   * Enqueue a batch of announcements as one pending message.
   *
   * @param {string} spiderName - Spider that produced the data.
   * @param {object[]} data - Announcements; stored as JSON text.
   * @returns {string} Id of the new message (`<epoch ms>-<random suffix>`).
   */
  addMessage(spiderName, data) {
    const message = {
      id: `${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      spider_name: spiderName,
      data: JSON.stringify(data),
      timestamp: new Date().toISOString(),
      status: "pending",
    };

    this.insertStmt.run(
      message.id,
      message.spider_name,
      message.data,
      message.timestamp,
      message.status
    );

    console.log(`📤 添加消息到队列: ${spiderName} - ${data.length} 条数据`);
    return message.id;
  }

  /** @returns {object[]} Pending messages, oldest first, with `data` parsed. */
  getPendingMessages() {
    return SQLiteMessageQueue.parseMessageRows(this.getPendingStmt.all());
  }

  /** @returns {object[]} Failed messages, oldest first, with `data` parsed. */
  getFailedMessages() {
    return SQLiteMessageQueue.parseMessageRows(this.getFailedStmt.all());
  }

  /**
   * Update the delivery state of one message.
   *
   * @param {string} id - Message id.
   * @param {string} status - New status.
   * @param {string|null} [sentAt=null] - Value for `sent_at`.
   * @param {string|null} [errorMessage=null] - Value for `error_message`.
   */
  updateMessageStatus(id, status, sentAt = null, errorMessage = null) {
    this.updateStatusStmt.run(status, sentAt, errorMessage, id);
  }

  /**
   * Import announcements from a JSON file holding an array of announcements.
   *
   * @param {string} spiderName - Spider the announcements belong to.
   * @param {string} jsonFilePath - Path of the file to import.
   * @returns {number} Number of imported entries; 0 when the file is missing,
   *   empty, not an array, or the import failed (the failure is logged).
   */
  migrateFromJsonFile(spiderName, jsonFilePath) {
    try {
      if (!fs.existsSync(jsonFilePath)) {
        console.log(`📁 ${jsonFilePath} 不存在,跳过迁移`);
        return 0;
      }

      const data = JSON.parse(fs.readFileSync(jsonFilePath, "utf-8"));
      if (!Array.isArray(data) || data.length === 0) {
        console.log(`📁 ${jsonFilePath} 数据为空,跳过迁移`);
        return 0;
      }

      const migrateMany = this.db.transaction((announcements) => {
        for (const announcement of announcements) {
          this.saveAnnouncement(spiderName, announcement);
        }
      });
      migrateMany(data);

      console.log(`🔄 成功迁移 ${data.length} 条 ${spiderName} 数据到数据库`);
      return data.length;
    } catch (error) {
      console.error(`❌ 迁移 ${jsonFilePath} 失败:`, error);
      return 0;
    }
  }

  /**
   * Delete sent messages whose `sent_at` is older than the cutoff.
   *
   * @param {number} [daysBefore=30] - Age in days beyond which messages go.
   */
  cleanOldMessages(daysBefore = 30) {
    const cutoffDate = new Date();
    cutoffDate.setDate(cutoffDate.getDate() - daysBefore);

    const result = this.db
      .prepare(
        `
        DELETE FROM messages
        WHERE status = 'sent' AND sent_at < ?
      `
      )
      .run(cutoffDate.toISOString());
    console.log(`🧹 清理了 ${result.changes} 条旧消息`);
  }

  /**
   * Collect a snapshot of the database.
   *
   * @returns {{announcements: Array<{spider_name: string, count: number}>, messages: Array<{status: string, data: string, sent_at: string|null}>}}
   *   `announcements` holds one count per spider; `messages` holds the raw
   *   rows (not counts) of every pending message.
   */
  getStats() {
    const announcements = this.db
      .prepare(
        `
        SELECT spider_name, COUNT(*) as count
        FROM announcements
        GROUP BY spider_name
      `
      )
      .all();

    const messages = this.db
      .prepare(
        `
        SELECT status, data, sent_at
        FROM messages WHERE status = 'pending'
      `
      )
      .all();

    return { announcements, messages };
  }

  /** Register process handlers that close the database before exiting. */
  setupGracefulShutdown() {
    // Exit even when close() itself throws, so a handler can never leave the
    // process running without its database.
    const shutdown = (exitCode) => {
      try {
        this.close();
      } finally {
        process.exit(exitCode);
      }
    };

    for (const signal of ["SIGINT", "SIGTERM"]) {
      process.on(signal, () => {
        console.log(`收到 ${signal} 信号,正在关闭数据库...`);
        shutdown(0);
      });
    }

    process.on("uncaughtException", (error) => {
      console.error("未捕获异常:", error);
      shutdown(1);
    });

    process.on("unhandledRejection", (reason) => {
      console.error("未处理的Promise拒绝:", reason);
      shutdown(1);
    });
  }

  /** Close the database connection. */
  close() {
    this.db.close();
  }
}
|
||||
|
||||
export { SQLiteMessageQueue };
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
import path from "path";
|
||||
import { md5 } from "./utils.js";
|
||||
import axios from "axios";
|
||||
|
||||
// Open the shared SQLite-backed store (announcements + messages tables).
const queue = new SQLiteMessageQueue();
|
||||
|
||||
// Snapshot: announcement count per spider plus the rows of all pending messages.
const stats = queue.getStats();
|
||||
|
||||
// function merge() {
|
||||
// let files = [
|
||||
// { name: "长安", path: "changan.json" },
|
||||
// { name: "奇瑞变更公告", path: "chery_bg.json" },
|
||||
// { name: "奇瑞采购公告", path: "chery_cg.json" },
|
||||
// { name: "奇瑞寻源预告", path: "chery_xy.json" },
|
||||
// { name: "零跑", path: "leapMotor.json" },
|
||||
// { name: "吉利", path: "geely.json" },
|
||||
// { name: "一汽", path: "yiqi.json" },
|
||||
// ];
|
||||
// files.forEach((file) => {
|
||||
// queue.migrateFromJsonFile(file.name, path.resolve(file.path));
|
||||
// });
|
||||
// }
|
||||
// merge();
|
||||
// 把message中的数据状态改成pending
|
||||
// queue.getFailedMessages()
|
||||
// .forEach((message) => {
|
||||
// queue.updateMessageStatus(message.id, "pending");
|
||||
// });
|
||||
// function getSign(timestamp) {
|
||||
// let secret = "cpwyyds";
|
||||
// let uri = "/common/message/push";
|
||||
// const url = uri + timestamp + secret;
|
||||
// console.log(url);
|
||||
// const myCalc = md5(url);
|
||||
// let sign =
|
||||
// myCalc.substring(5, 13) +
|
||||
// myCalc.substring(29, 31) +
|
||||
// myCalc.substring(18, 27);
|
||||
// //sign 转大写
|
||||
// sign = sign.toUpperCase();
|
||||
// return sign;
|
||||
// }
|
||||
// let time = new Date().getTime();
|
||||
// let data = {
|
||||
// timestamp: time,
|
||||
// sign: getSign(time),
|
||||
// templateNo: "A002",
|
||||
// url: "https://www.baidu.com/",
|
||||
// paramList: [
|
||||
// {
|
||||
// key: "thing8",
|
||||
// value: "网站name",
|
||||
// },
|
||||
// {
|
||||
// key: "thing2",
|
||||
// value: "项目name",
|
||||
// },
|
||||
// {
|
||||
// key: "time14",
|
||||
// value: "2025-11-2",
|
||||
// },
|
||||
// {
|
||||
// key: "time17",
|
||||
// value: "2025-11-3 00:00:00",
|
||||
// },
|
||||
// ],
|
||||
// };
|
||||
// axios({
|
||||
// url: "https://testadvert.shenlintech.com/platform/common/message/push",
|
||||
// method: "post",
|
||||
// data,
|
||||
// })
|
||||
// .then((res) => {
|
||||
// console.log(res.data);
|
||||
// })
|
||||
// .catch((err) => {
|
||||
// console.log(err);
|
||||
// });
|
||||
// Print the value returned by queue.getStats().
console.log(stats);
|
||||
|
|
@ -0,0 +1,309 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import JSON5 from "json5";
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
import * as cheerio from "cheerio";
|
||||
|
||||
/**
 * Spider for third-party bidding portals (currently chinabidding.com search
 * pages).
 *
 * Each entry of `jsonMap` describes one search target:
 * `{ name, info: [], options: { name, url, data } }`. Every target gets its
 * own crawl loop: a full fetch when the database holds no announcements for
 * `name` yet, otherwise the incremental loop.
 *
 * Requests go through a dedicated axios instance that replays every cookie the
 * server has set so far. When a list request fails, the cookies are refreshed
 * once from the search page and the request is retried.
 *
 * Paging is driven by `loopCall` from ./utils.js; its exact scheduling is not
 * visible here — presumably it calls `getInfo` every `time` ms and uses the
 * callbacks below to pick the next page (TODO confirm against utils.js).
 */
class Third {
  /**
   * @param {Array<{name: string, info: object[], options: {name: string, url: string, data: object}}>} jsonMap
   *   Search targets to crawl.
   */
  constructor(jsonMap) {
    // Cookie jar: cookie name -> value, filled from Set-Cookie response headers.
    this.cookiePair = new Map();
    this.jsonMap = jsonMap;

    this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 });
    // Send every known cookie with each request.
    this.axiosInstance.interceptors.request.use((requestConfig) => {
      requestConfig.headers.Cookie = Array.from(this.cookiePair.entries())
        .map(([name, value]) => `${name}=${value}`)
        .join("; ");
      return requestConfig;
    });
    // Remember every cookie the server sets.
    this.axiosInstance.interceptors.response.use(
      (response) => {
        this.extractCookie(response.headers["set-cookie"] || []);
        return response;
      },
      (error) => Promise.reject(error)
    );

    console.log("三方平台 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose: start() catches and logs its own failures.
    this.start();
  }

  /** Run init() and log (rather than propagate) any startup failure. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Start one crawl loop per target, in full or incremental mode. */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      // Deliberately not awaited: the loops are long-running and must run side
      // by side. Each one handles its own errors.
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /**
   * Browser-like headers sent with every chinabidding.com request.
   *
   * @returns {object} A fresh header object on every call.
   */
  buildHeaders() {
    return {
      Accept: "text/plain, */*; q=0.01",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "no-cache",
      "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      Origin: "https://www.chinabidding.com",
      Pragma: "no-cache",
      Priority: "u=1, i",
      Referer: "https://www.chinabidding.com/search/proj.htm",
      "Sec-Ch-Ua":
        '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "empty",
      "Sec-Fetch-Mode": "cors",
      "Sec-Fetch-Site": "same-origin",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest",
    };
  }

  /**
   * Load the search page so the response interceptor can collect fresh cookies.
   *
   * @throws Re-throws the request error after logging it.
   */
  async initializeCookie() {
    try {
      await this.axiosInstance.get(
        "https://www.chinabidding.com/search/proj.htm",
        { headers: this.buildHeaders() }
      );
    } catch (err) {
      console.log("err", err);
      throw err;
    }
  }

  /**
   * Store the name/value pair of every Set-Cookie header in the cookie jar.
   *
   * Only the first `=` separates name from value, so values that contain `=`
   * themselves are kept whole. Entries without a name or `=` are skipped.
   *
   * @param {string[]} cookieArr - Raw Set-Cookie header values.
   */
  extractCookie(cookieArr) {
    for (const cookie of cookieArr ?? []) {
      const pair = cookie.split(";")[0];
      const separator = pair.indexOf("=");
      if (separator <= 0) {
        continue;
      }
      this.cookiePair.set(
        pair.slice(0, separator).trim(),
        pair.slice(separator + 1).trim()
      );
    }
  }

  /**
   * Full crawl of one target: collect every page into `props.info`, persist
   * and enqueue the result once, then switch that target to incremental mode.
   *
   * @param {{name: string, info: object[], options: object}} props - Target.
   */
  async loopFetchFull(props) {
    console.log("开始全量爬取");
    try {
      // Awaited so that a rejected loop is caught and logged here.
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        additional: props.options,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          props.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          props.info.push(...result.info);
          console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
          try {
            if (props.info.length > 0) {
              this.queue.saveAnnouncements(props.name, props.info);
              this.queue.addMessage(props.name, props.info);
            }
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
          this.loopFetchIncrement(props);
        },
      });
    } catch (error) {
      console.error(`${props.options.name}全量爬取失败:`, error);
    }
  }

  /**
   * Incremental crawl of one target: on every round keep only announcements
   * that are not in the database yet, then save and enqueue them.
   *
   * @param {{name: string, info: object[], options: object}} props - Target.
   */
  async loopFetchIncrement(props) {
    console.log("开始增量爬取");
    try {
      // Awaited so that a rejected loop is caught and logged here.
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        additional: props.options,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              props.name,
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements(props.name, newInfo);
            this.queue.addMessage(props.name, newInfo);

            // A page made up entirely of new rows may be followed by more new
            // rows; a partially known page means we caught up, so restart.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand loopCall a valid page number, even after a DB error.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error(`${props.options.name}增量爬取失败:`, error);
    }
  }

  /**
   * Download a notice detail page with a plain axios request (no cookie jar).
   *
   * @param {string} url - Detail page URL.
   * @returns {Promise<*>} The response body, or the string "err" on failure.
   */
  async getNoticeDetail(url) {
    try {
      const result = await axios.get(url);
      return result.data;
    } catch (err) {
      return "err";
    }
  }

  /**
   * Fetch one result page and extract the entries whose title passes
   * `keywordsInclude`.
   *
   * The list HTML carries no dates, so `publishTime` and `endTime` are stored
   * as "--". The id is the number in the `/bidDetail/<id>.html` link; links
   * that do not match that shape get an empty id, as before.
   *
   * @param {number} [pagenumber=1] - 1-based page to request.
   * @param {{name: string, url: string, data: object}} config - Target options.
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 on a
   *   failed request and otherwise the fixed value 3.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);
    const [fetchError, html] = await this.getList(pagenumber, config);
    if (fetchError) {
      console.error("获取页面数据失败: ", fetchError);
      return { pages: 0, info: [] };
    }

    const info = [];
    const $ = cheerio.load(html);
    $(".as-pager-body li").each((index, element) => {
      const url = $(element).find(".as-pager-item").attr("href");
      const name = $(element).find(".txt").attr("title");
      // List items without a link or a title cannot be reported; skip them
      // instead of throwing on the missing attribute.
      if (!url || !name) {
        return;
      }

      const idMatch = url.match(/\/bidDetail\/(\d+)\.html/);
      if (keywordsInclude(name)) {
        console.log("处理项目:", name);
        info.push({
          id: idMatch ? idMatch[1] : "",
          name,
          urls: url,
          publishTime: "--",
          endTime: "--",
        });
      }
    });

    return { pages: 3, info };
  }

  /**
   * POST the search form for one page.
   *
   * On any failure the cookies are refreshed once via initializeCookie() and
   * the same request is retried.
   *
   * @param {number} pagenumber - 1-based page to request.
   * @param {{url: string, data: object}} config - Target options; `data` is
   *   copied, never modified.
   * @returns {Promise<[Error|null, string|null]>} `[error, html]` tuple.
   */
  async getList(pagenumber, config) {
    const request = {
      url: config.url,
      data: { ...config.data, currentPage: pagenumber },
      method: "post",
      headers: this.buildHeaders(),
    };

    try {
      const response = await this.axiosInstance(request);
      return [null, response.data];
    } catch (err) {
      console.log("cookie不对");
      try {
        await this.initializeCookie();
        const retryResponse = await this.axiosInstance({
          ...request,
          headers: this.buildHeaders(),
        });
        return [null, retryResponse.data];
      } catch (retryErr) {
        return [retryErr, null];
      }
    }
  }
}
|
||||
|
||||
/**
 * Build the job descriptor passed to `Third` for one chinabidding.com
 * search category. Every call returns fresh objects, so jobs never share
 * their `info` array or form data.
 *
 * @param {string} name - Job name, used both on the descriptor and in its options.
 * @param {string} infoClassCodes - Value of the `infoClassCodes` form field.
 * @returns {object} Descriptor with `name`, `info` and `options`.
 */
function createChinaBiddingJob(name, infoClassCodes) {
  return {
    name,
    info: [],
    options: {
      name,
      url: "https://www.chinabidding.com/search/proj.htm",
      data: {
        fullText: "",
        pubDate: "",
        infoClassCodes,
        normIndustry: "",
        zoneCode: "",
        fundSourceCodes: "",
        poClass: "BidNotice",
        rangeType: "",
        currentPage: 1,
      },
    },
  };
}

// Tender announcements.
new Third([createChinaBiddingJob("机电项目招投标【招标公告】", "(0105 0103)")]);

// Tender change announcements.
new Third([
  createChinaBiddingJob("机电项目招投标【招标变更公告】", "(0106 0104)"),
]);
|
||||
|
|
@ -0,0 +1,271 @@
|
|||
import crypto from "crypto";
import fs from "fs";
import axios from "axios";
|
||||
/**
 * Format a millisecond timestamp in the host's local time zone.
 *
 * @param {number} timestamp - Milliseconds since the Unix epoch.
 * @param {*} [mode] - Falsy: date only. Truthy: date and time.
 * @returns {string} `yyyy-mm-dd`, or `yyyy-mm-dd hh:mm:ss` when `mode` is truthy.
 */
function timestampToDate(timestamp, mode) {
  // Left-pad a numeric field to two digits.
  const pad = (value) => String(value).padStart(2, "0");

  const moment = new Date(timestamp);
  const datePart = [
    moment.getFullYear(),
    pad(moment.getMonth() + 1), // getMonth() is zero-based
    pad(moment.getDate()),
  ].join("-");

  if (mode) {
    const timePart = [
      moment.getHours(),
      moment.getMinutes(),
      moment.getSeconds(),
    ]
      .map(pad)
      .join(":");
    return `${datePart} ${timePart}`;
  }
  return datePart;
}
|
||||
|
||||
/**
 * Compute the MD5 digest of `text`.
 *
 * @param {string} text - Data to hash.
 * @param {string} [inputEncoding="utf8"] - Encoding used to read `text`.
 * @param {string} [outputEncoding="hex"] - Encoding of the returned digest.
 * @returns {string} The digest in `outputEncoding`.
 */
function md5(text, inputEncoding = "utf8", outputEncoding = "hex") {
  const hasher = crypto.createHash("md5");
  hasher.update(text, inputEncoding);
  return hasher.digest(outputEncoding);
}
|
||||
/**
 * Derive the signature sent with a message-push request.
 *
 * The MD5 hex digest of `"/common/message/push" + timestamp + secret` is
 * cut into three slices (characters 5-12, 29-30 and 18-26), which are
 * concatenated in that order and upper-cased.
 *
 * NOTE(review): the signing secret is hard-coded here; consider moving it to
 * configuration.
 *
 * @param {number|string} timestamp - Timestamp that is also sent with the request.
 * @returns {string} 19-character upper-case signature.
 */
function getSign(timestamp) {
  const digest = md5("/common/message/push" + timestamp + "cpwyyds");
  const slices = [
    digest.substring(5, 13),
    digest.substring(29, 31),
    digest.substring(18, 27),
  ];
  return slices.join("").toUpperCase();
}
|
||||
// 微信推送
|
||||
// function wechatPush(spiderName, arr) {
|
||||
// for (let item of arr) {
|
||||
// let timestamp = new Date().getTime();
|
||||
// let sign = getSign(timestamp);
|
||||
// let url = "";
|
||||
// if (typeof item.urls === "string") {
|
||||
// url = item.urls;
|
||||
// } else {
|
||||
// url = item.urls[0];
|
||||
// }
|
||||
// let data = {
|
||||
// timestamp,
|
||||
// sign,
|
||||
// templateNo: "A002",
|
||||
// url,
|
||||
// paramList: [
|
||||
// {
|
||||
// key: "thing8",
|
||||
// value: spiderName,
|
||||
// },
|
||||
// {
|
||||
// key: "thing2",
|
||||
// value:
|
||||
// item.name.length > 20
|
||||
// ? item.name.substring(0, 16) + "..."
|
||||
// : item.name,
|
||||
// },
|
||||
// {
|
||||
// key: "time14",
|
||||
// value: item.publishTime,
|
||||
// },
|
||||
// {
|
||||
// key: "time17",
|
||||
// value: item.endTime,
|
||||
// },
|
||||
// ],
|
||||
// };
|
||||
// axios({
|
||||
// url: "https://advert.shenlintech.com/platform/common/message/push",
|
||||
// method: "post",
|
||||
// data,
|
||||
// });
|
||||
// }
|
||||
// }
|
||||
// 废弃
|
||||
/**
 * Append one pending message to the JSON file queue.
 *
 * The queue lives in `message_queue.json`, resolved against the current
 * working directory. The whole file is read, extended and rewritten on
 * every call. A missing or empty file is treated as an empty queue; a file
 * with invalid JSON still throws so that existing data is never silently
 * overwritten.
 *
 * Requires the `fs` module to be imported by this file.
 *
 * @param {string} spiderName - Name of the crawler that produced the data.
 * @param {Array} data - Announcements to deliver; only `data.length` is read here.
 * @returns {void}
 * @throws {SyntaxError} When the existing queue file contains invalid JSON.
 */
function addToMessageQueue(spiderName, data) {
  const queueFile = "message_queue.json";
  const message = {
    // Millisecond timestamp plus up to nine random base-36 characters.
    id: `${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
    spiderName,
    data,
    timestamp: new Date().toISOString(),
    status: "pending",
  };

  let queue = [];
  if (fs.existsSync(queueFile)) {
    const raw = fs.readFileSync(queueFile, "utf-8");
    // An empty file is an empty queue, not a JSON syntax error.
    if (raw.trim() !== "") {
      queue = JSON.parse(raw);
    }
  }

  queue.push(message);
  fs.writeFileSync(queueFile, JSON.stringify(queue, null, 2));
  console.log(`📤 添加消息到队列: ${spiderName} - ${data.length} 条数据`);
}
|
||||
|
||||
/**
 * Call `fn` repeatedly, page by page, until a stop condition or an error.
 *
 * Each round awaits `fn(pagenumber, additional)`. When `stopWhen` is given
 * and returns truthy, `complete` (if given) receives the last result and
 * the loop ends. Otherwise `readyForNext` supplies the next page number and
 * the loop sleeps for `time` milliseconds. Any exception thrown by `fn` or
 * by a callback is logged and ends the loop; it is never rethrown.
 *
 * @param {Function} fn - Async worker `(pagenumber, additional) => result`.
 * @param {object} [options]
 * @param {number} [options.time] - Delay between rounds in milliseconds.
 * @param {number} [options.pagenumber] - First page number.
 * @param {Function} [options.stopWhen] - `(pagenumber, result) => boolean`.
 * @param {Function} [options.readyForNext] - `(pagenumber, result) => nextPage`.
 * @param {Function} [options.complete] - `(result) => void`, called when stopping.
 * @param {*} [options.additional] - Passed through to `fn` unchanged.
 * @returns {Promise<void>}
 */
async function loopCall(fn, options = {}) {
  const { time, stopWhen, readyForNext, complete, additional } = options;
  let currentPage = options.pagenumber;
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  for (;;) {
    try {
      const result = await fn(currentPage, additional);

      if (stopWhen && stopWhen(currentPage, result)) {
        if (complete) {
          complete(result);
        }
        return;
      }

      currentPage = readyForNext(currentPage, result);
      await pause(time);
    } catch (err) {
      console.error("loopCall 出错:", err);
      return;
    }
  }
}
|
||||
// Keywords that make an announcement title relevant to the crawlers.
const TENDER_KEYWORDS = [
  "海外",
  "国际",
  "内容",
  "营销",
  "运营",
  "直播",
  "品牌",
  "事件",
  "策略",
  "传播",
  "执行",
  "社媒",
  "视频",
  "制作",
  "拍摄",
  "效果",
];

/**
 * Tell whether an announcement title contains at least one tracked keyword.
 *
 * Titles are scraped from HTML attributes and may be missing, so anything
 * that is not a string counts as "no match" instead of throwing.
 *
 * @param {string} name - Announcement title.
 * @returns {boolean} `true` when any keyword occurs in `name`.
 */
function keywordsInclude(name) {
  if (typeof name !== "string") {
    return false;
  }
  return TENDER_KEYWORDS.some((keyword) => name.includes(keyword));
}
|
||||
/**
 * Build the detail-page link of a FAW (一汽) announcement.
 *
 * Announcements of type 7 (candidate publicity) are marked as encrypted:
 * the commented-out code this replaces opened them with a POST carrying an
 * encoded guid, so no plain link exists and a fixed hint text is returned.
 * Every other type gets a GET URL.
 *
 * @param {number|string} gongGaoType - Announcement type code; callers pass it as a string.
 * @param {string} guid - Announcement identifier.
 * @param {number|string} version - Announcement version.
 * @param {*} [origin] - Accepted for call compatibility; not used.
 * @returns {string} The detail URL, or the hint text for encrypted announcements.
 */
function getYiqiNoticeUrl(gongGaoType, guid, version, origin) {
  // Single slash between host and path (the previous concatenation produced "//gg/...").
  const detailUrl = "https://etp.faw.cn/gg/toGongGaoDetail";

  // The type arrives as a string parsed from an onclick attribute.
  if (Number(gongGaoType) === 7) {
    return "加密链接,请直接上对应网站查看";
  }

  return `${detailUrl}?guid=${guid}&gongGaoType=${gongGaoType}&version=${version}&isNew=1`;
}
|
||||
/**
 * Extract the argument list from a `toGgDetails(...)` call string.
 *
 * Example: `"toGgDetails('6','642ed424-...:2','2','1','')"` yields
 * `["6", "642ed424-...:2", "2", "1", ""]`.
 *
 * The parsing is deliberately simple: the text between the parentheses is
 * split on commas and all quote characters are removed, so arguments that
 * contain commas or parentheses are not supported.
 *
 * @param {string} funcStr - Usually the `onclick` attribute of a list entry.
 * @returns {string[]|null} The arguments, or `null` when `funcStr` is not a
 *   string or holds no `toGgDetails(...)` call.
 */
function parseToGgDetailsParams(funcStr) {
  // The attribute may be absent on some entries.
  if (typeof funcStr !== "string") {
    return null;
  }

  const match = funcStr.match(/toGgDetails\(([^)]+)\)/);
  if (!match) {
    return null;
  }

  return match[1]
    .split(",")
    .map((param) => param.trim().replace(/['"]/g, ""));
}
|
||||
/**
 * Encode a string with the site's modified base64 scheme.
 *
 * Character codes are packed three at a time into four 6-bit indexes into
 * the standard base64 alphabet, with index 64 ("=") as padding. Afterwards
 * every "=" is replaced by "r1e2p3l4" and "+*+" is appended. The loop body
 * always runs at least once, so an empty input still yields one padded group.
 *
 * @param {string} input - Text to encode; read through `charCodeAt`.
 * @returns {string} The encoded text followed by "+*+".
 */
function encodeSixF(input) {
  const alphabet =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
  const PAD_INDEX = 64; // position of "=" in the alphabet

  let encoded = "";
  let position = 0;
  do {
    // charCodeAt returns NaN past the end of the string.
    const first = input.charCodeAt(position);
    const second = input.charCodeAt(position + 1);
    const third = input.charCodeAt(position + 2);
    position += 3;

    const secondMissing = Number.isNaN(second);
    const thirdMissing = Number.isNaN(third);
    const indexes = [
      first >> 2,
      ((first & 3) << 4) | (second >> 4),
      secondMissing ? PAD_INDEX : ((second & 15) << 2) | (third >> 6),
      secondMissing || thirdMissing ? PAD_INDEX : third & 63,
    ];

    for (const index of indexes) {
      encoded += alphabet.charAt(index);
    }
  } while (position < input.length);

  // Padding characters are replaced by a fixed token.
  return encoded.split("=").join("r1e2p3l4") + "+*+";
}
|
||||
/**
 * Normalise "no value" inputs to an empty string.
 *
 * `null`, `undefined` and anything loosely equal to the strings "null" or
 * "undefined" become `""`; every other value is returned unchanged.
 *
 * @param {*} value - Value to normalise.
 * @returns {*} `""` for the cases above, otherwise `value`.
 */
function dealNullAndUndefined(value) {
  // `== null` covers both null and undefined.
  if (value == null) {
    return "";
  }
  // Loose comparison kept on purpose to match the original coercion rules.
  const emptyTokens = ["null", "undefined"];
  return emptyTokens.some((token) => value == token) ? "" : value;
}
|
||||
export {
|
||||
timestampToDate,
|
||||
loopCall,
|
||||
keywordsInclude,
|
||||
getYiqiNoticeUrl,
|
||||
parseToGgDetailsParams,
|
||||
addToMessageQueue,
|
||||
md5,
|
||||
// wechatPush
|
||||
};
|
||||
|
|
@ -0,0 +1,199 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import {
|
||||
timestampToDate,
|
||||
loopCall,
|
||||
keywordsInclude,
|
||||
getYiqiNoticeUrl,
|
||||
parseToGgDetailsParams,
|
||||
// addToMessageQueue,
|
||||
} from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import * as cheerio from "cheerio";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
// import { messageQueue } from "./msgManager.js";
|
||||
|
||||
/**
 * Crawler for the FAW ("一汽") tender announcement list on etp.faw.cn.
 *
 * Constructing an instance starts it. When the SQLite store holds no
 * announcements for the "一汽" spider, a full crawl runs first; after that
 * the list is polled for new announcements. Matching items are saved and
 * queued through `SQLiteMessageQueue`.
 */
class YiQi {
  constructor() {
    // Matching announcements collected during the full crawl.
    this.info = [];
    console.log("一汽 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose: start() logs its own failures.
    this.start();
  }

  /** Run init() and log any failure instead of propagating it. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Poll incrementally when stored announcements exist, else crawl everything first. */
  async init() {
    const announcements = this.queue.getAnnouncementsBySpider("一汽");
    if (announcements.length > 0) {
      await this.increment();
    } else {
      await this.fullFetch();
    }
  }

  /**
   * Full crawl: walk the list from page 1 until the last page or the
   * configured page limit, store everything collected, then switch to
   * incremental polling.
   */
  async fullFetch() {
    console.log("开始全量爬取...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.fullFetchTime,
        pagenumber: 1,
        stopWhen: (pagenumber, result) =>
          pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
        readyForNext: (pagenumber, result) => {
          this.info.push(...result.info);
          return pagenumber + 1;
        },
        complete: (result) => {
          // The final page is delivered here, not through readyForNext.
          this.info.push(...result.info);
          console.log(`爬取完成,共获取 ${this.info.length} 条有效数据`);
          try {
            this.queue.saveAnnouncements("一汽", this.info);
            this.queue.addMessage("一汽", this.info);
          } catch (error) {
            console.error("数据库操作失败:", error);
          }
        },
      });
    } catch (error) {
      console.error("全量爬取失败:", error);
    }
    console.log("开始增量爬取...");
    // Awaited so the polling loop is not left as a floating promise.
    await this.increment();
  }

  /**
   * Incremental polling: fetch a page, keep only announcements the store
   * does not know yet, save and queue them. The loop has no stop condition.
   */
  async increment() {
    console.log("开始增量爬取模式,每5分钟检查一次新数据...");
    try {
      await loopCall(this.getInfo.bind(this), {
        time: config.incrementFetchTime,
        pagenumber: 1,
        readyForNext: (pagenumber, result) => {
          try {
            const newInfo = this.queue.filterNewAnnouncements(
              "一汽",
              result.info
            );
            if (newInfo.length === 0) {
              console.log("没有发现新数据,继续监控...");
              return 1;
            }

            console.log(`发现 ${newInfo.length} 条新数据`);
            this.queue.saveAnnouncements("一汽", newInfo);
            this.queue.addMessage("一汽", newInfo);
            // A page made only of new items means older pages may hold
            // more; otherwise start over from the first page.
            return newInfo.length === result.info.length ? pagenumber + 1 : 1;
          } catch (error) {
            console.error("数据库操作失败:", error);
            // Always hand back a valid page number; returning nothing
            // would make the loop lose its page counter.
            return 1;
          }
        },
      });
    } catch (error) {
      console.error("增量爬取失败:", error);
    }
  }

  /**
   * Fetch one list page and extract the announcements of interest.
   *
   * An entry is kept when it has a countdown end time (which the original
   * comment treats as "not expired") and its title matches a keyword.
   *
   * @param {number} [pagenumber=1] - Page to fetch.
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is the fixed
   *   value 30; `info` is empty when the request failed.
   */
  async getInfo(pagenumber = 1) {
    const pages = 30;
    console.log(`正在获取第 ${pagenumber} 页数据...`);

    const [error, html] = await this.getHtml(pagenumber);
    if (error) {
      console.error("获取页面数据失败:", error);
      return { pages, info: [] };
    }

    const info = [];
    const $ = cheerio.load(html);
    $(".zl-list-main .zl-col-6").each((index, element) => {
      const entry = $(element);
      const name = entry.find(".title").text();
      const endTime = entry.find(".daojishi").attr("data-time");
      if (!endTime || !keywordsInclude(name)) {
        return;
      }

      // The link arguments are embedded in the entry's onclick handler.
      // Parse them only for entries that are kept, and skip the entry
      // rather than abort the whole crawl when they are missing.
      const funcStr = entry.find(".jump").attr("onclick");
      const funcArgs =
        typeof funcStr === "string" ? parseToGgDetailsParams(funcStr) : null;
      if (!funcArgs) {
        console.error("无法解析公告链接参数,已跳过:", name.trim());
        return;
      }

      const id = entry.find(".zl-desc-item:contains('项目编号')").text();
      const publishTime = entry
        .find(".zl-desc-item:contains('发布时间')")
        .text();

      info.push({
        id: id.replace("项目编号:", ""),
        name: name.trim(),
        publishTime: publishTime.replace("发布时间:", "").trim(),
        endTime: timestampToDate(Number(endTime)),
        urls: getYiqiNoticeUrl(...funcArgs),
      });
    });

    return { pages, info };
  }

  /**
   * Request one page of the announcement list.
   *
   * @param {number} pagenumber - Page to request.
   * @returns {Promise<Array>} `[null, html]` on success, `[error, null]` on failure.
   */
  async getHtml(pagenumber) {
    try {
      const res = await axios({
        url: "https://etp.faw.cn/gg/allJYTypeGGList?hangYeType=-1&xmLeiXing=&ggStartTimeEnd=&gongGaoType=5&isNew=1",
        data: {
          searchType: "",
          searchText: "",
          currentPage: pagenumber,
        },
        headers: {
          "Content-Type": "application/x-www-form-urlencoded",
        },
        method: "post",
      });
      return [null, res.data];
    } catch (err) {
      return [err, null];
    }
  }
}
|
||||
|
||||
// Start the FAW crawler when this module is loaded; the constructor kicks off the crawl itself.
new YiQi();
|
||||
|
|
@ -0,0 +1,406 @@
|
|||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import JSON5 from "json5";
|
||||
import { timestampToDate, loopCall, keywordsInclude } from "./utils.js";
|
||||
import config from "./config.js";
|
||||
import { SQLiteMessageQueue } from "./sqlite.js";
|
||||
import * as cheerio from "cheerio";
|
||||
|
||||
/**
 * Crawler for the youzhicai.com ("优质采") announcement search.
 *
 * Each instance owns an axios client with a small cookie jar: cookies from
 * every response are remembered and sent with every request. For each job
 * in `jsonMap` it runs a full crawl when the SQLite store has no data for
 * that job, then polls for new announcements.
 */
class YouZhiCai {
  /**
   * @param {object[]} jsonMap - Job descriptors; each has `name`, `info`
   *   (array that collects full-crawl results) and `options` (`name`, `url`, `data`).
   */
  constructor(jsonMap) {
    // Cookie name -> value, filled from Set-Cookie response headers.
    this.cookiePair = new Map();
    this.jsonMap = jsonMap;

    this.axiosInstance = axios.create({ timeout: 30000, maxRedirects: 5 });
    this.axiosInstance.interceptors.request.use((request) => {
      // Send every remembered cookie with the request.
      request.headers.Cookie = Array.from(this.cookiePair.entries())
        .map(([name, value]) => `${name}=${value}`)
        .join("; ");
      return request;
    });
    this.axiosInstance.interceptors.response.use(
      (response) => {
        // Remember cookies set by the server.
        this.extractCookie(response.headers["set-cookie"] || []);
        return response;
      },
      (error) => Promise.reject(error)
    );

    console.log("优质采 爬虫启动...");
    this.queue = new SQLiteMessageQueue();
    // Not awaited on purpose: start() logs its own failures.
    this.start();
  }

  /** Run init() and log any failure instead of propagating it. */
  async start() {
    try {
      await this.init();
    } catch (err) {
      console.error("启动失败:", err);
    }
  }

  /** Start one crawl loop per job; the loops run concurrently. */
  async init() {
    for (const item of this.jsonMap) {
      const announcements = this.queue.getAnnouncementsBySpider(item.name);
      if (announcements.length > 0) {
        this.loopFetchIncrement(item);
      } else {
        this.loopFetchFull(item);
      }
    }
  }

  /** Browser-like headers shared by the cookie bootstrap and the search request. */
  buildHeaders() {
    return {
      Accept: "text/plain, */*; q=0.01",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Cache-Control": "no-cache",
      "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      Origin: "https://www.youzhicai.com",
      Pragma: "no-cache",
      Priority: "u=1, i",
      Referer: "https://www.youzhicai.com/s/1_1_0_0_.html",
      "Sec-Ch-Ua":
        '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "empty",
      "Sec-Fetch-Mode": "cors",
      "Sec-Fetch-Site": "same-origin",
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest",
    };
  }

  /**
   * Load the search page once so the response interceptor can collect the
   * cookies the server sets.
   *
   * @throws Rethrows the request error after logging it.
   */
  async initializeCookie() {
    try {
      await this.axiosInstance.get("https://www.youzhicai.com/s/1_1_0_0_.html", {
        headers: this.buildHeaders(),
      });
    } catch (err) {
      console.log("err", err);
      throw err;
    }
  }

  /**
   * Store the name/value pair of each Set-Cookie header.
   *
   * Only the text before the first ";" is used, and it is split at the
   * first "=" so that values which themselves contain "=" stay intact.
   *
   * @param {string[]} cookieArr - Raw Set-Cookie header values.
   */
  extractCookie(cookieArr) {
    for (const cookie of cookieArr) {
      const pair = cookie.split(";")[0];
      const separator = pair.indexOf("=");
      if (separator === -1) {
        continue; // no name=value pair to store
      }
      this.cookiePair.set(pair.slice(0, separator), pair.slice(separator + 1));
    }
  }

  /**
   * Full crawl for one job: walk the result pages, store everything
   * collected, then switch the job to incremental polling.
   *
   * @param {object} props - Job descriptor (`name`, `info`, `options`).
   */
  loopFetchFull(props) {
    console.log("开始全量爬取");
    loopCall(this.getInfo.bind(this), {
      time: config.fullFetchTime,
      pagenumber: 1,
      additional: props.options,
      stopWhen: (pagenumber, result) =>
        pagenumber >= result.pages || pagenumber >= config.pageNumberLimit,
      readyForNext: (pagenumber, result) => {
        props.info.push(...result.info);
        return pagenumber + 1;
      },
      complete: (result) => {
        // The final page is delivered here, not through readyForNext.
        props.info.push(...result.info);
        console.log(`爬取完成,共获取 ${props.info.length} 条有效数据`);
        try {
          if (props.info.length > 0) {
            this.queue.saveAnnouncements(props.name, props.info);
            this.queue.addMessage(props.name, props.info);
          }
        } catch (error) {
          console.error("数据库操作失败:", error);
        }
        this.loopFetchIncrement(props);
      },
    }).catch((error) => {
      // The loop is asynchronous, so failures surface as a rejection.
      console.error(`${props.options.name}全量爬取失败:`, error);
    });
  }

  /**
   * Incremental polling for one job: keep only announcements the store does
   * not know yet, save and queue them. The loop has no stop condition.
   *
   * @param {object} props - Job descriptor (`name`, `info`, `options`).
   */
  loopFetchIncrement(props) {
    console.log("开始增量爬取");
    loopCall(this.getInfo.bind(this), {
      time: config.incrementFetchTime,
      pagenumber: 1,
      additional: props.options,
      readyForNext: (pagenumber, result) => {
        try {
          const newInfo = this.queue.filterNewAnnouncements(
            props.name,
            result.info
          );
          if (newInfo.length === 0) {
            console.log("没有发现新数据,继续监控...");
            return 1;
          }

          console.log(`发现 ${newInfo.length} 条新数据`);
          this.queue.saveAnnouncements(props.name, newInfo);
          this.queue.addMessage(props.name, newInfo);
          // A page made only of new items means the next page may hold
          // more; otherwise start over from the first page.
          return newInfo.length === result.info.length ? pagenumber + 1 : 1;
        } catch (error) {
          console.error("数据库操作失败:", error);
          // Always hand back a valid page number; returning nothing would
          // make the loop lose its page counter.
          return 1;
        }
      },
    }).catch((error) => {
      console.error(`${props.options.name}增量爬取失败:`, error);
    });
  }

  /**
   * Fetch one result page and extract the announcements whose title matches
   * a keyword.
   *
   * @param {number} [pagenumber=1] - Page to fetch.
   * @param {object} config - Job options (`name`, `url`, `data`).
   * @returns {Promise<{pages: number, info: object[]}>} `pages` is 0 when the
   *   request failed and never exceeds 2.
   */
  async getInfo(pagenumber = 1, config) {
    console.log(`${config.name}--获取第 ${pagenumber} 页数据...`);

    const [error, html] = await this.getList(pagenumber, config);
    if (error) {
      console.error("获取页面数据失败: ", error);
      return { pages: 0, info: [] };
    }

    const $ = cheerio.load(html);
    const total = $("#recommendMsg .info-num-value").text();
    // 15 results per page. Capped at 2 because, per the original note,
    // later pages require a captcha.
    let pages = Math.ceil(total / 15);
    if (pages > 2) {
      pages = 2;
    }

    const info = [];
    $(".project-li").each((index, element) => {
      const entry = $(element);
      const link = entry.find(".project-name0");
      const id = link.attr("href");
      const name = link.attr("title");
      const publishTime = entry.find(".pub-value0").text();
      const leftDay = entry.find(".left-day .emOrange:eq(0)").text();
      // End date = publish date plus the remaining days shown in the list,
      // formatted with the host locale.
      const endTime = new Date(
        +new Date(publishTime) + leftDay * 24 * 60 * 60 * 1000
      ).toLocaleDateString();

      // A missing title attribute simply means "no match".
      if (typeof name === "string" && keywordsInclude(name)) {
        console.log("处理项目:", name, publishTime, endTime);
        info.push({
          id,
          name,
          publishTime,
          endTime,
          urls: "https://www.youzhicai.com" + id,
        });
      }
    });

    return { pages, info };
  }

  /**
   * POST the search form for one page and return the response body.
   *
   * Never rejects: resolves to an `[error, body]` pair. When the first
   * attempt fails, the cookies are refreshed once and the request is
   * repeated; only the error of that second attempt is reported. The form
   * is sent as a copy, so `config.data` is left untouched.
   *
   * @param {number} pagenumber - Page to request; sent as `PageIndex`.
   * @param {object} config - Job options; `config.url` and `config.data` are read.
   * @returns {Promise<Array>} `[null, body]` on success, `[error, null]` on failure.
   */
  async getList(pagenumber, config) {
    const tokenName = "__RequestVerificationToken";
    const data = Object.assign({}, config.data, { PageIndex: pagenumber });
    if (this.cookiePair.get(tokenName)) {
      // The anti-forgery token cookie is echoed back as a form field.
      data[tokenName] = this.cookiePair.get(tokenName);
    }
    const headers = this.buildHeaders();

    const send = async () => {
      const response = await this.axiosInstance({
        url: config.url,
        data,
        method: "post",
        headers,
      });
      return response.data;
    };

    try {
      return [null, await send()];
    } catch (err) {
      console.log("cookie不对");
      try {
        await this.initializeCookie();
        data[tokenName] = this.cookiePair.get(tokenName);
        return [null, await send()];
      } catch (retryErr) {
        return [retryErr, null];
      }
    }
  }
}
|
||||
|
||||
/**
 * Build the job descriptor passed to `YouZhiCai` for one notice type.
 * Every call returns fresh objects, so jobs never share their `info` array
 * or form data.
 *
 * @param {string} name - Job name, used both on the descriptor and in its options.
 * @param {number} noticeType - Value of the `MsNoticeType` form field.
 * @returns {object} Descriptor with `name`, `info` and `options`.
 */
function createYouZhiCaiJob(name, noticeType) {
  return {
    name,
    info: [],
    options: {
      name,
      url: "https://www.youzhicai.com/s/1_1_0_0_.html",
      data: {
        MsProvince: "",
        MsCity: "",
        MsStartDate: "",
        MsEndDate: "",
        AutoOr: 0,
        BackOr: 0,
        NoticeTitle: "",
        searchAccuracy: "precise",
        matchType: "precise",
        TenderType: "",
        MsBidderType: 1,
        MsNoticeType: noticeType,
        MsPublishType: 0,
        MsSingUpType: 1,
        MsSort: 2,
        PageIndex: 1,
        PageSize: 15,
        AgencyId: "",
        SecondSearch: "",
        SecondSearchType: "",
        TotalSize: 10000,
        SearchRange: 3,
        year: "",
        key1: "",
        key2: "",
        key3: "",
      },
    },
  };
}

// Tender announcements.
new YouZhiCai([createYouZhiCaiJob("优质采【招标公告】", 1)]);

// Clarification / change announcements.
new YouZhiCai([createYouZhiCaiJob("优质采【澄清/变更公告】", 5)]);

// Tender project plans.
new YouZhiCai([createYouZhiCaiJob("优质采【招标项目计划】", 7)]);
|
||||
Loading…
Reference in New Issue