This commit is contained in:
2025-07-04 10:41:58 +08:00
parent 8752ed0fb3
commit c99f9d0201
6 changed files with 130 additions and 68 deletions

View File

@ -1,5 +1,4 @@
const site1 = require('./sites/site1.js'); const site1 = require('./sites/site1.js');
module.exports = { module.exports = {
// 全局配置 // 全局配置
global: { global: {
@ -14,10 +13,9 @@ module.exports = {
// API配置 // API配置
api: { api: {
baseUrl: process.env.API_BASE_URL || 'https://api.example.com', baseUrl: process.env.API_BASE_URL || 'http://192.168.1.19:8989',
endpoints: { endpoints: {
saveNews: '/news/save', batchSave: '/api/v1/industry'
batchSave: '/news/batch-save'
} }
} }
}; };

View File

@ -1,13 +1,28 @@
// Pool of browser User-Agent header values that outgoing requests can pick from.
const userAgents = [
  // Short engine-only identifiers
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
  'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36',
  // Full desktop browser identifiers (Chrome, Firefox, Edge)
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
];

/**
 * Pick one entry from `userAgents` using `Math.random()`.
 *
 * @returns {string} One of the User-Agent strings listed in `userAgents`.
 */
function getRandomUserAgent() {
  // Math.random() is in [0, 1), so the floored product is always a valid index.
  const pickedIndex = Math.floor(Math.random() * userAgents.length);
  return userAgents[pickedIndex];
}
module.exports = { module.exports = {
name: '资讯-中国经济循环协会', name: '资讯-中国经济循环协会',
baseUrl: 'https://www.chinacace.org/news/fields?fid=1&page=1', baseUrl: 'https://www.chinacace.org',
encoding: 'utf-8', // 网页编码 encoding: 'utf-8', // 网页编码
// 分类列表配置 // 分类列表配置
categories: { categories: {
url: '/news/fields?fid', firstUrl: '/news/fields?fid=1&page=1',
aUrl: '/news/fields?fid',
selector: 'li a', // 分类选择器 selector: 'li a', // 分类选择器
maxPages: 5, // 每个分类最大爬取页数 maxPages: 50, // 每个分类最大爬取页数
extract: { extract: {
name: 'text', // 分类名称提取方式 name: 'text', // 分类名称提取方式
url: 'href' // 分类URL提取方式 url: 'href' // 分类URL提取方式
@ -16,18 +31,14 @@ module.exports = {
// 新闻列表配置 // 新闻列表配置
newsList: { newsList: {
selector: '.news-item', selector: '.news1',
extract: { extract: {
title: '.title | text', content: '.txtedit'
url: 'a | href',
summary: '.summary | text',
publishDate: '.date | text',
image: 'img | src'
} }
}, },
// 请求头 // 请求头
headers: { headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' 'User-Agent': getRandomUserAgent()
} }
}; };

View File

@ -10,7 +10,10 @@ class BaseCrawler {
headers: config.headers, headers: config.headers,
retryTimes: config.retryTimes || 3 retryTimes: config.retryTimes || 3
}); });
// 已爬取
this.crawledUrls = new Set(); this.crawledUrls = new Set();
// 爬取失败
this.failCrawlUrls = new Set();
} }
async fetchHtml(url) { async fetchHtml(url) {
@ -41,6 +44,11 @@ class BaseCrawler {
markAsCrawled(url) { markAsCrawled(url) {
this.crawledUrls.add(url); this.crawledUrls.add(url);
} }
// 爬取失败
markAsFailCrawl(url) {
this.failCrawlUrls.add(url);
}
isCrawled(url) { isCrawled(url) {
return this.crawledUrls.has(url); return this.crawledUrls.has(url);

View File

@ -1,7 +1,9 @@
const {isValidDateTime,randomDelay} = require('../utils')
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const { BaseCrawler } = require('./baseCrawler'); const { BaseCrawler } = require('./baseCrawler');
const storageService = require('../services/storageService');
class Site1Crawler extends BaseCrawler { class Site1Crawler extends BaseCrawler {
constructor(config) { constructor(config) {
super(config); super(config);
} }
@ -12,44 +14,65 @@ class Site1Crawler extends BaseCrawler {
// 1. 获取分类 // 1. 获取分类
const categories = await this.crawlCategories(); const categories = await this.crawlCategories();
this.logger.info(`发现 ${categories.length} 个分类`); this.logger.info(`发现 ${categories.length} 个分类`);
console.log(categories);
// 2. 爬取每个分类的新闻 // 2.爬取每个分类的分页条数
await categories.map(async (category,index) => {
if (index > 0) {
category.totalPage = await this.crawlTotalPage(category)
}
})
// 避免请求过于频繁
await randomDelay();
// 3. 爬取每个分类的新闻
for (const category of categories) { for (const category of categories) {
await this.crawlNewsList(category); await this.crawlNewsList(category);
// 避免请求过于频繁 // 避免请求过于频繁
await new Promise(resolve => setTimeout(resolve, this.config.minRequestInterval || 1000)); await randomDelay();
} }
this.logger.info('爬取完成'); this.logger.info('爬取完成');
} }
// 分类 // 分类
async crawlCategories() { async crawlCategories() {
console.log('this.config',this.config); const url = this.config.baseUrl + this.config.categories.firstUrl
const url = this.config.baseUrl;
const html = await this.fetchHtml(url); const html = await this.fetchHtml(url);
const $ = cheerio.load(html); const $ = cheerio.load(html);
return $(this.config.categories.selector).map((i, el) => {
const totalPage = Number($('.fany .a1').text().split('/')[1] || 1)
let result = []
await $(this.config.categories.selector).map((i, el) => {
const name = $(el).text().trim(); const name = $(el).text().trim();
const href = $(el).attr(this.config.categories.extract.url) const href = $(el).attr(this.config.categories.extract.url)
// 加个判断,只留分类 // 加个判断,只留分类
if(!href.includes(this.config.categories.url)) return if(!href.includes(this.config.categories.aUrl)) return
return { result.push({
name, name,
url: href url: href,
}; totalPage: 1
})
}).get(); }).get();
result[0].totalPage = totalPage
return result
} }
// 分类分页数
async crawlTotalPage(category) {
const html = await this.fetchHtml(this.config.baseUrl + category.url);
const $ = cheerio.load(html);
const totalPage = Number($('.fany .a1').text().split('/')[1] || 1)
return totalPage
}
// 获取新闻列表
async crawlNewsList(category) { async crawlNewsList(category) {
this.logger.info(`开始爬取分类: ${category.name}`); this.logger.info(`开始爬取分类新闻列表: ${category.name}/${category.totalPage}`);
let page = 1; let page = 1;
const allNews = [];
const maxPages = this.config.categories.maxPages || 5;
while (page <= maxPages) { while (page <= category.totalPage) {
const pageUrl = page === 1 ? category.url : `${category.url}?page=${page}`; const pageUrl = page === 1 ? this.config.baseUrl + category.url : `${this.config.baseUrl}${category.url}&page=${page}`;
console.log('pageUrl:',pageUrl);
if (this.isCrawled(pageUrl)) { if (this.isCrawled(pageUrl)) {
page++; page++;
@ -57,50 +80,59 @@ class Site1Crawler extends BaseCrawler {
} }
try { try {
// 新闻列表
const newsItems = []
const html = await this.fetchHtml(pageUrl); const html = await this.fetchHtml(pageUrl);
this.markAsCrawled(pageUrl); this.markAsCrawled(pageUrl);
const $ = cheerio.load(html); const $ = cheerio.load(html);
for (const el of $(this.config.newsList.selector)) {
const detailUrl = $(el).find('a').attr('href')
console.log('detailUrl',detailUrl)
const detailHtml = await this.fetchHtml(this.config.baseUrl + detailUrl);
const d$ = cheerio.load(detailHtml);
let source = ''
let times = ''
await d$('.newstitle i').map(async(j,dEl) => {
const t = $(dEl).text()
if (t.includes('来源')) {
source = t.split('')[1].trim()
} else if (isValidDateTime(t)) {
times = t
}
})
const newsItems = $(this.config.newsList.selector).map((i, el) => { newsItems.push({
const extractField = (selector, attr = 'text') => {
const element = $(el).find(selector);
return attr === 'text' ? element.text().trim() : element.attr(attr);
};
const newsUrl = extractField('a', 'href');
return {
site: this.config.name,
category: category.name, category: category.name,
title: extractField(this.config.newsList.extract.title), title: d$('.newstitle h1').text(),
url: newsUrl.startsWith('http') ? newsUrl : this.config.baseUrl + newsUrl, brief: $(el).find('p').text(),
summary: extractField(this.config.newsList.extract.summary), details: {
publishDate: extractField(this.config.newsList.extract.publishDate) || new Date().toISOString(), source: source,
image: extractField(this.config.newsList.extract.image, 'src') times: times,
}; content: d$('.txtedit').toString()
}).get(); }
})
await randomDelay();
}
await this.saveNews(newsItems)
if (newsItems.length === 0) break; await new Promise(resolve => setTimeout(resolve, this.config.minRequestInterval || 1000));
allNews.push(...newsItems);
this.logger.info(`${page} 页: 获取 ${newsItems.length} 条新闻`);
page++; page++;
} catch (error) { } catch (error) {
this.logger.error(`${page} 页爬取失败`, error); this.logger.error(`${page} 页爬取失败`, error);
break; this.markAsFailCrawl(pageUrl)
continue;
} }
} }
}
// 爬取详情
async crawledDetails(pageUrl) {
if (allNews.length > 0) {
await this.saveNews(allNews);
this.logger.info(` ${category.name} 完成: 共 ${allNews.length} 条新闻`);
}
} }
async saveNews(newsItems) { async saveNews(newsItems) {
// 实际项目中这里调用存储服务 // 实际项目中这里调用存储服务
// await storageService.batchSave(newsItems); await storageService.batchSave(newsItems);
console.log('保存新闻:', newsItems.length); console.log('保存新闻:', newsItems.length);
} }
} }

View File

@ -16,15 +16,13 @@ class StorageService {
async batchSave(newsItems) { async batchSave(newsItems) {
try { try {
const response = await this.api.post(config.api.endpoints.batchSave, { const response = await this.api.post(config.api.endpoints.batchSave, newsItems);
items: newsItems,
source: 'multi-site-crawler' this.logger.info(`批量保存成功: ${newsItems.length}`,response.data);
});
this.logger.info(`批量保存成功: ${newsItems.length}`);
return response.data; return response.data;
} catch (error) { } catch (error) {
this.logger.error('批量保存失败', error.response?.data || error.message); this.logger.error('批量保存失败', error);
throw error; throw error;
} }
} }

15
src/utils/index.js Normal file
View File

@ -0,0 +1,15 @@
/**
 * Report whether a value can be turned into a valid `Date` by the built-in
 * `Date` constructor.
 *
 * NOTE(review): parsing of non-ISO strings by `Date` is implementation-defined,
 * so loosely formatted text may be accepted by some engines — confirm this is
 * strict enough for the callers.
 *
 * @param {*} datetimeStr - Candidate value, normally a date/time string.
 * @returns {boolean} `true` when the value converts to a valid date, otherwise `false`.
 */
function isValidDateTime(datetimeStr) {
  // `new Date(null)` is the Unix epoch, so nullish input must be rejected
  // explicitly or it would be reported as a valid datetime.
  if (datetimeStr == null) {
    return false;
  }

  // Try to convert the value into a Date object.
  const timestamp = new Date(datetimeStr).getTime();

  // An invalid Date has a NaN time value; that is the only check needed
  // (its string form 'Invalid Date' is derived from the same condition).
  return !Number.isNaN(timestamp);
}
/**
 * Pause for a randomly chosen whole number of milliseconds between `min` and
 * `max` (both inclusive).
 *
 * @param {number} [min=5000] - Lower bound of the wait, in milliseconds.
 * @param {number} [max=20000] - Upper bound of the wait, in milliseconds.
 * @returns {Promise<void>} Resolves once the timer has fired.
 */
async function randomDelay(min = 5000, max = 20000) {
  // `+ 1` makes the upper bound reachable after flooring.
  const span = max - min + 1;
  const waitMs = min + Math.floor(Math.random() * span);

  await new Promise((wake) => {
    setTimeout(wake, waitMs);
  });
}
module.exports = { isValidDateTime, randomDelay };