diff --git a/src/config/index.js b/src/config/index.js index 9436651..8d1eafd 100644 --- a/src/config/index.js +++ b/src/config/index.js @@ -1,5 +1,4 @@ const site1 = require('./sites/site1.js'); - module.exports = { // 全局配置 global: { @@ -14,10 +13,9 @@ module.exports = { // API配置 api: { - baseUrl: process.env.API_BASE_URL || 'https://api.example.com', + baseUrl: process.env.API_BASE_URL || 'http://192.168.1.19:8989', endpoints: { - saveNews: '/news/save', - batchSave: '/news/batch-save' + batchSave: '/api/v1/industry' } } }; \ No newline at end of file diff --git a/src/config/sites/site1.js b/src/config/sites/site1.js index a14e17a..d327779 100644 --- a/src/config/sites/site1.js +++ b/src/config/sites/site1.js @@ -1,13 +1,28 @@ +const userAgents = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15', + 'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0', +]; + +function getRandomUserAgent() { + return userAgents[Math.floor(Math.random() * userAgents.length)]; +} + module.exports = { name: '资讯-中国经济循环协会', - baseUrl: 'https://www.chinacace.org/news/fields?fid=1&page=1', + baseUrl: 'https://www.chinacace.org', encoding: 'utf-8', // 网页编码 // 分类列表配置 categories: { - url: '/news/fields?fid', + firstUrl: '/news/fields?fid=1&page=1', + aUrl: '/news/fields?fid', selector: 'li a', // 分类选择器 - maxPages: 5, // 每个分类最大爬取页数 + maxPages: 50, // 每个分类最大爬取页数 extract: { name: 'text', // 分类名称提取方式 url: 'href' // 分类URL提取方式 @@ -16,18 +31,14 @@ module.exports = { 
// 新闻列表配置 newsList: { - selector: '.news-item', + selector: '.news1', extract: { - title: '.title | text', - url: 'a | href', - summary: '.summary | text', - publishDate: '.date | text', - image: 'img | src' + content: '.txtedit' } }, // 请求头 headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' + 'User-Agent': getRandomUserAgent() } }; \ No newline at end of file diff --git a/src/crawlers/baseCrawler.js b/src/crawlers/baseCrawler.js index e7a5a2e..2c55c32 100644 --- a/src/crawlers/baseCrawler.js +++ b/src/crawlers/baseCrawler.js @@ -10,7 +10,10 @@ class BaseCrawler { headers: config.headers, retryTimes: config.retryTimes || 3 }); + // 已爬取 this.crawledUrls = new Set(); + // 爬取失败 + this.failCrawlUrls = new Set(); } async fetchHtml(url) { @@ -41,6 +44,11 @@ class BaseCrawler { markAsCrawled(url) { this.crawledUrls.add(url); } + // 爬取失败 + markAsFailCrawl(url) { + this.failCrawlUrls.add(url); + } + isCrawled(url) { return this.crawledUrls.has(url); diff --git a/src/crawlers/site1Crawler.js b/src/crawlers/site1Crawler.js index 0124cf3..3279cef 100644 --- a/src/crawlers/site1Crawler.js +++ b/src/crawlers/site1Crawler.js @@ -1,7 +1,9 @@ +const {isValidDateTime,randomDelay} = require('../utils') const cheerio = require('cheerio'); const { BaseCrawler } = require('./baseCrawler'); +const storageService = require('../services/storageService'); -class Site1Crawler extends BaseCrawler { +class Site1Crawler extends BaseCrawler { constructor(config) { super(config); } @@ -12,95 +14,125 @@ class Site1Crawler extends BaseCrawler { // 1. 获取分类 const categories = await this.crawlCategories(); this.logger.info(`发现 ${categories.length} 个分类`); - console.log(categories); - // 2. 爬取每个分类的新闻 + + // 2.爬取每个分类的分页条数 + await categories.map(async (category,index) => { + if (index > 0) { + category.totalPage = await this.crawlTotalPage(category) + } + }) + // 避免请求过于频繁 + await randomDelay(); + // 3. 
爬取每个分类的新闻 for (const category of categories) { await this.crawlNewsList(category); // 避免请求过于频繁 - await new Promise(resolve => setTimeout(resolve, this.config.minRequestInterval || 1000)); + await randomDelay(); } this.logger.info('爬取完成'); } // 分类 async crawlCategories() { - console.log('this.config',this.config); - - const url = this.config.baseUrl; + const url = this.config.baseUrl + this.config.categories.firstUrl const html = await this.fetchHtml(url); const $ = cheerio.load(html); - return $(this.config.categories.selector).map((i, el) => { + + const totalPage = Number($('.fany .a1').text().split('/')[1] || 1) + let result = [] + await $(this.config.categories.selector).map((i, el) => { const name = $(el).text().trim(); const href = $(el).attr(this.config.categories.extract.url) + // 加个判断,只留分类 - if(!href.includes(this.config.categories.url)) return - return { + if(!href.includes(this.config.categories.aUrl)) return + result.push({ name, - url: href - }; + url: href, + totalPage: 1 + }) }).get(); - } + result[0].totalPage = totalPage + return result + } + + // 分类分页数 + async crawlTotalPage(category) { + const html = await this.fetchHtml(this.config.baseUrl + category.url); + const $ = cheerio.load(html); + const totalPage = Number($('.fany .a1').text().split('/')[1] || 1) + return totalPage + } + // 获取新闻列表 async crawlNewsList(category) { - this.logger.info(`开始爬取分类: ${category.name}`); + this.logger.info(`开始爬取分类新闻列表: ${category.name}/${category.totalPage}`); let page = 1; - const allNews = []; - const maxPages = this.config.categories.maxPages || 5; - while (page <= maxPages) { - const pageUrl = page === 1 ? category.url : `${category.url}?page=${page}`; + while (page <= category.totalPage) { + const pageUrl = page === 1 ? 
this.config.baseUrl + category.url : `${this.config.baseUrl}${category.url}&page=${page}`; + console.log('pageUrl:',pageUrl); if (this.isCrawled(pageUrl)) { page++; continue; } - + try { + // 新闻列表 + const newsItems = [] const html = await this.fetchHtml(pageUrl); this.markAsCrawled(pageUrl); const $ = cheerio.load(html); - - const newsItems = $(this.config.newsList.selector).map((i, el) => { - const extractField = (selector, attr = 'text') => { - const element = $(el).find(selector); - return attr === 'text' ? element.text().trim() : element.attr(attr); - }; - - const newsUrl = extractField('a', 'href'); - - return { - site: this.config.name, + for (const el of $(this.config.newsList.selector)) { + const detailUrl = $(el).find('a').attr('href') + console.log('detailUrl',detailUrl) + const detailHtml = await this.fetchHtml(this.config.baseUrl + detailUrl); + const d$ = cheerio.load(detailHtml); + let source = '' + let times = '' + await d$('.newstitle i').map(async(j,dEl) => { + const t = $(dEl).text() + if (t.includes('来源')) { + source = t.split(':')[1].trim() + } else if (isValidDateTime(t)) { + times = t + } + }) + + newsItems.push({ category: category.name, - title: extractField(this.config.newsList.extract.title), - url: newsUrl.startsWith('http') ? 
newsUrl : this.config.baseUrl + newsUrl, - summary: extractField(this.config.newsList.extract.summary), - publishDate: extractField(this.config.newsList.extract.publishDate) || new Date().toISOString(), - image: extractField(this.config.newsList.extract.image, 'src') - }; - }).get(); - - if (newsItems.length === 0) break; - - allNews.push(...newsItems); - this.logger.info(` 第 ${page} 页: 获取 ${newsItems.length} 条新闻`); + title: d$('.newstitle h1').text(), + brief: $(el).find('p').text(), + details: { + source: source, + times: times, + content: d$('.txtedit').toString() + } + }) + await randomDelay(); + } + await this.saveNews(newsItems) + + await new Promise(resolve => setTimeout(resolve, this.config.minRequestInterval || 1000)); page++; } catch (error) { this.logger.error(` 第 ${page} 页爬取失败`, error); - break; + this.markAsFailCrawl(pageUrl) + continue; } } + } + // 爬取详情 + async crawledDetails(pageUrl) { - if (allNews.length > 0) { - await this.saveNews(allNews); - this.logger.info(` ${category.name} 完成: 共 ${allNews.length} 条新闻`); - } } async saveNews(newsItems) { // 实际项目中这里调用存储服务 - // await storageService.batchSave(newsItems); + await storageService.batchSave(newsItems); console.log('保存新闻:', newsItems.length); } } diff --git a/src/services/storageService.js b/src/services/storageService.js index e1d82ac..f84375e 100644 --- a/src/services/storageService.js +++ b/src/services/storageService.js @@ -16,15 +16,13 @@ class StorageService { async batchSave(newsItems) { try { - const response = await this.api.post(config.api.endpoints.batchSave, { - items: newsItems, - source: 'multi-site-crawler' - }); + const response = await this.api.post(config.api.endpoints.batchSave, newsItems); + + this.logger.info(`批量保存成功: ${newsItems.length} 条`,response.data); - this.logger.info(`批量保存成功: ${newsItems.length} 条`); return response.data; } catch (error) { - this.logger.error('批量保存失败', error.response?.data || error.message); + this.logger.error('批量保存失败', error); throw error; } } diff 
/**
 * Report whether a value can be parsed into a real point in time.
 *
 * `null`, `undefined`, booleans and blank strings are rejected up front.
 * Without that guard `new Date(null)`, `new Date(false)` and `new Date(true)`
 * coerce to epoch timestamps and would be reported as valid dates.
 *
 * NOTE(review): how `Date` parses non-ISO strings is implementation-defined,
 * so free-form text that happens to contain digits may still be accepted by
 * some engines. Consider matching the scraped site's concrete date format.
 *
 * @param {string|number|Date} datetimeStr - Candidate date/time value.
 * @returns {boolean} `true` when the value resolves to a valid date.
 */
function isValidDateTime(datetimeStr) {
  if (datetimeStr == null || typeof datetimeStr === 'boolean') {
    return false;
  }
  if (typeof datetimeStr === 'string' && datetimeStr.trim() === '') {
    return false;
  }
  // An unparseable value yields a Date whose time value is NaN.
  return !Number.isNaN(new Date(datetimeStr).getTime());
}

/**
 * Wait for a random number of milliseconds within an inclusive range.
 *
 * The bounds are normalised first: they may be passed in either order, and
 * negative values are clamped to 0.
 *
 * @param {number} [min=5000] - One bound of the delay range, in milliseconds.
 * @param {number} [max=20000] - The other bound of the delay range, in milliseconds.
 * @returns {Promise<number>} Resolves, once the wait is over, with the delay used.
 */
async function randomDelay(min = 5000, max = 20000) {
  const lower = Math.max(0, Math.min(min, max));
  const upper = Math.max(0, Math.max(min, max));
  const delay = Math.floor(Math.random() * (upper - lower + 1)) + lower;
  await new Promise((resolve) => setTimeout(resolve, delay));
  return delay;
}

module.exports = { isValidDateTime, randomDelay };