m
This commit is contained in:
@ -1,5 +1,4 @@
|
|||||||
const site1 = require('./sites/site1.js');
|
const site1 = require('./sites/site1.js');
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
// 全局配置
|
// 全局配置
|
||||||
global: {
|
global: {
|
||||||
@ -14,10 +13,9 @@ module.exports = {
|
|||||||
|
|
||||||
// API配置
|
// API配置
|
||||||
api: {
|
api: {
|
||||||
baseUrl: process.env.API_BASE_URL || 'https://api.example.com',
|
baseUrl: process.env.API_BASE_URL || 'http://192.168.1.19:8989',
|
||||||
endpoints: {
|
endpoints: {
|
||||||
saveNews: '/news/save',
|
batchSave: '/api/v1/industry'
|
||||||
batchSave: '/news/batch-save'
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
@ -1,13 +1,28 @@
|
|||||||
|
/**
 * Pool of browser User-Agent strings the crawler can present to target sites.
 * Frozen so the shared list cannot be mutated at runtime.
 */
const userAgents = Object.freeze([
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
  'Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
]);

/**
 * Picks one User-Agent string uniformly at random.
 *
 * @param {readonly string[]} [pool=userAgents] - Candidate strings to choose from.
 * @returns {string|undefined} A random entry of `pool`, or `undefined` when `pool` is empty.
 */
function getRandomUserAgent(pool = userAgents) {
  return pool[Math.floor(Math.random() * pool.length)];
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
name: '资讯-中国经济循环协会',
|
name: '资讯-中国经济循环协会',
|
||||||
baseUrl: 'https://www.chinacace.org/news/fields?fid=1&page=1',
|
baseUrl: 'https://www.chinacace.org',
|
||||||
encoding: 'utf-8', // 网页编码
|
encoding: 'utf-8', // 网页编码
|
||||||
|
|
||||||
// 分类列表配置
|
// 分类列表配置
|
||||||
categories: {
|
categories: {
|
||||||
url: '/news/fields?fid',
|
firstUrl: '/news/fields?fid=1&page=1',
|
||||||
|
aUrl: '/news/fields?fid',
|
||||||
selector: 'li a', // 分类选择器
|
selector: 'li a', // 分类选择器
|
||||||
maxPages: 5, // 每个分类最大爬取页数
|
maxPages: 50, // 每个分类最大爬取页数
|
||||||
extract: {
|
extract: {
|
||||||
name: 'text', // 分类名称提取方式
|
name: 'text', // 分类名称提取方式
|
||||||
url: 'href' // 分类URL提取方式
|
url: 'href' // 分类URL提取方式
|
||||||
@ -16,18 +31,14 @@ module.exports = {
|
|||||||
|
|
||||||
// 新闻列表配置
|
// 新闻列表配置
|
||||||
newsList: {
|
newsList: {
|
||||||
selector: '.news-item',
|
selector: '.news1',
|
||||||
extract: {
|
extract: {
|
||||||
title: '.title | text',
|
content: '.txtedit'
|
||||||
url: 'a | href',
|
|
||||||
summary: '.summary | text',
|
|
||||||
publishDate: '.date | text',
|
|
||||||
image: 'img | src'
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
// 请求头
|
// 请求头
|
||||||
headers: {
|
headers: {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
|
'User-Agent': getRandomUserAgent()
|
||||||
}
|
}
|
||||||
};
|
};
|
@ -10,7 +10,10 @@ class BaseCrawler {
|
|||||||
headers: config.headers,
|
headers: config.headers,
|
||||||
retryTimes: config.retryTimes || 3
|
retryTimes: config.retryTimes || 3
|
||||||
});
|
});
|
||||||
|
// 已爬取
|
||||||
this.crawledUrls = new Set();
|
this.crawledUrls = new Set();
|
||||||
|
// 爬取失败
|
||||||
|
this.failCrawlUrls = new Set();
|
||||||
}
|
}
|
||||||
|
|
||||||
async fetchHtml(url) {
|
async fetchHtml(url) {
|
||||||
@ -41,6 +44,11 @@ class BaseCrawler {
|
|||||||
markAsCrawled(url) {
|
markAsCrawled(url) {
|
||||||
this.crawledUrls.add(url);
|
this.crawledUrls.add(url);
|
||||||
}
|
}
|
||||||
|
// 爬取失败
|
||||||
|
markAsFailCrawl(url) {
|
||||||
|
this.failCrawlUrls.add(url);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
isCrawled(url) {
|
isCrawled(url) {
|
||||||
return this.crawledUrls.has(url);
|
return this.crawledUrls.has(url);
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
|
const {isValidDateTime,randomDelay} = require('../utils')
|
||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const { BaseCrawler } = require('./baseCrawler');
|
const { BaseCrawler } = require('./baseCrawler');
|
||||||
|
const storageService = require('../services/storageService');
|
||||||
|
|
||||||
class Site1Crawler extends BaseCrawler {
|
class Site1Crawler extends BaseCrawler {
|
||||||
constructor(config) {
|
constructor(config) {
|
||||||
@ -12,44 +14,65 @@ class Site1Crawler extends BaseCrawler {
|
|||||||
// 1. 获取分类
|
// 1. 获取分类
|
||||||
const categories = await this.crawlCategories();
|
const categories = await this.crawlCategories();
|
||||||
this.logger.info(`发现 ${categories.length} 个分类`);
|
this.logger.info(`发现 ${categories.length} 个分类`);
|
||||||
console.log(categories);
|
|
||||||
// 2. 爬取每个分类的新闻
|
// 2.爬取每个分类的分页条数
|
||||||
|
await categories.map(async (category,index) => {
|
||||||
|
if (index > 0) {
|
||||||
|
category.totalPage = await this.crawlTotalPage(category)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
// 避免请求过于频繁
|
||||||
|
await randomDelay();
|
||||||
|
// 3. 爬取每个分类的新闻
|
||||||
for (const category of categories) {
|
for (const category of categories) {
|
||||||
await this.crawlNewsList(category);
|
await this.crawlNewsList(category);
|
||||||
// 避免请求过于频繁
|
// 避免请求过于频繁
|
||||||
await new Promise(resolve => setTimeout(resolve, this.config.minRequestInterval || 1000));
|
await randomDelay();
|
||||||
}
|
}
|
||||||
|
|
||||||
this.logger.info('爬取完成');
|
this.logger.info('爬取完成');
|
||||||
}
|
}
|
||||||
// 分类
|
// 分类
|
||||||
async crawlCategories() {
|
async crawlCategories() {
|
||||||
console.log('this.config',this.config);
|
const url = this.config.baseUrl + this.config.categories.firstUrl
|
||||||
|
|
||||||
const url = this.config.baseUrl;
|
|
||||||
const html = await this.fetchHtml(url);
|
const html = await this.fetchHtml(url);
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
return $(this.config.categories.selector).map((i, el) => {
|
|
||||||
|
const totalPage = Number($('.fany .a1').text().split('/')[1] || 1)
|
||||||
|
let result = []
|
||||||
|
await $(this.config.categories.selector).map((i, el) => {
|
||||||
const name = $(el).text().trim();
|
const name = $(el).text().trim();
|
||||||
const href = $(el).attr(this.config.categories.extract.url)
|
const href = $(el).attr(this.config.categories.extract.url)
|
||||||
|
|
||||||
// 加个判断,只留分类
|
// 加个判断,只留分类
|
||||||
if(!href.includes(this.config.categories.url)) return
|
if(!href.includes(this.config.categories.aUrl)) return
|
||||||
return {
|
result.push({
|
||||||
name,
|
name,
|
||||||
url: href
|
url: href,
|
||||||
};
|
totalPage: 1
|
||||||
|
})
|
||||||
}).get();
|
}).get();
|
||||||
|
result[0].totalPage = totalPage
|
||||||
|
|
||||||
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 分类分页数
|
||||||
|
async crawlTotalPage(category) {
|
||||||
|
const html = await this.fetchHtml(this.config.baseUrl + category.url);
|
||||||
|
const $ = cheerio.load(html);
|
||||||
|
const totalPage = Number($('.fany .a1').text().split('/')[1] || 1)
|
||||||
|
return totalPage
|
||||||
|
}
|
||||||
|
// 获取新闻列表
|
||||||
async crawlNewsList(category) {
|
async crawlNewsList(category) {
|
||||||
this.logger.info(`开始爬取分类: ${category.name}`);
|
this.logger.info(`开始爬取分类新闻列表: ${category.name}/${category.totalPage}`);
|
||||||
|
|
||||||
let page = 1;
|
let page = 1;
|
||||||
const allNews = [];
|
|
||||||
const maxPages = this.config.categories.maxPages || 5;
|
|
||||||
|
|
||||||
while (page <= maxPages) {
|
while (page <= category.totalPage) {
|
||||||
const pageUrl = page === 1 ? category.url : `${category.url}?page=${page}`;
|
const pageUrl = page === 1 ? this.config.baseUrl + category.url : `${this.config.baseUrl}${category.url}&page=${page}`;
|
||||||
|
console.log('pageUrl:',pageUrl);
|
||||||
|
|
||||||
if (this.isCrawled(pageUrl)) {
|
if (this.isCrawled(pageUrl)) {
|
||||||
page++;
|
page++;
|
||||||
@ -57,50 +80,59 @@ class Site1Crawler extends BaseCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
// 新闻列表
|
||||||
|
const newsItems = []
|
||||||
const html = await this.fetchHtml(pageUrl);
|
const html = await this.fetchHtml(pageUrl);
|
||||||
this.markAsCrawled(pageUrl);
|
this.markAsCrawled(pageUrl);
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
for (const el of $(this.config.newsList.selector)) {
|
||||||
|
const detailUrl = $(el).find('a').attr('href')
|
||||||
|
console.log('detailUrl',detailUrl)
|
||||||
|
const detailHtml = await this.fetchHtml(this.config.baseUrl + detailUrl);
|
||||||
|
const d$ = cheerio.load(detailHtml);
|
||||||
|
let source = ''
|
||||||
|
let times = ''
|
||||||
|
await d$('.newstitle i').map(async(j,dEl) => {
|
||||||
|
const t = $(dEl).text()
|
||||||
|
if (t.includes('来源')) {
|
||||||
|
source = t.split(':')[1].trim()
|
||||||
|
} else if (isValidDateTime(t)) {
|
||||||
|
times = t
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
const newsItems = $(this.config.newsList.selector).map((i, el) => {
|
newsItems.push({
|
||||||
const extractField = (selector, attr = 'text') => {
|
|
||||||
const element = $(el).find(selector);
|
|
||||||
return attr === 'text' ? element.text().trim() : element.attr(attr);
|
|
||||||
};
|
|
||||||
|
|
||||||
const newsUrl = extractField('a', 'href');
|
|
||||||
|
|
||||||
return {
|
|
||||||
site: this.config.name,
|
|
||||||
category: category.name,
|
category: category.name,
|
||||||
title: extractField(this.config.newsList.extract.title),
|
title: d$('.newstitle h1').text(),
|
||||||
url: newsUrl.startsWith('http') ? newsUrl : this.config.baseUrl + newsUrl,
|
brief: $(el).find('p').text(),
|
||||||
summary: extractField(this.config.newsList.extract.summary),
|
details: {
|
||||||
publishDate: extractField(this.config.newsList.extract.publishDate) || new Date().toISOString(),
|
source: source,
|
||||||
image: extractField(this.config.newsList.extract.image, 'src')
|
times: times,
|
||||||
};
|
content: d$('.txtedit').toString()
|
||||||
}).get();
|
}
|
||||||
|
})
|
||||||
|
await randomDelay();
|
||||||
|
}
|
||||||
|
await this.saveNews(newsItems)
|
||||||
|
|
||||||
if (newsItems.length === 0) break;
|
await new Promise(resolve => setTimeout(resolve, this.config.minRequestInterval || 1000));
|
||||||
|
|
||||||
allNews.push(...newsItems);
|
|
||||||
this.logger.info(` 第 ${page} 页: 获取 ${newsItems.length} 条新闻`);
|
|
||||||
page++;
|
page++;
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error(` 第 ${page} 页爬取失败`, error);
|
this.logger.error(` 第 ${page} 页爬取失败`, error);
|
||||||
break;
|
this.markAsFailCrawl(pageUrl)
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
// 爬取详情
|
||||||
|
async crawledDetails(pageUrl) {
|
||||||
|
|
||||||
if (allNews.length > 0) {
|
|
||||||
await this.saveNews(allNews);
|
|
||||||
this.logger.info(` ${category.name} 完成: 共 ${allNews.length} 条新闻`);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async saveNews(newsItems) {
|
async saveNews(newsItems) {
|
||||||
// 实际项目中这里调用存储服务
|
// 实际项目中这里调用存储服务
|
||||||
// await storageService.batchSave(newsItems);
|
await storageService.batchSave(newsItems);
|
||||||
console.log('保存新闻:', newsItems.length);
|
console.log('保存新闻:', newsItems.length);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,15 +16,13 @@ class StorageService {
|
|||||||
|
|
||||||
async batchSave(newsItems) {
|
async batchSave(newsItems) {
|
||||||
try {
|
try {
|
||||||
const response = await this.api.post(config.api.endpoints.batchSave, {
|
const response = await this.api.post(config.api.endpoints.batchSave, newsItems);
|
||||||
items: newsItems,
|
|
||||||
source: 'multi-site-crawler'
|
this.logger.info(`批量保存成功: ${newsItems.length} 条`,response.data);
|
||||||
});
|
|
||||||
|
|
||||||
this.logger.info(`批量保存成功: ${newsItems.length} 条`);
|
|
||||||
return response.data;
|
return response.data;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error('批量保存失败', error.response?.data || error.message);
|
this.logger.error('批量保存失败', error);
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
15
src/utils/index.js
Normal file
15
src/utils/index.js
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
/**
 * Reports whether a value can be interpreted as a valid date/time.
 *
 * Strings are parsed with the `Date` constructor, so what counts as parseable beyond
 * ISO 8601 depends on the JavaScript engine. `Date` instances and numeric timestamps are
 * accepted as well. `null`, `undefined`, booleans, blank strings and other objects are
 * rejected (`new Date(null)` would otherwise silently mean the Unix epoch).
 *
 * @param {string|number|Date} datetimeStr - Value to check.
 * @returns {boolean} `true` when the value yields a valid `Date`.
 */
function isValidDateTime(datetimeStr) {
  if (datetimeStr instanceof Date) {
    return !Number.isNaN(datetimeStr.getTime());
  }

  if (typeof datetimeStr === 'string') {
    if (datetimeStr.trim() === '') {
      return false;
    }
  } else if (typeof datetimeStr !== 'number') {
    return false;
  }

  // An invalid Date has a NaN time value; no further check is needed.
  return !Number.isNaN(new Date(datetimeStr).getTime());
}
|
||||||
|
/**
 * Waits for a random number of milliseconds between two bounds (inclusive).
 *
 * The bounds are normalised before use: non-finite values fall back to the defaults,
 * a reversed pair is swapped, and negative values are clamped to 0.
 *
 * @param {number} [min=5000] - Lower bound in milliseconds.
 * @param {number} [max=20000] - Upper bound in milliseconds.
 * @returns {Promise<number>} Resolves, after the wait, with the delay that was used.
 */
async function randomDelay(min = 5000, max = 20000) {
  const first = Number.isFinite(min) ? min : 5000;
  const second = Number.isFinite(max) ? max : 20000;
  const lower = Math.max(0, Math.min(first, second));
  const upper = Math.max(0, Math.max(first, second));

  const delay = Math.floor(Math.random() * (upper - lower + 1)) + lower;
  await new Promise((resolve) => setTimeout(resolve, delay));
  return delay;
}
|
||||||
|
|
||||||
|
|
||||||
|
module.exports = { isValidDateTime, randomDelay };
|
Reference in New Issue
Block a user