commit 8752ed0fb3a6dda51d58269980799d4fc65ef7cc
Author: chengenghua <735161452@qq.com>
Date:   Wed Jul 2 18:23:02 2025 +0800

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a6d77ca
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+node_modules
+.DS_Store
+dist
+dist-ssr
+*.local
+.history
+
+# Editor directories and files
+.idea
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.local
+
+stats.html
+pnpm-lock.yaml
+package-lock.json
+.stylelintcache
+.eslintcache
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..e2c5d77
--- /dev/null
+++ b/package.json
@@ -0,0 +1,18 @@
+{
+  "name": "djby-crawler",
+  "version": "1.0.0",
+  "main": "src/app.js",
+  "scripts": {
+    "start": "node src/app.js",
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "description": "",
+  "dependencies": {
+    "axios": "^1.10.0",
+    "cheerio": "^1.1.0",
+    "dotenv": "^17.0.1"
+  }
+}
diff --git a/src/app.js b/src/app.js
new file mode 100644
index 0000000..2888141
--- /dev/null
+++ b/src/app.js
@@ -0,0 +1,34 @@
+// Load environment variables before the config module reads process.env
+require('dotenv').config();
+const config = require('./config');
+const { Site1Crawler } = require('./crawlers/site1Crawler');
+const { Logger } = require('./utils/logger');
+
+const logger = new Logger('Main');
+
+async function run() {
+  try {
+    logger.info('Starting multi-site crawler');
+
+    // Instantiate a crawler for each configured site
+    const crawlers = config.sites.map(siteConfig => {
+      switch (siteConfig.name) {
+        case '资讯-中国经济循环协会':
+          return new Site1Crawler(siteConfig);
+        // Add more sites here...
+        default:
+          throw new Error(`Unknown site config: ${siteConfig.name}`);
+      }
+    });
+
+    // Run all crawlers concurrently
+    await Promise.all(crawlers.map(crawler => crawler.crawl()));
+
+    logger.info('All sites crawled');
+  } catch (error) {
+    logger.error('Crawler run failed', error);
+    process.exit(1);
+  }
+}
+
+run();
\ No newline at end of file
diff --git a/src/config/index.js b/src/config/index.js
new file mode 100644
index 0000000..9436651
--- /dev/null
+++ b/src/config/index.js
@@ -0,0 +1,23 @@
+const site1 = require('./sites/site1.js');
+
+module.exports = {
+  // Global settings
+  global: {
+    maxConcurrency: 3,        // Max number of sites crawled concurrently
+    requestTimeout: 15000,    // Request timeout (ms)
+    retryTimes: 3,            // Retry attempts per request
+    minRequestInterval: 1000  // Minimum delay between requests (ms)
+  },
+
+  // Per-site configs
+  sites: [site1],
+
+  // API settings
+  api: {
+    baseUrl: process.env.API_BASE_URL || 'https://api.example.com',
+    endpoints: {
+      saveNews: '/news/save',
+      batchSave: '/news/batch-save'
+    }
+  }
+};
\ No newline at end of file
diff --git a/src/config/sites/site1.js b/src/config/sites/site1.js
new file mode 100644
index 0000000..a14e17a
--- /dev/null
+++ b/src/config/sites/site1.js
@@ -0,0 +1,33 @@
+module.exports = {
+  name: '资讯-中国经济循环协会',
+  baseUrl: 'https://www.chinacace.org/news/fields?fid=1&page=1',
+  encoding: 'utf-8', // Page encoding
+
+  // Category list settings
+  categories: {
+    url: '/news/fields?fid',  // Substring that identifies category links
+    selector: 'li a',         // Category link selector
+    maxPages: 5,              // Max pages to crawl per category
+    extract: {
+      name: 'text',  // How to extract the category name
+      url: 'href'    // How to extract the category URL
+    }
+  },
+
+  // News list settings; extract values use the "selector | attribute" format
+  newsList: {
+    selector: '.news-item',
+    extract: {
+      title: '.title | text',
+      url: 'a | href',
+      summary: '.summary | text',
+      publishDate: '.date | text',
+      image: 'img | src'
+    }
+  },
+
+  // Request headers
+  headers: {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
+  }
+};
\ No newline at end of file
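The extract entries in site1.js use a small "selector | attribute" mini-format: the text before the pipe is a CSS selector scoped to one list item, and the part after it names what to read ('text' or an attribute name). A minimal sketch of how such a spec resolves with cheerio (the markup and class names below are illustrative, not taken from the target site):

// Sketch: resolving a "selector | attribute" spec with cheerio.
const cheerio = require('cheerio');

const $ = cheerio.load('<div class="news-item"><a class="title" href="/news/1">Hello</a></div>');
const item = $('.news-item').first();

function extractField(spec) {
  // Split the spec into a CSS selector and an extraction mode
  const [selector, attr = 'text'] = spec.split('|').map(s => s.trim());
  const element = item.find(selector);
  return attr === 'text' ? element.text().trim() : element.attr(attr);
}

console.log(extractField('.title | text')); // "Hello"
console.log(extractField('a | href'));      // "/news/1"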
diff --git a/src/crawlers/baseCrawler.js b/src/crawlers/baseCrawler.js
new file mode 100644
index 0000000..e7a5a2e
--- /dev/null
+++ b/src/crawlers/baseCrawler.js
@@ -0,0 +1,50 @@
+const { Logger } = require('../utils/logger');
+const { HttpClient } = require('../utils/httpClient');
+
+class BaseCrawler {
+  constructor(config) {
+    this.config = config;
+    this.logger = new Logger(config.name);
+    this.httpClient = new HttpClient({
+      timeout: config.timeout || 15000,
+      headers: config.headers,
+      retryTimes: config.retryTimes || 3
+    });
+    this.crawledUrls = new Set();
+  }
+
+  async fetchHtml(url) {
+    try {
+      const html = await this.httpClient.get(url, {
+        encoding: this.config.encoding
+      });
+      return html;
+    } catch (error) {
+      this.logger.error(`Request failed: ${url}`, error);
+      throw error;
+    }
+  }
+
+  async crawl() {
+    throw new Error('crawl() must be implemented by subclasses');
+  }
+
+  async crawlCategories() {
+    throw new Error('crawlCategories() must be implemented by subclasses');
+  }
+
+  async crawlNewsList(category) {
+    throw new Error('crawlNewsList() must be implemented by subclasses');
+  }
+
+  // Deduplication: avoid crawling the same URL twice
+  markAsCrawled(url) {
+    this.crawledUrls.add(url);
+  }
+
+  isCrawled(url) {
+    return this.crawledUrls.has(url);
+  }
+}
+
+module.exports = { BaseCrawler };
\ No newline at end of file
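BaseCrawler is an abstract template: subclasses override crawl(), crawlCategories(), and crawlNewsList(), and inherit the named logger, the retrying HTTP client, and URL deduplication. A minimal sketch of what a subclass for another site could look like (Site2Crawler and the '.headline' selector are hypothetical placeholders, not part of this commit):

const cheerio = require('cheerio');
const { BaseCrawler } = require('./baseCrawler');

// Hypothetical crawler for a site whose news list fits on a single page
class Site2Crawler extends BaseCrawler {
  async crawl() {
    // fetchHtml and logger come from BaseCrawler
    const html = await this.fetchHtml(this.config.baseUrl);
    const $ = cheerio.load(html);
    const titles = $('.headline').map((i, el) => $(el).text().trim()).get();
    this.logger.info(`Fetched ${titles.length} headlines`);
  }
}

module.exports = { Site2Crawler };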
diff --git a/src/crawlers/site1Crawler.js b/src/crawlers/site1Crawler.js
new file mode 100644
index 0000000..0124cf3
--- /dev/null
+++ b/src/crawlers/site1Crawler.js
@@ -0,0 +1,107 @@
+const cheerio = require('cheerio');
+const { BaseCrawler } = require('./baseCrawler');
+
+class Site1Crawler extends BaseCrawler {
+  async crawl() {
+    this.logger.info('Starting crawl');
+
+    // 1. Fetch the category list
+    const categories = await this.crawlCategories();
+    this.logger.info(`Found ${categories.length} categories`);
+
+    // 2. Crawl the news list of each category
+    for (const category of categories) {
+      await this.crawlNewsList(category);
+      // Throttle requests
+      await new Promise(resolve => setTimeout(resolve, this.config.minRequestInterval || 1000));
+    }
+
+    this.logger.info('Crawl finished');
+  }
+
+  // Categories
+  async crawlCategories() {
+    const url = this.config.baseUrl;
+    const html = await this.fetchHtml(url);
+    const $ = cheerio.load(html);
+    return $(this.config.categories.selector).map((i, el) => {
+      const name = $(el).text().trim();
+      const href = $(el).attr(this.config.categories.extract.url);
+      // Keep only category links (returning undefined drops the item from map())
+      if (!href || !href.includes(this.config.categories.url)) return;
+      return {
+        name,
+        url: new URL(href, this.config.baseUrl).href
+      };
+    }).get();
+  }
+
+  async crawlNewsList(category) {
+    this.logger.info(`Crawling category: ${category.name}`);
+
+    let page = 1;
+    const allNews = [];
+    const maxPages = this.config.categories.maxPages || 5;
+
+    while (page <= maxPages) {
+      // Category URLs already carry a ?fid=... query, so extra params use '&'
+      const pageUrl = page === 1 ? category.url : `${category.url}&page=${page}`;
+
+      if (this.isCrawled(pageUrl)) {
+        page++;
+        continue;
+      }
+
+      try {
+        const html = await this.fetchHtml(pageUrl);
+        this.markAsCrawled(pageUrl);
+        const $ = cheerio.load(html);
+
+        const newsItems = $(this.config.newsList.selector).map((i, el) => {
+          // Specs use the "selector | attribute" format from the site config
+          const extractField = (spec) => {
+            const [selector, attr = 'text'] = spec.split('|').map(s => s.trim());
+            const element = $(el).find(selector);
+            return attr === 'text' ? element.text().trim() : element.attr(attr);
+          };
+
+          const newsUrl = extractField(this.config.newsList.extract.url);
+
+          return {
+            site: this.config.name,
+            category: category.name,
+            title: extractField(this.config.newsList.extract.title),
+            // Resolve relative links against the site base URL
+            url: newsUrl ? new URL(newsUrl, this.config.baseUrl).href : '',
+            summary: extractField(this.config.newsList.extract.summary),
+            publishDate: extractField(this.config.newsList.extract.publishDate) || new Date().toISOString(),
+            image: extractField(this.config.newsList.extract.image)
+          };
+        }).get();
+
+        if (newsItems.length === 0) break;
+
+        allNews.push(...newsItems);
+        this.logger.info(`  Page ${page}: fetched ${newsItems.length} items`);
+        page++;
+
+      } catch (error) {
+        this.logger.error(`  Page ${page} failed`, error);
+        break;
+      }
+    }
+
+    if (allNews.length > 0) {
+      await this.saveNews(allNews);
+      this.logger.info(`  ${category.name} done: ${allNews.length} items in total`);
+    }
+  }
+
+  async saveNews(newsItems) {
+    // In a real deployment this would call the storage service:
+    // await storageService.batchSave(newsItems);
+    console.log('Saving news items:', newsItems.length);
+  }
+}
+
+module.exports = { Site1Crawler };
\ No newline at end of file
diff --git a/src/services/storageService.js b/src/services/storageService.js
new file mode 100644
index 0000000..e1d82ac
--- /dev/null
+++ b/src/services/storageService.js
@@ -0,0 +1,33 @@
+const axios = require('axios');
+const config = require('../config');
+const { Logger } = require('../utils/logger');
+
+class StorageService {
+  constructor() {
+    this.api = axios.create({
+      baseURL: config.api.baseUrl,
+      timeout: config.global.requestTimeout,
+      headers: {
+        'Content-Type': 'application/json'
+      }
+    });
+    this.logger = new Logger('Storage');
+  }
+
+  async batchSave(newsItems) {
+    try {
+      const response = await this.api.post(config.api.endpoints.batchSave, {
+        items: newsItems,
+        source: 'multi-site-crawler'
+      });
+
+      this.logger.info(`Batch save succeeded: ${newsItems.length} items`);
+      return response.data;
+    } catch (error) {
+      this.logger.error('Batch save failed', error.response?.data || error.message);
+      throw error;
+    }
+  }
+}
+
+module.exports = new StorageService();
\ No newline at end of file
diff --git a/src/utils/httpClient.js b/src/utils/httpClient.js
new file mode 100644
index 0000000..032760b
--- /dev/null
+++ b/src/utils/httpClient.js
@@ -0,0 +1,46 @@
+const axios = require('axios');
+const { Logger } = require('./logger');
+
+class HttpClient {
+  constructor(config) {
+    this.instance = axios.create({
+      timeout: config.timeout,
+      headers: config.headers
+    });
+    this.retryTimes = config.retryTimes || 3;
+    this.logger = new Logger('HTTP');
+  }
+
+  async get(url, options = {}) {
+    let lastError;
+
+    for (let i = 0; i < this.retryTimes; i++) {
+      try {
+        // Note: a custom `encoding` option is silently ignored by axios;
+        // non-UTF-8 pages would need responseType 'arraybuffer' plus a decoder (e.g. iconv-lite)
+        const response = await this.instance.get(url, {
+          responseType: 'text',
+          ...options
+        });
+
+        if (response.status === 200) {
+          return response.data;
+        }
+
+        throw new Error(`HTTP ${response.status}`);
+      } catch (error) {
+        lastError = error;
+        // Back off only if another attempt remains
+        if (i < this.retryTimes - 1) {
+          const waitTime = 1000 * (i + 1);
+          this.logger.warn(`Request failed (${i + 1}/${this.retryTimes}), retrying in ${waitTime}ms: ${url}`);
+          await new Promise(resolve => setTimeout(resolve, waitTime));
+        }
+      }
+    }
+
+    throw lastError;
+  }
+}
+
+module.exports = { HttpClient };
\ No newline at end of file
diff --git a/src/utils/logger.js b/src/utils/logger.js
new file mode 100644
index 0000000..53abca9
--- /dev/null
+++ b/src/utils/logger.js
@@ -0,0 +1,24 @@
+class Logger {
+  constructor(name = 'Application') {
+    this.name = name;
+  }
+
+  // Extra arguments (e.g. Error objects) are forwarded to the console
+  log(message, ...args) {
+    console.log(`[${this.name}] ${message}`, ...args);
+  }
+
+  info(message, ...args) {
+    console.info(`[${this.name}] INFO: ${message}`, ...args);
+  }
+
+  warn(message, ...args) {
+    console.warn(`[${this.name}] WARN: ${message}`, ...args);
+  }
+
+  error(message, ...args) {
+    console.error(`[${this.name}] ERROR: ${message}`, ...args);
+  }
+}
+
+module.exports = { Logger };
\ No newline at end of file
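One loose end: config.global.maxConcurrency is defined in src/config/index.js, but src/app.js starts every crawler at once with Promise.all. A minimal batching sketch that would honor the limit (runWithLimit is an illustrative helper, not part of the commit above):

// Sketch: run crawlers in batches of `maxConcurrency` instead of all at once.
// `crawlers` and `config` refer to the objects built in src/app.js.
async function runWithLimit(crawlers, maxConcurrency) {
  for (let i = 0; i < crawlers.length; i += maxConcurrency) {
    const batch = crawlers.slice(i, i + maxConcurrency);
    await Promise.all(batch.map(crawler => crawler.crawl()));
  }
}

// Usage inside run(): await runWithLimit(crawlers, config.global.maxConcurrency);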