const { Logger } = require('../utils/logger');
const { HttpClient } = require('../utils/httpClient');

/**
 * Base class for site-specific crawlers. Subclasses must implement
 * crawl(), crawlCategories(), and crawlNewsList().
 */
class BaseCrawler {
  constructor(config) {
    this.config = config;
    this.logger = new Logger(config.name);
    this.httpClient = new HttpClient({
      timeout: config.timeout || 15000,
      headers: config.headers,
      retryTimes: config.retryTimes || 3
    });
    // Tracks URLs that have already been crawled to avoid duplicate requests.
    this.crawledUrls = new Set();
  }

  // Fetch the raw HTML for a URL, honoring the configured encoding.
  async fetchHtml(url) {
    try {
      const html = await this.httpClient.get(url, { encoding: this.config.encoding });
      return html;
    } catch (error) {
      this.logger.error(`Request failed: ${url}`, error);
      throw error;
    }
  }

  async crawl() {
    throw new Error('crawl() must be implemented by the subclass');
  }

  async crawlCategories() {
    throw new Error('crawlCategories() must be implemented by the subclass');
  }

  async crawlNewsList(category) {
    throw new Error('crawlNewsList(category) must be implemented by the subclass');
  }

  // Prevent duplicate crawling
  markAsCrawled(url) {
    this.crawledUrls.add(url);
  }

  isCrawled(url) {
    return this.crawledUrls.has(url);
  }
}

module.exports = { BaseCrawler };
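
// Usage sketch (illustrative only, not part of this module): a site-specific
// crawler is expected to extend BaseCrawler and implement the abstract methods.
// The class name, category shape, and parsing step below are assumptions.
//
// class ExampleNewsCrawler extends BaseCrawler {
//   async crawlCategories() {
//     // Return the categories this site exposes.
//     return [{ name: 'tech', url: 'https://example.com/tech' }];
//   }
//
//   async crawlNewsList(category) {
//     if (this.isCrawled(category.url)) return [];
//     const html = await this.fetchHtml(category.url);
//     this.markAsCrawled(category.url);
//     // Parse `html` into news items here (site-specific).
//     return [];
//   }
//
//   async crawl() {
//     const categories = await this.crawlCategories();
//     const lists = await Promise.all(categories.map((c) => this.crawlNewsList(c)));
//     return lists.flat();
//   }
// }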