djby-crawler/src/crawlers/baseCrawler.js

const { Logger } = require('../utils/logger');
const { HttpClient } = require('../utils/httpClient');

// Abstract base class for site-specific crawlers. Subclasses must
// implement crawl(), crawlCategories(), and crawlNewsList().
class BaseCrawler {
  constructor(config) {
    this.config = config;
    this.logger = new Logger(config.name);
    this.httpClient = new HttpClient({
      timeout: config.timeout || 15000,
      headers: config.headers,
      retryTimes: config.retryTimes || 3
    });
    // URLs already processed in this run, used to skip duplicates
    this.crawledUrls = new Set();
  }
  // Fetch the raw HTML of a page, honoring the configured encoding
  async fetchHtml(url) {
    try {
      const html = await this.httpClient.get(url, {
        encoding: this.config.encoding
      });
      return html;
    } catch (error) {
      this.logger.error(`Request failed: ${url}`, error);
      throw error;
    }
  }
  // The methods below are abstract and must be overridden by subclasses.
  async crawl() {
    throw new Error('crawl() must be implemented by a subclass');
  }

  async crawlCategories() {
    throw new Error('crawlCategories() must be implemented by a subclass');
  }

  async crawlNewsList(category) {
    throw new Error('crawlNewsList() must be implemented by a subclass');
  }
  // Deduplication helpers to prevent crawling the same URL twice
  markAsCrawled(url) {
    this.crawledUrls.add(url);
  }

  isCrawled(url) {
    return this.crawledUrls.has(url);
  }
}
module.exports = { BaseCrawler };
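
For reference, here is a minimal sketch of how a concrete crawler might extend BaseCrawler, assuming the three abstract methods above are the full contract. ExampleCrawler, the example.com URLs, and the hard-coded category list are hypothetical illustrations, not part of this repository.

const { BaseCrawler } = require('./baseCrawler');

// Hypothetical subclass; example.com and the stubbed parsing are illustrative only.
class ExampleCrawler extends BaseCrawler {
  async crawlCategories() {
    const html = await this.fetchHtml('https://example.com/news');
    // A real implementation would parse category links out of `html`;
    // the hard-coded list below is a stand-in.
    return [{ name: 'headlines', url: 'https://example.com/news/headlines' }];
  }

  async crawlNewsList(category) {
    if (this.isCrawled(category.url)) return []; // skip duplicates
    const html = await this.fetchHtml(category.url);
    this.markAsCrawled(category.url);
    // Parsing is site-specific; return whatever item shape the pipeline expects.
    return [];
  }

  async crawl() {
    const categories = await this.crawlCategories();
    const lists = [];
    for (const category of categories) {
      lists.push(await this.crawlNewsList(category));
    }
    return lists.flat();
  }
}

module.exports = { ExampleCrawler };

Such a subclass would be constructed with the config the base class expects, e.g. new ExampleCrawler({ name: 'example' }); timeout, headers, retryTimes, and encoding are optional and fall back to the defaults shown in the constructor.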