const { Logger } = require('../utils/logger');
const { HttpClient } = require('../utils/httpClient');

// Base crawler class: wires up a logger, an HTTP client with timeout/retry
// settings, and a Set used to de-duplicate crawled URLs. Subclasses implement
// the crawl* methods declared below.
class BaseCrawler {
  constructor(config) {
    this.config = config;
    this.logger = new Logger(config.name);
    this.httpClient = new HttpClient({
      timeout: config.timeout || 15000,
      headers: config.headers,
      retryTimes: config.retryTimes || 3
    });
    this.crawledUrls = new Set();
  }

  // Fetches the raw HTML of a URL via the shared HTTP client; failures are
  // logged and re-thrown so callers can decide how to handle them.
  async fetchHtml(url) {
    try {
      const html = await this.httpClient.get(url, {
        encoding: this.config.encoding
      });
      return html;
    } catch (error) {
      this.logger.error(`Request failed: ${url}`, error);
      throw error;
    }
  }

  // Abstract methods: subclasses must override these.
  async crawl() {
    throw new Error('The crawl method must be implemented');
  }

  async crawlCategories() {
    throw new Error('The crawlCategories method must be implemented');
  }

  async crawlNewsList(category) {
    throw new Error('The crawlNewsList method must be implemented');
  }

  // Prevent duplicate crawling
  markAsCrawled(url) {
    this.crawledUrls.add(url);
  }

  isCrawled(url) {
    return this.crawledUrls.has(url);
  }
}

module.exports = { BaseCrawler };
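
// Usage sketch (illustrative only, not part of this module): a minimal
// hypothetical subclass showing how BaseCrawler is meant to be extended.
// The class name, URLs, and return shapes below are assumptions for
// demonstration.
//
// const { BaseCrawler } = require('./baseCrawler');
//
// class ExampleNewsCrawler extends BaseCrawler {
//   async crawlCategories() {
//     const html = await this.fetchHtml('https://example.com/news');
//     // parse `html` into a list of category objects
//     return [{ name: 'tech', url: 'https://example.com/news/tech' }];
//   }
//
//   async crawlNewsList(category) {
//     if (this.isCrawled(category.url)) return [];
//     const html = await this.fetchHtml(category.url);
//     this.markAsCrawled(category.url);
//     // parse `html` into a list of article entries
//     return [];
//   }
//
//   async crawl() {
//     const categories = await this.crawlCategories();
//     const lists = await Promise.all(categories.map((c) => this.crawlNewsList(c)));
//     return lists.flat();
//   }
// }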