first commit
src/crawlers/baseCrawler.js · 50 lines · Normal file
@@ -0,0 +1,50 @@
const { Logger } = require('../utils/logger');
const { HttpClient } = require('../utils/httpClient');

class BaseCrawler {
  constructor(config) {
    this.config = config;
    this.logger = new Logger(config.name);
    this.httpClient = new HttpClient({
      timeout: config.timeout || 15000,
      headers: config.headers,
      retryTimes: config.retryTimes || 3
    });
    this.crawledUrls = new Set();
  }

  async fetchHtml(url) {
    try {
      const html = await this.httpClient.get(url, {
        encoding: this.config.encoding
      });
      return html;
    } catch (error) {
      this.logger.error(`Request failed: ${url}`, error);
      throw error;
    }
  }

  async crawl() {
    throw new Error('crawl method must be implemented');
  }

  async crawlCategories() {
    throw new Error('crawlCategories method must be implemented');
  }

  async crawlNewsList(category) {
    throw new Error('crawlNewsList method must be implemented');
  }

  // Prevent duplicate crawling
  markAsCrawled(url) {
    this.crawledUrls.add(url);
  }

  isCrawled(url) {
    return this.crawledUrls.has(url);
  }
}

module.exports = { BaseCrawler };
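
For context, a minimal usage sketch (not part of this commit): a hypothetical subclass that overrides the abstract methods and uses the dedup helpers. The subclass name, URLs, and config values below are illustrative assumptions, not code from this repository.

// Hypothetical subclass, assumed to live next to baseCrawler.js in src/crawlers/.
const { BaseCrawler } = require('./baseCrawler');

class ExampleNewsCrawler extends BaseCrawler {
  async crawlCategories() {
    // Illustrative stub; a real crawler would parse category links from HTML.
    return [{ name: 'tech', url: 'https://example.com/tech' }];
  }

  async crawlNewsList(category) {
    const html = await this.fetchHtml(category.url);
    // Illustrative stub; html would be fed to an HTML parser here.
    return [];
  }

  async crawl() {
    for (const category of await this.crawlCategories()) {
      for (const item of await this.crawlNewsList(category)) {
        if (this.isCrawled(item.url)) continue; // skip already-seen URLs
        this.markAsCrawled(item.url);
      }
    }
  }
}

new ExampleNewsCrawler({ name: 'example', timeout: 10000 }).crawl().catch(console.error);

The base class follows the template method pattern: crawl(), crawlCategories(), and crawlNewsList() define the contract each site-specific crawler must fill in, while fetchHtml() and the crawledUrls set provide shared HTTP transport and duplicate tracking.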