From 7b5048b8eedc8df9462386930266f46fe114f09e Mon Sep 17 00:00:00 2001 From: Nikolai Tschacher Date: Thu, 7 Feb 2019 16:21:56 +0100 Subject: [PATCH] num_keywords are counted now. added to pluggable --- TODO.txt | 20 ++++++-------------- package.json | 2 +- src/modules/se_scraper.js | 6 ++++++ 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/TODO.txt b/TODO.txt index f82501e..44a05cc 100644 --- a/TODO.txt +++ b/TODO.txt @@ -32,22 +32,14 @@ - Bing, Baidu, Google, Duckduckgo 7.2.2019 - - add num_requests to test cases + - add num_requests to test cases [done] + + TODO: - - think about implementing ticker search for: https://quotes.wsj.com/MSFT?mod=searchresults_companyquotes - - add proxy support - add captcha service solving support - check if news instances run the same browser and if we can have one proxy per tab wokers -TODO: - - think whether it makes sense to introduce a generic scraping class? - - is scraping abstractable or is every scraper too unique? - - dont make the same mistakes as with GoogleScraper - - -TODO: - okay its fucking time to make a generic scraping class like in GoogleScraper [done] - i feel like history repeats - - write good test case for google [done] \ No newline at end of file + - write test case for: + - pluggable + - full metadata (log http headers, log ip address) diff --git a/package.json b/package.json index c162f36..380e2f2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.1.12", + "version": "1.1.13", "description": "A simple library using puppeteer to scrape several search engines such as Google, Duckduckgo and Bing.", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 2abce75..0b40c61 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -30,7 +30,10 @@ module.exports = class Scraper { this.results = {}; this.result_rank = 1; + // keep track of the requests done this.num_requests = 0; + // keep track of the keywords searched + this.num_keywords = 0; } async run() { @@ -96,12 +99,15 @@ module.exports = class Scraper { */ async scraping_loop() { for (let keyword of this.config.keywords) { + this.num_keywords++; this.keyword = keyword; this.results[keyword] = {}; this.result_rank = 1; if (this.pluggable.before_keyword_scraped) { await this.pluggable.before_keyword_scraped({ + num_keywords: this.num_keywords, + num_requests: this.num_requests, keyword: keyword, page: this.page, config: this.config,