mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-24 03:21:34 +02:00
fixed issue https://github.com/NikolaiT/se-scraper/issues/37
This commit is contained in:
parent
60a9d52924
commit
59154694f2
11
TODO.md
11
TODO.md
@ -48,6 +48,17 @@
|
|||||||
### 12.6.2019
|
### 12.6.2019
|
||||||
- remove unnecessary sleep() calls and replace with waitFor selectors
|
- remove unnecessary sleep() calls and replace with waitFor selectors
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### 16.7.2019
|
||||||
|
|
||||||
|
- resolve issues
|
||||||
|
- fix this https://github.com/NikolaiT/se-scraper/issues/37
|
||||||
|
|
||||||
|
- use puppeteer stealth plugin
|
||||||
|
- use random user agents plugin
|
||||||
|
- add screenshot capability (take the screenshot after parsing)
|
||||||
|
|
||||||
### TODO:
|
### TODO:
|
||||||
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
1. fix googlenewsscraper waiting for results and parsing. remove the static sleep [done]
|
||||||
2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions
|
2. when using multiple browsers and random user agent, pass a random user agent to each perBrowserOptions
|
||||||
|
@ -3,15 +3,14 @@ const se_scraper = require('./../src/node_scraper.js');
|
|||||||
(async () => {
|
(async () => {
|
||||||
let browser_config = {
|
let browser_config = {
|
||||||
search_engine: 'google',
|
search_engine: 'google',
|
||||||
debug_level: 2,
|
debug_level: 1,
|
||||||
sleep_range: '',
|
|
||||||
output_file: '',
|
|
||||||
random_user_agent: true,
|
random_user_agent: true,
|
||||||
is_local: false,
|
is_local: false,
|
||||||
|
html_output: false,
|
||||||
throw_on_detection: false,
|
throw_on_detection: false,
|
||||||
headless: false,
|
headless: true,
|
||||||
puppeteer_cluster_config: {
|
puppeteer_cluster_config: {
|
||||||
headless: false,
|
headless: true,
|
||||||
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
|
||||||
monitor: false,
|
monitor: false,
|
||||||
concurrency: 3, // 3 == CONCURRENCY_BROWSER
|
concurrency: 3, // 3 == CONCURRENCY_BROWSER
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "se-scraper",
|
"name": "se-scraper",
|
||||||
"version": "1.3.15",
|
"version": "1.4.0",
|
||||||
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
|
||||||
"homepage": "https://scrapeulous.com/",
|
"homepage": "https://scrapeulous.com/",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
@ -71,10 +71,7 @@ module.exports = class Scraper {
|
|||||||
await this.scraping_loop();
|
await this.scraping_loop();
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return this.results;
|
||||||
'results': this.results,
|
|
||||||
'html_output': this.html_output,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -161,7 +158,6 @@ module.exports = class Scraper {
|
|||||||
this.num_keywords++;
|
this.num_keywords++;
|
||||||
this.keyword = keyword;
|
this.keyword = keyword;
|
||||||
this.results[keyword] = {};
|
this.results[keyword] = {};
|
||||||
this.html_output[keyword] = {};
|
|
||||||
this.result_rank = 1;
|
this.result_rank = 1;
|
||||||
|
|
||||||
if (this.pluggable && this.pluggable.before_keyword_scraped) {
|
if (this.pluggable && this.pluggable.before_keyword_scraped) {
|
||||||
@ -193,14 +189,13 @@ module.exports = class Scraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let html = await this.page.content();
|
let html = await this.page.content();
|
||||||
|
|
||||||
if (this.config.html_output) {
|
|
||||||
this.html_output[keyword][this.page_num] = html;
|
|
||||||
}
|
|
||||||
|
|
||||||
let parsed = this.parse(html);
|
let parsed = this.parse(html);
|
||||||
this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);
|
this.results[keyword][this.page_num] = parsed ? parsed : await this.parse_async(html);
|
||||||
|
|
||||||
|
if (this.config.html_output) {
|
||||||
|
this.results[keyword][this.page_num].html = html;
|
||||||
|
}
|
||||||
|
|
||||||
this.page_num += 1;
|
this.page_num += 1;
|
||||||
|
|
||||||
// only load the next page when we will pass the next iteration
|
// only load the next page when we will pass the next iteration
|
||||||
|
@ -342,7 +342,6 @@ class ScrapeManager {
|
|||||||
Object.assign(this.config, scrape_config);
|
Object.assign(this.config, scrape_config);
|
||||||
|
|
||||||
var results = {};
|
var results = {};
|
||||||
var html_output = {};
|
|
||||||
var num_requests = 0;
|
var num_requests = 0;
|
||||||
var metadata = {};
|
var metadata = {};
|
||||||
|
|
||||||
@ -365,7 +364,6 @@ class ScrapeManager {
|
|||||||
results = res.results;
|
results = res.results;
|
||||||
metadata = this.scraper.metadata;
|
metadata = this.scraper.metadata;
|
||||||
num_requests = this.scraper.num_requests;
|
num_requests = this.scraper.num_requests;
|
||||||
html_output = this.scraper.html_output;
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
|
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
|
||||||
@ -409,10 +407,7 @@ class ScrapeManager {
|
|||||||
|
|
||||||
// Merge results per keyword
|
// Merge results per keyword
|
||||||
for (let promiseReturn of promiseReturns) {
|
for (let promiseReturn of promiseReturns) {
|
||||||
for (let keyword of this.config.keywords) {
|
Object.assign(results, promiseReturn);
|
||||||
results[keyword] = promiseReturn.results[keyword];
|
|
||||||
html_output[keyword] = promiseReturn.html_output[keyword];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// count total requests among all scraper instances
|
// count total requests among all scraper instances
|
||||||
@ -461,7 +456,6 @@ class ScrapeManager {
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
results: results,
|
results: results,
|
||||||
html_output: (this.config.html_output) ? html_output : undefined,
|
|
||||||
metadata: metadata || {},
|
metadata: metadata || {},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -275,9 +275,9 @@ function test_case_ads_test(response) {
|
|||||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||||
|
|
||||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
assert.isOk(res.link, 'visible_link must be ok');
|
||||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||||
|
|
||||||
assert.isOk(res.title, 'title must be ok');
|
assert.isOk(res.title, 'title must be ok');
|
||||||
assert.typeOf(res.title, 'string', 'title must be string');
|
assert.typeOf(res.title, 'string', 'title must be string');
|
||||||
@ -287,7 +287,7 @@ function test_case_ads_test(response) {
|
|||||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||||
|
|
||||||
assert.typeOf(res.links, 'array', 'snippet must be array');
|
assert.typeOf(res.links, 'array', 'links must be array');
|
||||||
}
|
}
|
||||||
|
|
||||||
for (let res of obj.bottom_ads) {
|
for (let res of obj.bottom_ads) {
|
||||||
@ -299,9 +299,9 @@ function test_case_ads_test(response) {
|
|||||||
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
assert.typeOf(res.visible_link, 'string', 'link must be string');
|
||||||
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
|
||||||
|
|
||||||
assert.isOk(res.visible_link, 'visible_link must be ok');
|
assert.isOk(res.link, 'visible_link must be ok');
|
||||||
assert.typeOf(res.visible_link, 'string', 'visible_link must be string');
|
assert.typeOf(res.link, 'string', 'visible_link must be string');
|
||||||
assert.isAtLeast(res.visible_link.length, 5, 'visible_link must have at least 5 chars');
|
assert.isAtLeast(res.link.length, 5, 'visible_link must have at least 5 chars');
|
||||||
|
|
||||||
assert.isOk(res.title, 'title must be ok');
|
assert.isOk(res.title, 'title must be ok');
|
||||||
assert.typeOf(res.title, 'string', 'title must be string');
|
assert.typeOf(res.title, 'string', 'title must be string');
|
||||||
@ -311,7 +311,102 @@ function test_case_ads_test(response) {
|
|||||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||||
|
|
||||||
assert.typeOf(res.links, 'array', 'snippet must be array');
|
assert.typeOf(res.links, 'array', 'links must be array');
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
const product_keywords = ['autoreifen bmw'];
|
||||||
|
|
||||||
|
async function products_test() {
|
||||||
|
let config = {
|
||||||
|
compress: false,
|
||||||
|
debug_level: 1,
|
||||||
|
headless: true,
|
||||||
|
block_assets: false,
|
||||||
|
random_user_agent: false, // dont try to trick google with ads
|
||||||
|
};
|
||||||
|
|
||||||
|
let scrape_config = {
|
||||||
|
search_engine: 'google',
|
||||||
|
keywords: ads_keywords,
|
||||||
|
num_pages: 1,
|
||||||
|
};
|
||||||
|
|
||||||
|
console.log('products_test()');
|
||||||
|
test_case_products_test( await se_scraper.scrape(config, scrape_config) );
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_case_products_test(response) {
|
||||||
|
assert.equal(response.metadata.num_requests, 2);
|
||||||
|
|
||||||
|
for (let query in response.results) {
|
||||||
|
|
||||||
|
assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');
|
||||||
|
|
||||||
|
for (let page_number in response.results[query]) {
|
||||||
|
|
||||||
|
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||||
|
|
||||||
|
let obj = response.results[query][page_number];
|
||||||
|
|
||||||
|
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'top_ads', 'bottom_ads', 'places'], 'not all keys are in the object');
|
||||||
|
|
||||||
|
assert.isAtLeast(obj.results.length, 7, 'results must have at least 7 SERP objects');
|
||||||
|
assert.equal(obj.no_results, false, 'no results should be false');
|
||||||
|
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||||
|
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||||
|
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||||
|
|
||||||
|
assert(obj.top_products.length >= 1 || obj.right_products.length >= 1, 'top_products or right_products must have at least 1 SERP object');
|
||||||
|
|
||||||
|
for (let res of obj.top_products) {
|
||||||
|
|
||||||
|
assert.isOk(res.tracking_link, 'link must be ok');
|
||||||
|
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||||
|
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.link, 'link must be ok');
|
||||||
|
assert.typeOf(res.link, 'string', 'link must be string');
|
||||||
|
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.price, 'price must be ok');
|
||||||
|
assert.typeOf(res.price, 'string', 'price must be string');
|
||||||
|
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.title, 'title must be ok');
|
||||||
|
assert.typeOf(res.title, 'string', 'title must be string');
|
||||||
|
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||||
|
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||||
|
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||||
|
}
|
||||||
|
|
||||||
|
for (let res of obj.right_products) {
|
||||||
|
assert.isOk(res.tracking_link, 'link must be ok');
|
||||||
|
assert.typeOf(res.tracking_link, 'string', 'link must be string');
|
||||||
|
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.link, 'link must be ok');
|
||||||
|
assert.typeOf(res.link, 'string', 'link must be string');
|
||||||
|
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.price, 'price must be ok');
|
||||||
|
assert.typeOf(res.price, 'string', 'price must be string');
|
||||||
|
assert.isAtLeast(res.price.length, 5, 'price must have at least 5 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.title, 'title must be ok');
|
||||||
|
assert.typeOf(res.title, 'string', 'title must be string');
|
||||||
|
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.vendor_link, 'vendor_link must be ok');
|
||||||
|
assert.typeOf(res.vendor_link, 'string', 'vendor_link must be string');
|
||||||
|
assert.isAtLeast(res.vendor_link.length, 10, 'vendor_link must have at least 10 chars');
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -325,4 +420,5 @@ describe('Google', function(){
|
|||||||
it('effective query', effective_query_test);
|
it('effective query', effective_query_test);
|
||||||
it('html output query', html_output_query_test);
|
it('html output query', html_output_query_test);
|
||||||
it('ads', ads_test);
|
it('ads', ads_test);
|
||||||
|
it('products test', products_test);
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user