too late to find a proper commit description

This commit is contained in:
Nikolai Tschacher 2019-09-23 23:38:38 +02:00
parent 95a5ee56d8
commit 5e47c27c70
6 changed files with 167 additions and 57 deletions

View File

@ -4,14 +4,13 @@ const se_scraper = require('./../src/node_scraper.js');
let browser_config = {
debug_level: 1,
test_evasion: false,
log_http_headers: true,
log_ip_address: true,
log_http_headers: false,
log_ip_address: false,
random_user_agent: false,
apply_evasion_techniques: true,
screen_output: false,
html_output: true,
html_output: false,
clean_html_output: true,
compress: true,
};
let scrape_job = {

View File

@ -2,7 +2,9 @@
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/test/static_tests/html" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>

View File

@ -3,52 +3,106 @@ const Scraper = require('./se_scraper');
class BingScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
async parse_async(html) {
// perform queries
const results = [];
$('#b_content #b_results .b_algo').each((i, link) => {
results.push({
link: $(link).find('h2 a').attr('href'),
title: $(link).find('h2').text(),
snippet: $(link).find('.b_caption p').text(),
visible_link: $(link).find('cite').text(),
})
let results = await this.page.evaluate(() => {
let _text = (el, s) => {
let n = el.querySelector(s);
if (n) {
return n.innerText;
} else {
return '';
}
};
let _attr = (el, s, attr) => {
let n = el.querySelector(s);
if (n) {
return n.getAttribute(attr);
} else {
return null;
}
};
let results = {
num_results: '',
no_results: false,
effective_query: '',
results: [],
ads: [],
right_side_ads: [],
};
let num_results_el = document.querySelector('#b_content .sb_count');
if (num_results_el) {
results.num_results = num_results_el.innerText;
}
let organic_results = document.querySelectorAll('#b_content #b_results .b_algo');
organic_results.forEach((el) => {
let serp_obj = {
link: _attr(el, 'h2 a', 'href'),
title: _text(el, 'h2'),
snippet: _text(el, '.b_caption p'),
visible_link: _text(el, 'cite'),
};
results.results.push(serp_obj);
});
// check if no results
results.no_results = (results.results.length === 0);
// parse bing ads
const ads = [];
$('.b_ad .sb_add').each((i, element) => {
ads.push({
visible_link: $(element).find('.b_adurl cite').text(),
tracking_link: $(element).find('h2 a').attr('href'),
//link: $(element).find('link').attr('href'),
title: $(element).find('h2 a').text(),
snippet: $(element).find('.b_caption').text(),
})
let ads = document.querySelectorAll('#b_results .b_ad .sb_add');
ads.forEach((el) => {
let ad_obj = {
title: _text(el, 'h2 a'),
snippet: _text(el, '.b_caption p'),
visible_link: _text(el, '.b_adurl cite'),
tracking_link: _attr(el, 'h2 a', 'href'),
};
results.ads.push(ad_obj);
});
// 'Including results for', 'Einschließlich Ergebnisse'
let no_results = this.no_results(
['There are no results', 'Es gibt keine Ergebnisse'],
$('#b_results').text()
);
// right side ads
let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add');
let effective_query = $('#sp_requery a').first().text() || '';
right_side_ads.forEach((el) => {
const cleaned = this.clean_results(results, ['title', 'link']);
const ads_cleaned = this.clean_results(ads, ['title', 'visible_link', 'tracking_link']);
let ad_obj = {
title: _text(el, 'h2 a'),
snippet: _text(el, '.b_caption p'),
visible_link: _text(el, '.b_adurl cite'),
tracking_link: _attr(el, 'h2 a', 'href'),
};
return {
time: (new Date()).toUTCString(),
no_results: no_results,
effective_query: effective_query,
num_results: $('#b_content .sb_count').text(),
results: cleaned,
ads: ads_cleaned,
results.right_side_ads.push(ad_obj);
});
let effective_query_el = document.querySelector('#sp_requery a');
if (effective_query_el) {
results.effective_query = effective_query_el.innerText;
}
return results;
});
results.results = this.clean_results(results.results, ['title', 'link']);
results.ads = this.clean_results(results.ads, ['title', 'visible_link', 'tracking_link']);
results.time = (new Date()).toUTCString();
return results;
}
async load_start_page() {

View File

@ -76,11 +76,10 @@ class ScrapeManager {
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.
sleep_range: undefined,
sleep_range: null,
// which search engine to scrape
search_engine: 'google',
search_engine_name: 'google',
compress: false, // compress
// whether debug information should be printed
// level 0: print nothing
// level 1: print most important info
@ -114,7 +113,7 @@ class ScrapeManager {
// this module should export the functions:
// get_browser, handle_metadata, close_browser
//custom_func: resolve('examples/pluggable.js'),
custom_func: undefined,
custom_func: null,
throw_on_detection: false,
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'

View File

@ -34,6 +34,11 @@ async function bing_ads() {
bing_search_with_ads3( await scraper.scrape(scrape_config) );
scrape_config.keywords = ['service auto garage'];
scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html');
bing_search_with_ads4( await scraper.scrape(scrape_config) );
await scraper.quit();
}
@ -52,7 +57,9 @@ function bing_search_with_ads(response) {
assert.include(obj.num_results, '1100000', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads');
assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
@ -79,7 +86,9 @@ function bing_search_with_ads2(response) {
assert.include(obj.num_results, '44300000', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads');
assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads');
assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
@ -102,7 +111,7 @@ function bing_search_with_ads3(response) {
let obj = response.results[query][page_number];
assert.include(obj.num_results, '65.500.000 results', 'num results not included');
assert.include(obj.num_results, '65.500.000 Results', 'num results not included');
assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');
@ -117,6 +126,32 @@ function bing_search_with_ads3(response) {
}
}
/**
 * Checks the scrape response produced for the fourth Bing ads test case.
 *
 * Walks every query / page entry in `response.results` and asserts that the
 * parsed SERP object carries the expected result count string, all required
 * keys, a minimum number of organic results and ads, and a parseable
 * timestamp. Per-entry field validation is delegated to `confirm_results_ok`.
 *
 * NOTE(review): `assert` is assumed to be the Chai assert interface imported
 * elsewhere in this file — confirm.
 *
 * @param {Object} response - scrape response with `metadata.num_requests`
 *     and a `results` object keyed by query, then by page number.
 */
function bing_search_with_ads4(response) {
    assert.equal(response.metadata.num_requests, 1);

    for (let query in response.results) {
        for (let page_number in response.results[query]) {
            // A type-only check (isNumber) accepts NaN and can never fail,
            // so test the parsed value for NaN explicitly; always pass the radix.
            assert.isFalse(Number.isNaN(parseInt(page_number, 10)), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included');
            // right_side_ads is iterated by confirm_results_ok, so require it here
            // to fail with a readable message instead of a TypeError later on.
            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads', 'right_side_ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
            assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');

            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse returns NaN (still typeof 'number') for unparseable input,
            // so a typeOf check alone would accept an invalid date.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            confirm_results_ok(obj);
        }
    }
}
function confirm_results_ok(obj) {
@ -160,6 +195,25 @@ function confirm_results_ok(obj) {
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
for (let res of obj.right_side_ads) {
assert.isOk(res.tracking_link, 'link must be ok');
assert.typeOf(res.tracking_link, 'string', 'link must be string');
assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.visible_link, 'link must be ok');
assert.typeOf(res.visible_link, 'string', 'link must be string');
assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
}
describe('Bing', function(){

View File

@ -73,9 +73,11 @@ function normal_search_test_case(response) {
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');
if (res.snippet) {
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
}
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank ist wrong');