forked from extern/se-scraper

commit message: too late to find a proper commit description
commit 5e47c27c70, parent 95a5ee56d8
@@ -4,14 +4,13 @@ const se_scraper = require('./../src/node_scraper.js');

 let browser_config = {
     debug_level: 1,
     test_evasion: false,
-    log_http_headers: true,
-    log_ip_address: true,
+    log_http_headers: false,
+    log_ip_address: false,
     random_user_agent: false,
     apply_evasion_techniques: true,
     screen_output: false,
-    html_output: true,
+    html_output: false,
     clean_html_output: true,
     compress: true,
 };

 let scrape_job = {
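For orientation, this is roughly how the example feeds browser_config and scrape_job into the scraper. A minimal sketch only, assuming node_scraper.js exports a ScrapeManager with the start/scrape/quit methods the test files below use; the exact entry point may differ in this fork.

const se_scraper = require('./../src/node_scraper.js');

(async () => {
    let browser_config = {
        debug_level: 1,
        log_http_headers: false,
        log_ip_address: false,
        html_output: false,
        compress: true,
    };

    let scrape_job = {
        search_engine: 'bing',   // assumption: Bing is selected by name
        keywords: ['news'],
        num_pages: 1,
    };

    // assumption: ScrapeManager is the class exported by node_scraper.js
    let scraper = new se_scraper.ScrapeManager(browser_config);
    await scraper.start();
    let results = await scraper.scrape(scrape_job);
    console.dir(results, { depth: null });
    await scraper.quit();
})();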
@@ -2,7 +2,9 @@
 <module type="WEB_MODULE" version="4">
   <component name="NewModuleRootManager" inherit-compiler-output="true">
     <exclude-output />
-    <content url="file://$MODULE_DIR$" />
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/test/static_tests/html" />
+    </content>
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
@@ -3,52 +3,106 @@ const Scraper = require('./se_scraper');

 class BingScraper extends Scraper {

-    parse(html) {
-        // load the page source into cheerio
-        const $ = cheerio.load(html);
+    async parse_async(html) {

-        // perform queries
-        const results = [];
-        $('#b_content #b_results .b_algo').each((i, link) => {
-            results.push({
-                link: $(link).find('h2 a').attr('href'),
-                title: $(link).find('h2').text(),
-                snippet: $(link).find('.b_caption p').text(),
-                visible_link: $(link).find('cite').text(),
-            })
+        let results = await this.page.evaluate(() => {
+
+            let _text = (el, s) => {
+                let n = el.querySelector(s);
+
+                if (n) {
+                    return n.innerText;
+                } else {
+                    return '';
+                }
+            };
+
+            let _attr = (el, s, attr) => {
+                let n = el.querySelector(s);
+
+                if (n) {
+                    return n.getAttribute(attr);
+                } else {
+                    return null;
+                }
+            };
+
+            let results = {
+                num_results: '',
+                no_results: false,
+                effective_query: '',
+                results: [],
+                ads: [],
+                right_side_ads: [],
+            };
+
+            let num_results_el = document.querySelector('#b_content .sb_count');
+
+            if (num_results_el) {
+                results.num_results = num_results_el.innerText;
+            }
+
+            let organic_results = document.querySelectorAll('#b_content #b_results .b_algo');
+
+            organic_results.forEach((el) => {
+
+                let serp_obj = {
+                    link: _attr(el, 'h2 a', 'href'),
+                    title: _text(el, 'h2'),
+                    snippet: _text(el, '.b_caption p'),
+                    visible_link: _text(el, 'cite'),
+                };
+
+                results.results.push(serp_obj);
+            });
+
+            // check if no results
+            results.no_results = (results.results.length === 0);

-        // parse bing ads
-        const ads = [];
-        $('.b_ad .sb_add').each((i, element) => {
-            ads.push({
-                visible_link: $(element).find('.b_adurl cite').text(),
-                tracking_link: $(element).find('h2 a').attr('href'),
-                //link: $(element).find('link').attr('href'),
-                title: $(element).find('h2 a').text(),
-                snippet: $(element).find('.b_caption').text(),
-            })
+            let ads = document.querySelectorAll('#b_results .b_ad .sb_add');
+
+            ads.forEach((el) => {
+
+                let ad_obj = {
+                    title: _text(el, 'h2 a'),
+                    snippet: _text(el, '.b_caption p'),
+                    visible_link: _text(el, '.b_adurl cite'),
+                    tracking_link: _attr(el, 'h2 a', 'href'),
+                };
+
+                results.ads.push(ad_obj);
+            });

-        // 'Including results for', 'Einschließlich Ergebnisse'
-        let no_results = this.no_results(
-            ['There are no results', 'Es gibt keine Ergebnisse'],
-            $('#b_results').text()
-        );
+            // right side ads
+            let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add');

-        let effective_query = $('#sp_requery a').first().text() || '';
+            right_side_ads.forEach((el) => {

-        const cleaned = this.clean_results(results, ['title', 'link']);
-        const ads_cleaned = this.clean_results(ads, ['title', 'visible_link', 'tracking_link']);
+                let ad_obj = {
+                    title: _text(el, 'h2 a'),
+                    snippet: _text(el, '.b_caption p'),
+                    visible_link: _text(el, '.b_adurl cite'),
+                    tracking_link: _attr(el, 'h2 a', 'href'),
+                };

-        return {
-            time: (new Date()).toUTCString(),
-            no_results: no_results,
-            effective_query: effective_query,
-            num_results: $('#b_content .sb_count').text(),
-            results: cleaned,
-            ads: ads_cleaned,
+                results.right_side_ads.push(ad_obj);
+            });
+
+            let effective_query_el = document.querySelector('#sp_requery a');
+
+            if (effective_query_el) {
+                results.effective_query = effective_query_el.innerText;
+            }
+
+            return results;
+        });
+
+        results.results = this.clean_results(results.results, ['title', 'link']);
+        results.ads = this.clean_results(results.ads, ['title', 'visible_link', 'tracking_link']);
+        results.time = (new Date()).toUTCString();
+        return results;
     }

     async load_start_page() {
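The _text/_attr helpers above run inside the browser context via page.evaluate, so they are plain DOM code. As a standalone illustration (not part of this commit), the same extraction pattern can be reproduced with bare puppeteer against a saved SERP; the fixture path and the setContent usage are assumptions for the sketch.

const puppeteer = require('puppeteer');
const fs = require('fs');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();

    // load a locally saved Bing SERP instead of hitting bing.com (path is illustrative)
    await page.setContent(fs.readFileSync('./test/static_tests/html/bing.html', 'utf8'));

    const organic = await page.evaluate(() => {
        // same helper pattern as BingScraper.parse_async
        const _text = (el, s) => { const n = el.querySelector(s); return n ? n.innerText : ''; };
        const _attr = (el, s, a) => { const n = el.querySelector(s); return n ? n.getAttribute(a) : null; };

        return Array.from(document.querySelectorAll('#b_content #b_results .b_algo')).map((el) => ({
            link: _attr(el, 'h2 a', 'href'),
            title: _text(el, 'h2'),
            snippet: _text(el, '.b_caption p'),
            visible_link: _text(el, 'cite'),
        }));
    });

    console.log(organic);
    await browser.close();
})();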
@@ -76,11 +76,10 @@ class ScrapeManager {
            log_http_headers: false,
            // how long to sleep between requests. a random sleep interval within the range [a,b]
            // is drawn before every request. empty string for no sleeping.
-           sleep_range: undefined,
+           sleep_range: null,
            // which search engine to scrape
-           search_engine: 'google',
+           search_engine_name: 'google',
            compress: false, // compress
            // whether debug information should be printed
            // level 0: print nothing
            // level 1: print most important info
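The sleep_range comment describes a random delay drawn from the range [a, b] before every request, with null (previously undefined) meaning no sleeping. A sketch of that draw, with the helper name and the seconds unit as assumptions rather than the library's actual code:

// assumption: sleep_range is either null or a two-element array of seconds, e.g. [2, 5]
async function random_sleep(sleep_range) {
    if (!sleep_range) {
        return; // null disables sleeping
    }
    const [a, b] = sleep_range;
    const seconds = a + Math.random() * (b - a);
    await new Promise((resolve) => setTimeout(resolve, seconds * 1000));
}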
@@ -114,7 +113,7 @@ class ScrapeManager {
            // this module should export the functions:
            // get_browser, handle_metadata, close_browser
            //custom_func: resolve('examples/pluggable.js'),
-           custom_func: undefined,
+           custom_func: null,
            throw_on_detection: false,
            // use a proxy for all connections
            // example: 'socks5://78.94.172.42:1080'
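custom_func points to a pluggable module; per the comment it should export get_browser, handle_metadata and close_browser. A skeleton of such a module sketched from that comment alone; the argument lists are assumptions, since the calling convention in node_scraper.js is not shown here.

// examples/pluggable.js (hypothetical skeleton)
const puppeteer = require('puppeteer');

module.exports = {
    async get_browser(launch_args) {
        // hand a puppeteer Browser instance back to the scraper
        return await puppeteer.launch(launch_args);
    },
    async handle_metadata(metadata) {
        // inspect or persist per-request metadata (e.g. logged HTTP headers / IP)
        console.log(metadata);
    },
    async close_browser(browser) {
        await browser.close();
    },
};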
@@ -34,6 +34,11 @@ async function bing_ads() {

     bing_search_with_ads3( await scraper.scrape(scrape_config) );

+    scrape_config.keywords = ['service auto garage'];
+    scrape_config.scrape_from_file = 'file://' + path.join(__dirname, './html/bing4.html');
+
+    bing_search_with_ads4( await scraper.scrape(scrape_config) );
+
     await scraper.quit();
 }
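scrape_from_file with a file:// URL makes the scraper parse a saved SERP instead of issuing a live query, which is what these static tests rely on. The scrape_config used above would look roughly like this; only keywords and scrape_from_file appear in the diff, the other fields are assumptions.

const path = require('path');

let scrape_config = {
    search_engine: 'bing',   // assumption: engine selected by name
    num_pages: 1,            // assumption
    keywords: ['service auto garage'],
    scrape_from_file: 'file://' + path.join(__dirname, './html/bing4.html'),
};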
@@ -52,7 +57,9 @@ function bing_search_with_ads(response) {
        assert.include(obj.num_results, '1’100’000', 'num results not included');
        assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
        assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
-       assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
+       assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads');
+
+       assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads');
+
        assert.equal(obj.no_results, false, 'no results should be false');
        assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
@@ -79,7 +86,9 @@ function bing_search_with_ads2(response) {
        assert.include(obj.num_results, '44’300’000', 'num results not included');
        assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
        assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
-       assert.isAtLeast(obj.ads.length, 12, 'there are 12 ads');
+       assert.isAtLeast(obj.ads.length, 7, 'there are 7 ads');
+
+       assert.isAtLeast(obj.right_side_ads.length, 5, 'there are 5 ads');
+
        assert.equal(obj.no_results, false, 'no results should be false');
        assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
@@ -102,7 +111,7 @@ function bing_search_with_ads3(response) {

        let obj = response.results[query][page_number];

-       assert.include(obj.num_results, '65.500.000 results', 'num results not included');
+       assert.include(obj.num_results, '65.500.000 Results', 'num results not included');
        assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
        assert.isAtLeast(obj.results.length, 10, 'results must have at least 10 SERP objects');
        assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');
@@ -117,6 +126,32 @@ function bing_search_with_ads3(response) {
     }
 }

+function bing_search_with_ads4(response) {
+    assert.equal(response.metadata.num_requests, 1);
+
+    for (let query in response.results) {
+
+        for (let page_number in response.results[query]) {
+
+            assert.isNumber(parseInt(page_number), 'page_number must be numeric');
+
+            let obj = response.results[query][page_number];
+
+            assert.include(obj.num_results, '4.200.000 Ergebnisse', 'num results not included');
+            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');
+            assert.isAtLeast(obj.results.length, 9, 'results must have at least 9 SERP objects');
+            assert.isAtLeast(obj.ads.length, 3, 'there are 3 ads');
+
+            assert.equal(obj.no_results, false, 'no results should be false');
+            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
+            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
+            assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
+
+            confirm_results_ok(obj);
+        }
+    }
+}
+

 function confirm_results_ok(obj) {
@@ -160,6 +195,25 @@ function confirm_results_ok(obj) {
        assert.typeOf(res.snippet, 'string', 'snippet must be string');
        assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
    }
+
+   for (let res of obj.right_side_ads) {
+
+       assert.isOk(res.tracking_link, 'link must be ok');
+       assert.typeOf(res.tracking_link, 'string', 'link must be string');
+       assert.isAtLeast(res.tracking_link.length, 5, 'link must have at least 5 chars');
+
+       assert.isOk(res.visible_link, 'link must be ok');
+       assert.typeOf(res.visible_link, 'string', 'link must be string');
+       assert.isAtLeast(res.visible_link.length, 5, 'link must have at least 5 chars');
+
+       assert.isOk(res.title, 'title must be ok');
+       assert.typeOf(res.title, 'string', 'title must be string');
+       assert.isAtLeast(res.title.length, 8, 'title must have at least 8 chars');
+
+       assert.isOk(res.snippet, 'snippet must be ok');
+       assert.typeOf(res.snippet, 'string', 'snippet must be string');
+       assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
+   }
 }

 describe('Bing', function(){
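For reference, each element of obj.right_side_ads validated by the loop above carries the four fields produced by parse_async; the values below are invented placeholders, only the keys come from the code.

// illustrative shape of one right_side_ads entry (values are placeholders)
let right_side_ad = {
    title: 'Example Ad Title',
    snippet: 'Short advertising text shown under the headline.',
    visible_link: 'www.example.com/offers',
    tracking_link: 'https://www.bing.com/aclk?ld=...',
};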
@@ -73,9 +73,11 @@ function normal_search_test_case(response) {
        assert.typeOf(res.title, 'string', 'title must be string');
        assert.isAtLeast(res.title.length, 5, 'title must have at least 5 chars');

+       if (res.snippet) {
            assert.isOk(res.snippet, 'snippet must be ok');
            assert.typeOf(res.snippet, 'string', 'snippet must be string');
            assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
+       }

        assert.isNumber(res.rank, 'rank must be integer');
        assert.equal(res.rank, total_rank++, 'rank ist wrong');