mirror of
https://github.com/NikolaiT/se-scraper.git
synced 2025-06-23 19:11:34 +02:00
tested and works
This commit is contained in:
parent
581568ff18
commit
987e3d7342
40
README.md
40
README.md
@ -26,6 +26,36 @@ Additionally **se-scraper** supports investment ticker search from the following
|
|||||||
|
|
||||||
This module uses puppeteer. It was created by the Developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 Stars on Github.
|
This module uses puppeteer. It was created by the Developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 Stars on Github.
|
||||||
|
|
||||||
|
### Quickstart
|
||||||
|
|
||||||
|
Install with
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install se-scraper
|
||||||
|
```
|
||||||
|
|
||||||
|
Then create a file with the following contents and start scraping:
|
||||||
|
|
||||||
|
```js
|
||||||
|
const se_scraper = require('se-scraper');
|
||||||
|
|
||||||
|
let config = {
|
||||||
|
search_engine: 'google',
|
||||||
|
debug: false,
|
||||||
|
verbose: false,
|
||||||
|
keywords: ['news', 'scraping scrapeulous.com'],
|
||||||
|
num_pages: 3,
|
||||||
|
output_file: 'data.json',
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Completion callback passed to se_scraper.scrape().
 *
 * @param {*} err - error reported by the scraper, falsy on success
 * @param {*} response - scrape results; only printed when there was no error
 */
function callback(err, response) {
    if (err) {
        // Stop here: dumping `response` after a failure only prints noise.
        console.error(err);
        return;
    }
    console.dir(response, {depth: null, colors: true});
}
|
||||||
|
|
||||||
|
se_scraper.scrape(config, callback);
|
||||||
|
```
|
||||||
|
|
||||||
### Technical Notes
|
### Technical Notes
|
||||||
|
|
||||||
Scraping is done with a headless chromium browser using the automation library puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.
|
Scraping is done with a headless chromium browser using the automation library puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.
|
||||||
@ -75,13 +105,7 @@ Consider the following resources:
|
|||||||
|
|
||||||
* https://intoli.com/blog/making-chrome-headless-undetectable/
|
* https://intoli.com/blog/making-chrome-headless-undetectable/
|
||||||
|
|
||||||
### Installation and Usage
|
### Advanced Usage
|
||||||
|
|
||||||
Install with
|
|
||||||
|
|
||||||
```bash
|
|
||||||
npm install se-scraper
|
|
||||||
```
|
|
||||||
|
|
||||||
Use se-scraper by calling it with a script such as the one below.
|
Use se-scraper by calling it with a script such as the one below.
|
||||||
|
|
||||||
@ -162,9 +186,7 @@ Supported options for the `search_engine` config key:
|
|||||||
'baidu'
|
'baidu'
|
||||||
'youtube'
|
'youtube'
|
||||||
'duckduckgo_news'
|
'duckduckgo_news'
|
||||||
'google_dr'
|
|
||||||
'yahoo_news'
|
'yahoo_news'
|
||||||
// ticker search
|
|
||||||
'bloomberg'
|
'bloomberg'
|
||||||
'reuters'
|
'reuters'
|
||||||
'cnbc'
|
'cnbc'
|
||||||
|
17
examples/quickstart.js
Normal file
17
examples/quickstart.js
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
const se_scraper = require('./../index.js');
|
||||||
|
|
||||||
|
let config = {
|
||||||
|
search_engine: 'duckduckgo',
|
||||||
|
debug: false,
|
||||||
|
verbose: false,
|
||||||
|
keywords: ['news'],
|
||||||
|
num_pages: 2,
|
||||||
|
output_file: 'data.json',
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Completion callback handed to se_scraper.scrape() in this example.
 *
 * @param {*} err - error reported by the scraper, falsy on success
 * @param {*} response - scrape results; only printed when there was no error
 */
function callback(err, response) {
    if (err) {
        // Report the failure and skip printing a response that is not meaningful.
        console.error(err);
        return;
    }
    console.dir(response, {depth: null, colors: true});
}
|
||||||
|
|
||||||
|
se_scraper.scrape(config, callback);
|
4
index.js
4
index.js
@ -8,11 +8,11 @@ exports.scrape = async function(config, callback) {
|
|||||||
// the user agent to scrape with
|
// the user agent to scrape with
|
||||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||||
// if random_user_agent is set to True, a random user agent is chosen
|
// if random_user_agent is set to True, a random user agent is chosen
|
||||||
random_user_agent: false,
|
random_user_agent: true,
|
||||||
// whether to select manual settings in visible mode
|
// whether to select manual settings in visible mode
|
||||||
set_manual_settings: false,
|
set_manual_settings: false,
|
||||||
// get meta data of scraping in return object
|
// get meta data of scraping in return object
|
||||||
write_meta_data: true,
|
write_meta_data: false,
|
||||||
log_http_headers: false,
|
log_http_headers: false,
|
||||||
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
// how long to sleep between requests. a random sleep interval within the range [a,b]
|
||||||
// is drawn before every request. empty string for no sleeping.
|
// is drawn before every request. empty string for no sleeping.
|
||||||
|
14
run.js
14
run.js
@ -12,23 +12,23 @@ let config = {
|
|||||||
// is drawn before every request. empty string for no sleeping.
|
// is drawn before every request. empty string for no sleeping.
|
||||||
sleep_range: '[1,2]',
|
sleep_range: '[1,2]',
|
||||||
// which search engine to scrape
|
// which search engine to scrape
|
||||||
search_engine: 'google_news',
|
search_engine: 'google',
|
||||||
// whether debug information should be printed
|
// whether debug information should be printed
|
||||||
// debug info is useful for developers when debugging
|
// debug info is useful for developers when debugging
|
||||||
debug: true,
|
debug: false,
|
||||||
// whether verbose program output should be printed
|
// whether verbose program output should be printed
|
||||||
// this output is informational
|
// this output is informational
|
||||||
verbose: true,
|
verbose: true,
|
||||||
// an array of keywords to scrape
|
// an array of keywords to scrape
|
||||||
keywords: ['hacking', 'trump'],
|
keywords: ['news'],
|
||||||
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
// alternatively you can specify a keyword_file. this overwrites the keywords array
|
||||||
keyword_file: '',
|
keyword_file: '',
|
||||||
// the number of pages to scrape for each keyword
|
// the number of pages to scrape for each keyword
|
||||||
num_pages: 1,
|
num_pages: 2,
|
||||||
// whether to start the browser in headless mode
|
// whether to start the browser in headless mode
|
||||||
headless: false,
|
headless: true,
|
||||||
// path to output file, data will be stored in JSON
|
// path to output file, data will be stored in JSON
|
||||||
output_file: 'data.json',
|
output_file: '',
|
||||||
// whether to prevent images, css, fonts from being loaded
|
// whether to prevent images, css, fonts from being loaded
|
||||||
// will speed up scraping a great deal
|
// will speed up scraping a great deal
|
||||||
block_assets: true,
|
block_assets: true,
|
||||||
@ -41,7 +41,7 @@ let config = {
|
|||||||
// use a proxy for all connections
|
// use a proxy for all connections
|
||||||
// example: 'socks5://78.94.172.42:1080'
|
// example: 'socks5://78.94.172.42:1080'
|
||||||
// example: 'http://118.174.233.10:48400'
|
// example: 'http://118.174.233.10:48400'
|
||||||
//proxy: 'socks5://78.94.172.42:1080',
|
proxy: '',
|
||||||
};
|
};
|
||||||
|
|
||||||
function callback(err, response) {
|
function callback(err, response) {
|
||||||
|
@ -1,109 +1,78 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
module.exports = {
|
class BaiduScraper extends Scraper {
|
||||||
scrape_baidu_pup: scrape_baidu_pup,
|
parse(html) {
|
||||||
};
|
// load the page source into cheerio
|
||||||
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
async function scrape_baidu_pup(page, event, context, pluggable) {
|
// perform queries
|
||||||
await page.goto('https://www.baidu.com/');
|
const results = [];
|
||||||
|
$('#content_left .result').each((i, link) => {
|
||||||
|
results.push({
|
||||||
|
link: $(link).find('h3 a').attr('href'),
|
||||||
|
title: $(link).find('h3').text(),
|
||||||
|
snippet: $(link).find('.c-abstract').text(),
|
||||||
|
visible_link: $(link).find('.f13').text(),
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
try {
|
const cleaned = [];
|
||||||
await page.waitForSelector('input[name="wd"]', { timeout: 5000 });
|
for (var i=0; i < results.length; i++) {
|
||||||
} catch (e) {
|
let res = results[i];
|
||||||
return results;
|
if (res.link && res.link.trim()) {
|
||||||
}
|
res.rank = this.result_rank++;
|
||||||
|
cleaned.push(res);
|
||||||
let keywords = event.keywords;
|
|
||||||
var results = {};
|
|
||||||
|
|
||||||
for (var i = 0; i < keywords.length; i++) {
|
|
||||||
|
|
||||||
keyword = keywords[i];
|
|
||||||
|
|
||||||
if (pluggable.before_keyword_scraped) {
|
|
||||||
await pluggable.before_keyword_scraped({
|
|
||||||
keyword: keyword,
|
|
||||||
page: page,
|
|
||||||
event: event,
|
|
||||||
context: context,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
const input = await page.$('input[name="wd"]');
|
|
||||||
// overwrites last text in input
|
|
||||||
await input.click({ clickCount: 3 });
|
|
||||||
await input.type(keyword);
|
|
||||||
await input.focus();
|
|
||||||
await page.keyboard.press("Enter");
|
|
||||||
|
|
||||||
if (event.sleep_range) {
|
|
||||||
await sfunctions.random_sleep(event);
|
|
||||||
}
|
|
||||||
|
|
||||||
// in baidu we have a issue with waiting for a selector
|
|
||||||
// or waiting for navigation
|
|
||||||
// therefore, we just manually sleep
|
|
||||||
|
|
||||||
// issue in baidu: https://github.com/GoogleChrome/puppeteer/issues/609
|
|
||||||
// https://github.com/GoogleChrome/puppeteer/issues/2671
|
|
||||||
// await page.evaluate( () => {
|
|
||||||
// if ( ! window.Node ) {
|
|
||||||
// window.Node = {};
|
|
||||||
// }
|
|
||||||
// if ( ! Node.ELEMENT_NODE ) {
|
|
||||||
// Node.ELEMENT_NODE = 1;
|
|
||||||
// }
|
|
||||||
// } );
|
|
||||||
// await page.waitForSelector('.result', { timeout: 5000 });
|
|
||||||
|
|
||||||
// this should be reasonable for normal internet connections
|
|
||||||
await sfunctions.sleep(2000);
|
|
||||||
|
|
||||||
if (event.debug === true && event.is_local === true) {
|
|
||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let html = await page.content();
|
return {
|
||||||
results[keyword] = parse(html);
|
time: (new Date()).toUTCString(),
|
||||||
|
no_results: false,
|
||||||
} catch (e) {
|
num_results: $('.nums_text').text(),
|
||||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
results: cleaned,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
async load_start_page() {
|
||||||
|
try {
|
||||||
|
await this.page.goto('https://www.baidu.com/');
|
||||||
|
await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
|
||||||
|
} catch (e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async search_keyword(keyword) {
|
||||||
|
const input = await this.page.$('input[name="wd"]');
|
||||||
|
// overwrites last text in input
|
||||||
|
await input.click({ clickCount: 3 });
|
||||||
|
await input.type(keyword);
|
||||||
|
await input.focus();
|
||||||
|
await this.page.keyboard.press("Enter");
|
||||||
|
}
|
||||||
|
|
||||||
|
async next_page() {
|
||||||
|
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||||
|
if (!next_page_link) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
await next_page_link.click();
|
||||||
|
await this.page.waitForNavigation();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async wait_for_results() {
|
||||||
|
// TODO: very very bad, but nobody uses baidu, or does someone?
|
||||||
|
await this.sleep(2000);
|
||||||
|
}
|
||||||
|
|
||||||
|
async detected() {
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function parse(html) {
|
module.exports = {
|
||||||
// load the page source into cheerio
|
BaiduScraper: BaiduScraper,
|
||||||
const $ = cheerio.load(html);
|
};
|
||||||
|
|
||||||
// perform queries
|
|
||||||
const results = [];
|
|
||||||
$('#content_left .result').each((i, link) => {
|
|
||||||
results.push({
|
|
||||||
link: $(link).find('h3 a').attr('href'),
|
|
||||||
title: $(link).find('h3').text(),
|
|
||||||
snippet: $(link).find('.c-abstract').text(),
|
|
||||||
visible_link: $(link).find('.f13').text(),
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
const cleaned = [];
|
|
||||||
for (var i=0; i < results.length; i++) {
|
|
||||||
let res = results[i];
|
|
||||||
if (res.link && res.link.trim()) {
|
|
||||||
res.rank = i+1;
|
|
||||||
cleaned.push(res);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
time: (new Date()).toUTCString(),
|
|
||||||
no_results: false,
|
|
||||||
num_results: $('.nums_text').text(),
|
|
||||||
results: cleaned,
|
|
||||||
}
|
|
||||||
}
|
|
@ -29,7 +29,7 @@ class BingScraper extends Scraper {
|
|||||||
for (var i=0; i < results.length; i++) {
|
for (var i=0; i < results.length; i++) {
|
||||||
let res = results[i];
|
let res = results[i];
|
||||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||||
res.rank = i+1;
|
res.rank = this.result_rank++;
|
||||||
cleaned.push(res);
|
cleaned.push(res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -104,7 +104,7 @@ class BingNewsScraper extends Scraper {
|
|||||||
for (var i=0; i < results.length; i++) {
|
for (var i=0; i < results.length; i++) {
|
||||||
let res = results[i];
|
let res = results[i];
|
||||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||||
res.rank = i+1;
|
res.rank = this.result_rank++;
|
||||||
cleaned.push(res);
|
cleaned.push(res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,94 +1,148 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
module.exports = {
|
class DuckduckgoScraper extends Scraper {
|
||||||
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
|
|
||||||
};
|
|
||||||
|
|
||||||
async function scrape_duckduckgo_news_pup(page, event, context, pluggable) {
|
parse(html) {
|
||||||
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
|
// load the page source into cheerio
|
||||||
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
try {
|
// perform queries
|
||||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
const results = [];
|
||||||
} catch (e) {
|
$('.result__body').each((i, link) => {
|
||||||
return results;
|
results.push({
|
||||||
}
|
link: $(link).find('.result__title .result__a').attr('href'),
|
||||||
|
title: $(link).find('.result__title .result__a').text(),
|
||||||
let keywords = event.keywords;
|
date: $(link).find('.result__timestamp').text(),
|
||||||
var results = {};
|
snippet: $(link).find('.result__snippet').text(),
|
||||||
|
visible_link: $(link).find('.result__url').attr('href'),
|
||||||
for (var i = 0; i < keywords.length; i++) {
|
|
||||||
|
|
||||||
keyword = keywords[i];
|
|
||||||
|
|
||||||
if (pluggable.before_keyword_scraped) {
|
|
||||||
await pluggable.before_keyword_scraped({
|
|
||||||
keyword: keyword,
|
|
||||||
page: page,
|
|
||||||
event: event,
|
|
||||||
context: context,
|
|
||||||
});
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
const cleaned = [];
|
||||||
|
for (var i=0; i < results.length; i++) {
|
||||||
|
let res = results[i];
|
||||||
|
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||||
|
res.rank = this.result_rank++;
|
||||||
|
cleaned.push(res);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
return {
|
||||||
const input = await page.$('input[name="q"]');
|
time: (new Date()).toUTCString(),
|
||||||
// overwrites last text in input
|
results: cleaned
|
||||||
await input.click({ clickCount: 3 });
|
|
||||||
await sfunctions.sleep(150);
|
|
||||||
await input.type(keyword);
|
|
||||||
await sfunctions.sleep(150);
|
|
||||||
await input.focus();
|
|
||||||
await page.keyboard.press("Enter");
|
|
||||||
|
|
||||||
if (event.sleep_range) {
|
|
||||||
await sfunctions.random_sleep(event);
|
|
||||||
}
|
|
||||||
|
|
||||||
// await page.waitForSelector('.result--news', { timeout: 5000 });
|
|
||||||
await page.waitForSelector('.serp__results', { timeout: 5000 });
|
|
||||||
|
|
||||||
await sfunctions.sleep(1500);
|
|
||||||
|
|
||||||
if (event.debug === true && event.is_local === true) {
|
|
||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
|
||||||
}
|
|
||||||
let html = await page.content();
|
|
||||||
results[keyword] = parse_duckduckgo_news_results(html, event.max_results);
|
|
||||||
|
|
||||||
} catch (e) {
|
|
||||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
|
||||||
return results;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return results;
|
|
||||||
|
async load_start_page() {
|
||||||
|
try {
|
||||||
|
await this.page.goto('https://duckduckgo.com/');
|
||||||
|
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||||
|
} catch (e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async search_keyword(keyword) {
|
||||||
|
const input = await this.page.$('input[name="q"]');
|
||||||
|
await this.set_input_value(`input[name="q"]`, keyword);
|
||||||
|
await this.sleep(50);
|
||||||
|
await input.focus();
|
||||||
|
await this.page.keyboard.press("Enter");
|
||||||
|
}
|
||||||
|
|
||||||
|
async next_page() {
|
||||||
|
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 1000});
|
||||||
|
if (!next_page_link) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
await next_page_link.click();
|
||||||
|
//await this.page.waitForNavigation();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async wait_for_results() {
|
||||||
|
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||||
|
}
|
||||||
|
|
||||||
|
async detected() {
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function parse_duckduckgo_news_results(html) {
|
|
||||||
// load the page source into cheerio
|
|
||||||
const $ = cheerio.load(html);
|
|
||||||
|
|
||||||
// perform queries
|
class DuckduckgoNewsScraper extends Scraper {
|
||||||
const results = [];
|
|
||||||
$('.result--news').each((i, link) => {
|
parse(html) {
|
||||||
results.push({
|
// load the page source into cheerio
|
||||||
link: $(link).find('.result__title .result__a').attr('href'),
|
const $ = cheerio.load(html);
|
||||||
title: $(link).find('.result__title .result__a').text(),
|
|
||||||
date: $(link).find('.result__timestamp').text(),
|
// perform queries
|
||||||
snippet: $(link).find('.result__snippet').text(),
|
const results = [];
|
||||||
|
$('.result--news').each((i, link) => {
|
||||||
|
results.push({
|
||||||
|
link: $(link).find('.result__title .result__a').attr('href'),
|
||||||
|
title: $(link).find('.result__title .result__a').text(),
|
||||||
|
date: $(link).find('.result__timestamp').text(),
|
||||||
|
snippet: $(link).find('.result__snippet').text(),
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
|
||||||
|
|
||||||
const cleaned = [];
|
const cleaned = [];
|
||||||
for (var i=0; i < results.length; i++) {
|
for (var i=0; i < results.length; i++) {
|
||||||
let res = results[i];
|
let res = results[i];
|
||||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||||
res.rank = i+1;
|
res.rank = this.result_rank++;
|
||||||
cleaned.push(res);
|
cleaned.push(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
time: (new Date()).toUTCString(),
|
||||||
|
results: cleaned
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
async load_start_page() {
|
||||||
time: (new Date()).toUTCString(),
|
try {
|
||||||
results: cleaned
|
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
|
||||||
|
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||||
|
} catch (e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
async search_keyword(keyword) {
|
||||||
|
const input = await this.page.$('input[name="q"]');
|
||||||
|
await this.set_input_value(`input[name="q"]`, keyword);
|
||||||
|
await this.sleep(50);
|
||||||
|
await input.focus();
|
||||||
|
await this.page.keyboard.press("Enter");
|
||||||
|
}
|
||||||
|
|
||||||
|
async next_page() {
|
||||||
|
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||||
|
if (!next_page_link) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
await next_page_link.click();
|
||||||
|
await this.page.waitForNavigation();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async wait_for_results() {
|
||||||
|
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||||
|
await this.sleep(1500);
|
||||||
|
}
|
||||||
|
|
||||||
|
async detected() {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
DuckduckgoNewsScraper: DuckduckgoNewsScraper,
|
||||||
|
DuckduckgoScraper: DuckduckgoScraper,
|
||||||
|
};
|
@ -1,40 +0,0 @@
|
|||||||
module.exports = {
|
|
||||||
no_results: no_results,
|
|
||||||
effective_query: effective_query,
|
|
||||||
sleep: sleep,
|
|
||||||
random_sleep: random_sleep,
|
|
||||||
set_input_value: set_input_value,
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
 * Set the `value` of the first element matching `selector` inside the page.
 *
 * @param {Object} page - puppeteer page (needs waitForSelector and evaluate)
 * @param {string} selector - CSS selector of the input element
 * @param {string} value - text to assign to the element's value
 * @returns {Promise<void>}
 */
async function set_input_value(page, selector, value) {
    // Use the explicit selector wait (as the scrapers in this project do)
    // instead of the polymorphic page.waitFor().
    await page.waitForSelector(selector);
    // The callback runs in the browser context, so both values are passed in
    // as arguments rather than captured from this scope.
    await page.evaluate((new_value, target_selector) => {
        document.querySelector(target_selector).value = new_value;
    }, value, selector);
}
|
|
||||||
|
|
||||||
/**
 * Tell whether a page contains any of the given "no results" phrases.
 *
 * Despite reading like a negation, the return value is true when at least
 * one needle occurs in the text.
 *
 * @param {string[]} needles - phrases to look for
 * @param {string} html - text to search; a non-string value is treated as empty
 * @returns {boolean} true if any needle is found, false otherwise
 *   (also false for an empty needle list)
 */
function no_results(needles, html) {
    const haystack = typeof html === 'string' ? html : '';
    return needles.some((needle) => haystack.includes(needle));
}
|
|
||||||
|
|
||||||
/**
 * Placeholder: not implemented, always yields undefined.
 *
 * @param {string[]} needles - unused
 * @param {string} html - unused
 * @returns {undefined}
 */
function effective_query(needles, html) {
    return undefined;
}
|
|
||||||
|
|
||||||
/**
 * Promise-based delay.
 *
 * @param {number} ms - delay in milliseconds, handed to setTimeout
 * @returns {Promise<undefined>} resolves once the timer fires
 */
function sleep(ms) {
    return new Promise((done) => setTimeout(done, ms));
}
|
|
||||||
|
|
||||||
/**
 * Sleep for a random whole number of seconds drawn from config.sleep_range.
 *
 * @param {Object} config
 * @param {number[]|string} config.sleep_range - [min, max] in seconds, either
 *   as an array or in the string form the config files use (e.g. '[1,2]').
 *   An empty, missing or unparsable range means "do not sleep".
 * @param {boolean} [config.debug] - when true, the chosen delay is logged
 * @returns {Promise<void>}
 */
async function random_sleep(config) {
    let range = config.sleep_range;
    if (typeof range === 'string') {
        // The configs document sleep_range as a string such as '[1,2]', with
        // the empty string meaning no sleeping; destructuring the raw string
        // would yield single characters instead of numbers.
        try {
            range = JSON.parse(range);
        } catch (e) {
            range = null;
        }
    }
    if (!Array.isArray(range) || range.length < 2) {
        return;
    }
    const min = Number(range[0]);
    const max = Number(range[1]);
    if (!Number.isFinite(min) || !Number.isFinite(max)) {
        return;
    }
    // Uniform integer in [min, max], both ends included.
    const rand = Math.floor(Math.random() * (max - min + 1) + min);
    if (config.debug === true) {
        console.log(`Sleeping for ${rand}s`);
    }
    await sleep(rand * 1000);
}
|
|
@ -1,5 +1,4 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
|
||||||
const Scraper = require('./se_scraper');
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
class GoogleScraper extends Scraper {
|
class GoogleScraper extends Scraper {
|
||||||
@ -20,7 +19,7 @@ class GoogleScraper extends Scraper {
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let no_results = sfunctions.no_results(
|
let no_results = this.no_results(
|
||||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||||
'No results found for', 'Ergebnisse für', 'Showing results for'],
|
'No results found for', 'Ergebnisse für', 'Showing results for'],
|
||||||
$('#main').text()
|
$('#main').text()
|
||||||
@ -35,7 +34,7 @@ class GoogleScraper extends Scraper {
|
|||||||
for (var i=0; i < results.length; i++) {
|
for (var i=0; i < results.length; i++) {
|
||||||
let res = results[i];
|
let res = results[i];
|
||||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||||
res.rank = i+1;
|
res.rank = this.result_rank++;
|
||||||
cleaned.push(res);
|
cleaned.push(res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -108,7 +107,7 @@ class GoogleNewsOldScraper extends Scraper {
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let no_results = sfunctions.no_results(
|
let no_results = this.no_results(
|
||||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||||
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
|
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
|
||||||
$('#main').text()
|
$('#main').text()
|
||||||
@ -123,7 +122,7 @@ class GoogleNewsOldScraper extends Scraper {
|
|||||||
for (var i=0; i < results.length; i++) {
|
for (var i=0; i < results.length; i++) {
|
||||||
let res = results[i];
|
let res = results[i];
|
||||||
if (res.link && res.link.trim()) {
|
if (res.link && res.link.trim()) {
|
||||||
res.rank = i+1;
|
res.rank = this.result_rank++;
|
||||||
cleaned.push(res);
|
cleaned.push(res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -161,7 +160,7 @@ class GoogleNewsOldScraper extends Scraper {
|
|||||||
async wait_for_results() {
|
async wait_for_results() {
|
||||||
//await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
|
//await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
|
||||||
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
|
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
|
||||||
await this.sleep(200);
|
await this.sleep(500);
|
||||||
}
|
}
|
||||||
|
|
||||||
async detected() {
|
async detected() {
|
||||||
@ -190,7 +189,7 @@ class GoogleImageScraper extends Scraper {
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
let no_results = sfunctions.no_results(
|
let no_results = this.no_results(
|
||||||
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
|
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
|
||||||
'Showing results for', 'Ergebnisse für'],
|
'Showing results for', 'Ergebnisse für'],
|
||||||
$('#main').text()
|
$('#main').text()
|
||||||
@ -206,7 +205,7 @@ class GoogleImageScraper extends Scraper {
|
|||||||
let res = results[i];
|
let res = results[i];
|
||||||
if (res.link && res.link.trim() && res.link.trim().length > 10) {
|
if (res.link && res.link.trim() && res.link.trim().length > 10) {
|
||||||
res.link = res.link.trim();
|
res.link = res.link.trim();
|
||||||
res.rank = i+1;
|
res.rank = this.result_rank++;
|
||||||
cleaned.push(res);
|
cleaned.push(res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -252,7 +251,7 @@ class GoogleImageScraper extends Scraper {
|
|||||||
|
|
||||||
async wait_for_results() {
|
async wait_for_results() {
|
||||||
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
|
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
|
||||||
await this.sleep(100);
|
await this.sleep(500);
|
||||||
}
|
}
|
||||||
|
|
||||||
async detected() {
|
async detected() {
|
||||||
@ -296,7 +295,7 @@ class GoogleNewsScraper extends Scraper {
|
|||||||
this.all_results.add(title);
|
this.all_results.add(title);
|
||||||
});
|
});
|
||||||
|
|
||||||
let no_results = sfunctions.no_results(
|
let no_results = this.no_results(
|
||||||
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
|
||||||
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
|
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
|
||||||
$('body').text()
|
$('body').text()
|
||||||
@ -308,7 +307,7 @@ class GoogleNewsScraper extends Scraper {
|
|||||||
for (var i=0; i < results.length; i++) {
|
for (var i=0; i < results.length; i++) {
|
||||||
let res = results[i];
|
let res = results[i];
|
||||||
if (res.title && res.title.trim()) {
|
if (res.title && res.title.trim()) {
|
||||||
res.rank = i+1;
|
res.rank = this.result_rank++;
|
||||||
cleaned.push(res);
|
cleaned.push(res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -333,6 +332,7 @@ class GoogleNewsScraper extends Scraper {
|
|||||||
// parse here front page results
|
// parse here front page results
|
||||||
let html = await this.page.content();
|
let html = await this.page.content();
|
||||||
this.results['frontpage'] = this.parse(html);
|
this.results['frontpage'] = this.parse(html);
|
||||||
|
this.result_rank = 1;
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -367,7 +367,6 @@ class GoogleNewsScraper extends Scraper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function clean_image_url(url) {
|
function clean_image_url(url) {
|
||||||
// Example:
|
// Example:
|
||||||
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
|
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
|
||||||
|
@ -1,186 +1,157 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
|
/**
 * Scraper for the infospace.com web search.
 *
 * Relies on members it does not define itself (this.page, this.result_rank,
 * this.no_results, this.set_input_value, this.sleep); presumably these come
 * from the Scraper base class.
 */
class InfospaceScraper extends Scraper {

    /**
     * Extract the search results from a result page.
     *
     * @param {string} html - page source of a result page
     * @returns {Object} object with `time` (UTC string of the parse time),
     *   `no_results` (outcome of the "no results" phrase check),
     *   `num_results` (always an empty string here) and `results`
     *   (entries that have a non-blank link, each with a `rank`)
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        // perform queries
        const results = [];
        $('.result').each((i, link) => {
            results.push({
                link: $(link).find('a.title').attr('href'),
                title: $(link).find('a.title').text(),
                snippet: $(link).find('.description').text(),
                visible_link: $(link).find('.url').text(),
            });
        });

        // Keep only entries with a usable link. The rank comes from the
        // instance-wide counter rather than from the position on this page.
        const cleaned = [];
        for (const res of results) {
            if (res.link && res.link.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        }

        const no_results = this.no_results(
            ['No search results were found for'],
            $('.layout__mainline').text()
        );

        return {
            time: (new Date()).toUTCString(),
            no_results: no_results,
            num_results: '',
            results: cleaned,
        };
    }

    /**
     * Open the start page and wait for the search box.
     *
     * @returns {Promise<boolean>} true when the input appeared within
     *   5 seconds, false when navigation or the wait threw.
     */
    async load_start_page() {
        try {
            await this.page.goto('http://infospace.com/index.html');
            // NOTE(review): this waits on name="q" while search_keyword()
            // addresses id="q" - confirm both match the same element.
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;
        }
        return true;
    }

    /**
     * Put a keyword into the search box and submit it.
     *
     * @param {string} keyword - the query to search for
     */
    async search_keyword(keyword) {
        const input = await this.page.$('input[id="q"]');
        await this.set_input_value('input[id="q"]', keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Click the "next" link, if there is one, and wait for the navigation.
     *
     * @returns {Promise<boolean>} false when no next link was found,
     *   true after the click and the resulting navigation completed.
     */
    async next_page() {
        // page.$() only takes a selector and resolves to null when nothing
        // matches; the options object the old code passed was ignored.
        const next_page_link = await this.page.$('a.next');
        if (!next_page_link) {
            return false;
        }
        // Register the navigation wait before clicking so a fast navigation
        // cannot finish before we start listening for it.
        await Promise.all([
            this.page.waitForNavigation(),
            next_page_link.click(),
        ]);

        return true;
    }

    /**
     * Wait (up to 5 seconds) for the results container, then pause briefly.
     */
    async wait_for_results() {
        await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
        await this.sleep(250);
    }

    /**
     * Detection hook; no detection handling is implemented for this engine.
     */
    async detected() {
    }
}
|
||||||
|
|
||||||
|
/**
 * Scraper for the Webcrawler news search (https://www.webcrawler.com/?qc=news).
 *
 * Relies on members that are not defined in this class and are presumably
 * provided by the `Scraper` base class (verify against se_scraper.js):
 * `this.page`, `this.result_rank`, `this.set_input_value()` and `this.sleep()`.
 */
class WebcrawlerNewsScraper extends Scraper {

    /**
     * Extracts news articles from the HTML of a Webcrawler news result page.
     *
     * @param {string} html - Full page source of the result page.
     * @returns {{time: string, results: Object[]}} `results` holds one object
     *     per `.article` element that has a non-blank link and title; each
     *     gets a `rank` taken from `this.result_rank`.
     */
    parse(html) {
        // load the page source into cheerio
        const $ = cheerio.load(html);

        // perform queries
        const results = [];
        $('.article').each((i, element) => {
            const article = $(element);
            // The `.source` text is used as the publisher; whatever follows
            // its first comma is used as the date (empty if there is none).
            const source = article.find('.source').text();
            results.push({
                link: article.find('a').attr('href'),
                title: article.find('.title').text(),
                publisher: source,
                date: source.split(',')[1] || '',
                snippet: article.find('.description').text(),
            });
        });

        // Keep only entries with both a link and a title. The rank is taken
        // from the running counter, so it only advances for kept entries.
        const cleaned = [];
        for (const res of results) {
            if (res.link && res.link.trim() && res.title && res.title.trim()) {
                res.rank = this.result_rank++;
                cleaned.push(res);
            }
        }

        return {
            time: (new Date()).toUTCString(),
            results: cleaned
        };
    }

    /**
     * Opens the Webcrawler news start page and waits for the search input.
     *
     * @returns {Promise<boolean>} `false` if navigation or the wait failed,
     *     `true` otherwise.
     */
    async load_start_page() {
        try {
            await this.page.goto('https://www.webcrawler.com/?qc=news');
            await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
        } catch (e) {
            return false;
        }
        return true;
    }

    /**
     * Writes the keyword into the search box and submits it with Enter.
     *
     * @param {string} keyword - Search term to submit.
     * @throws {Error} If the search input is not present on the page.
     */
    async search_keyword(keyword) {
        const selector = 'input[name="q"]';
        const input = await this.page.$(selector);
        if (!input) {
            // page.$ resolves to null when nothing matches; fail with a clear
            // message instead of a TypeError on `input.focus()` further down.
            throw new Error(`Webcrawler search input not found: ${selector}`);
        }
        await this.set_input_value(selector, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    /**
     * Clicks the "next" pagination element, if there is one.
     *
     * @returns {Promise<boolean>} `false` when there is no next-page element,
     *     `true` once the navigation triggered by the click has finished.
     */
    async next_page() {
        // page.$ takes only a selector and does not wait, so no options are passed.
        const next_page_link = await this.page.$('.pagination__num--next');
        if (!next_page_link) {
            return false;
        }

        // Register the navigation wait before clicking; otherwise a fast
        // navigation can complete before waitForNavigation() starts listening.
        await Promise.all([
            this.page.waitForNavigation(),
            next_page_link.click(),
        ]);

        return true;
    }

    /**
     * Waits until the result container is present, then pauses briefly.
     */
    async wait_for_results() {
        await this.page.waitForSelector('.mainline-results', { timeout: 5000 });
        await this.sleep(150);
    }

    /**
     * Detection hook. Nothing is checked for Webcrawler, so this resolves
     * to `undefined`.
     */
    async detected() {
    }
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
scrape_infospace_pup: scrape_infospace_pup,
|
InfospaceScraper: InfospaceScraper,
|
||||||
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
|
WebcrawlerNewsScraper: WebcrawlerNewsScraper,
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Scrapes Infospace for every keyword in `event.keywords`.
 *
 * @param {Object} page - Puppeteer page used for navigation and input.
 * @param {Object} event - Scrape configuration; reads `keywords`,
 *     `sleep_range`, `debug` and `is_local`.
 * @param {Object} context - Passed through unchanged to the pluggable hook.
 * @param {Object} pluggable - Optional hooks; `before_keyword_scraped` is
 *     awaited before each keyword when present.
 * @returns {Promise<Object>} Map of keyword to the parsed result object.
 *     Empty if the start page never shows the search input. Keywords whose
 *     scrape fails are logged and left out.
 */
async function scrape_infospace_pup(page, event, context, pluggable) {
    // Declared before the first early return, so that path yields an empty
    // object rather than `undefined`.
    const results = {};

    await page.goto('http://infospace.com/index.html');

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }

    for (const keyword of event.keywords) {

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            const input = await page.$('input[id="q"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await input.type(keyword);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            await page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
            await sfunctions.sleep(250);
            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            const html = await page.content();
            results[keyword] = parse(html);

        } catch (e) {
            // A failing keyword is logged and skipped; the loop continues.
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
|
|
||||||
|
|
||||||
/**
 * Extracts the search results from the HTML of an Infospace result page.
 *
 * @param {string} html - Full page source of the result page.
 * @returns {{time: string, no_results: *, num_results: string, results: Object[]}}
 *     `results` holds the entries with a non-blank link; `rank` is the
 *     1-based position of the entry among all `.result` elements.
 */
function parse(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // collect one raw entry per `.result` element
    const raw_entries = [];
    $('.result').each((position, element) => {
        const node = $(element);
        raw_entries.push({
            link: node.find('a.title').attr('href'),
            title: node.find('a.title').text(),
            snippet: node.find('.description').text(),
            visible_link: node.find('.url').text(),
        });
    });

    // keep entries that have a usable link; rank reflects the raw position
    const cleaned = [];
    raw_entries.forEach((entry, position) => {
        if (!entry.link || !entry.link.trim()) {
            return;
        }
        entry.rank = position + 1;
        cleaned.push(entry);
    });

    const mainline_text = $('.layout__mainline').text();
    const no_results = sfunctions.no_results(
        ['No search results were found for'],
        mainline_text
    );

    return {
        time: (new Date()).toUTCString(),
        no_results: no_results,
        num_results: '',
        results: cleaned,
    };
}
|
|
||||||
|
|
||||||
/**
 * Scrapes the Webcrawler news search for every keyword in `event.keywords`.
 *
 * @param {Object} page - Puppeteer page used for navigation and input.
 * @param {Object} event - Scrape configuration; reads `keywords`,
 *     `sleep_range`, `debug` and `is_local`.
 * @param {Object} context - Passed through unchanged to the pluggable hook.
 * @param {Object} pluggable - Optional hooks; `before_keyword_scraped` is
 *     awaited before each keyword when present.
 * @returns {Promise<Object>} Map of keyword to the parsed result object.
 *     Empty if the start page never shows the search input. Scraping stops
 *     at the first keyword that fails and returns what was collected so far.
 */
async function scrape_webcrawler_news_pup(page, event, context, pluggable) {
    // Declared before the first early return, so that path yields an empty
    // object rather than `undefined`.
    const results = {};

    await page.goto('https://www.webcrawler.com/?qc=news');

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }

    for (const keyword of event.keywords) {

        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            const input = await page.$('input[name="q"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await sfunctions.sleep(150);
            await input.type(keyword);
            await sfunctions.sleep(150);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            await page.waitForSelector('.mainline-results', { timeout: 5000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            const html = await page.content();
            // The parser takes only the page source.
            results[keyword] = parse_webcrawler_news_results(html);

        } catch (e) {
            // The first failing keyword ends the run with the partial results.
            console.error(`Problem with scraping ${keyword}: ${e}`);
            return results;
        }
    }

    return results;
}
|
|
||||||
|
|
||||||
/**
 * Extracts news articles from the HTML of a Webcrawler news result page.
 *
 * @param {string} html - Full page source of the result page.
 * @returns {{time: string, results: Object[]}} `results` holds the entries
 *     with a non-blank link and title; `rank` is the 1-based position of the
 *     entry among all `.article` elements.
 */
function parse_webcrawler_news_results(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // collect one raw entry per `.article` element
    const raw_entries = [];
    $('.article').each((position, element) => {
        const node = $(element);
        // the date is whatever follows the first comma of the source text
        const source_text = node.find('.source').text();
        const date_part = source_text.split(',')[1] || '';
        raw_entries.push({
            link: node.find('a').attr('href'),
            title: node.find('.title').text(),
            publisher: node.find('.source').text(),
            date: date_part,
            snippet: node.find('.description').text(),
        });
    });

    // keep entries with both a link and a title; rank reflects the raw position
    const cleaned = [];
    raw_entries.forEach((entry, position) => {
        const has_link = entry.link && entry.link.trim();
        const has_title = entry.title && entry.title.trim();
        if (!has_link || !has_title) {
            return;
        }
        entry.rank = position + 1;
        cleaned.push(entry);
    });

    return {
        time: (new Date()).toUTCString(),
        results: cleaned
    };
}
|
|
@ -81,6 +81,8 @@ module.exports = class Scraper {
|
|||||||
*/
|
*/
|
||||||
async scraping_loop() {
|
async scraping_loop() {
|
||||||
|
|
||||||
|
this.result_rank = 1;
|
||||||
|
|
||||||
for (let keyword of this.config.keywords) {
|
for (let keyword of this.config.keywords) {
|
||||||
this.keyword = keyword;
|
this.keyword = keyword;
|
||||||
this.results[keyword] = {};
|
this.results[keyword] = {};
|
||||||
@ -121,7 +123,7 @@ module.exports = class Scraper {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
} while (page_num < event.num_pages);
|
} while (page_num <= event.num_pages);
|
||||||
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
|
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
|
||||||
@ -7,9 +6,14 @@ module.exports = {
|
|||||||
scrape_reuters_finance_pup: scrape_reuters_finance_pup,
|
scrape_reuters_finance_pup: scrape_reuters_finance_pup,
|
||||||
scrape_cnbc_finance_pup: scrape_cnbc_finance_pup,
|
scrape_cnbc_finance_pup: scrape_cnbc_finance_pup,
|
||||||
scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup,
|
scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup,
|
||||||
|
not_implemented: undefined,
|
||||||
};
|
};
|
||||||
|
|
||||||
// https://www.google.com/search?q=MSFT&tbm=fin
|
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns {Promise<undefined>}
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
||||||
|
|
||||||
async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
|
async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
|
||||||
var results = {};
|
var results = {};
|
||||||
@ -40,7 +44,7 @@ async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
|
|||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
await page.screenshot({path: `debug/${keyword}.png`});
|
||||||
}
|
}
|
||||||
|
|
||||||
await sfunctions.sleep(1000);
|
await sleep(1000);
|
||||||
|
|
||||||
let html = await page.content();
|
let html = await page.content();
|
||||||
results[keyword] = parse(html);
|
results[keyword] = parse(html);
|
||||||
@ -90,7 +94,7 @@ async function scrape_marketwatch_finance_pup(page, event, context, pluggable) {
|
|||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
await page.screenshot({path: `debug/${keyword}.png`});
|
||||||
}
|
}
|
||||||
|
|
||||||
await sfunctions.sleep(500);
|
await sleep(500);
|
||||||
|
|
||||||
let newsData = await page.evaluate(() => {
|
let newsData = await page.evaluate(() => {
|
||||||
let results = [];
|
let results = [];
|
||||||
@ -150,7 +154,7 @@ async function scrape_bloomberg_finance_pup(page, event, context, pluggable) {
|
|||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
await page.screenshot({path: `debug/${keyword}.png`});
|
||||||
}
|
}
|
||||||
|
|
||||||
await sfunctions.sleep(1000);
|
await sleep(1000);
|
||||||
|
|
||||||
let news_items = await page.$x('//*[starts-with(@class,"newsItem")]');
|
let news_items = await page.$x('//*[starts-with(@class,"newsItem")]');
|
||||||
for (let item of news_items) {
|
for (let item of news_items) {
|
||||||
@ -189,7 +193,7 @@ async function scrape_reuters_finance_pup(page, event, context, pluggable) {
|
|||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
await page.screenshot({path: `debug/${keyword}.png`});
|
||||||
}
|
}
|
||||||
|
|
||||||
await sfunctions.sleep(500);
|
await sleep(500);
|
||||||
|
|
||||||
let newsData = await page.evaluate(() => {
|
let newsData = await page.evaluate(() => {
|
||||||
let results = [];
|
let results = [];
|
||||||
@ -246,7 +250,7 @@ async function scrape_cnbc_finance_pup(page, event, context, pluggable) {
|
|||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
await page.screenshot({path: `debug/${keyword}.png`});
|
||||||
}
|
}
|
||||||
|
|
||||||
await sfunctions.sleep(500);
|
await sleep(500);
|
||||||
|
|
||||||
let newsData = await page.evaluate(() => {
|
let newsData = await page.evaluate(() => {
|
||||||
let results = [];
|
let results = [];
|
||||||
|
@ -1,121 +1,105 @@
|
|||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const sfunctions = require('./functions.js');
|
const Scraper = require('./se_scraper');
|
||||||
|
|
||||||
module.exports = {
|
class YoutubeScraper extends Scraper {
|
||||||
scrape_youtube_pup: scrape_youtube_pup,
|
|
||||||
};
|
|
||||||
|
|
||||||
const all_videos = new Set();
|
parse(html) {
|
||||||
|
// load the page source into cheerio
|
||||||
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
async function scrape_youtube_pup(page, event, context, pluggable) {
|
// perform queries
|
||||||
await page.goto('https://www.youtube.com');
|
const results = [];
|
||||||
|
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
|
||||||
|
results.push({
|
||||||
|
link: $(link).find('#video-title').attr('href'),
|
||||||
|
title: $(link).find('#video-title').text(),
|
||||||
|
snippet: $(link).find('#description-text').text(),
|
||||||
|
channel: $(link).find('#byline a').text(),
|
||||||
|
channel_link: $(link).find('#byline a').attr('href'),
|
||||||
|
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
|
||||||
|
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
try {
|
let no_results = this.no_results(
|
||||||
await page.waitForSelector('input[id="search"]', { timeout: 5000 });
|
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
|
||||||
} catch (e) {
|
$('yt-showing-results-for-renderer').text()
|
||||||
return results;
|
);
|
||||||
}
|
|
||||||
|
|
||||||
let keywords = event.keywords;
|
let effective_query = $('#corrected-link').text() || '';
|
||||||
var results = {};
|
|
||||||
|
|
||||||
// before we do anything, parse the results of the front page of youtube
|
const cleaned = [];
|
||||||
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
|
for (var i=0; i < results.length; i++) {
|
||||||
await sfunctions.sleep(500);
|
let res = results[i];
|
||||||
|
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||||
|
res.title = res.title.trim();
|
||||||
|
res.snippet = res.snippet.trim();
|
||||||
|
res.rank = this.result_rank++;
|
||||||
|
|
||||||
let html = await page.content();
|
// check if this result has been used before
|
||||||
results['__frontpage__'] = parse(html);
|
if (this.all_videos.has(res.title) === false) {
|
||||||
|
cleaned.push(res);
|
||||||
for (var i = 0; i < keywords.length; i++) {
|
}
|
||||||
|
this.all_videos.add(res.title);
|
||||||
keyword = keywords[i];
|
|
||||||
|
|
||||||
if (pluggable.before_keyword_scraped) {
|
|
||||||
await pluggable.before_keyword_scraped({
|
|
||||||
keyword: keyword,
|
|
||||||
page: page,
|
|
||||||
event: event,
|
|
||||||
context: context,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
const input = await page.$('input[id="search"]');
|
|
||||||
// overwrites last text in input
|
|
||||||
await input.click({ clickCount: 3 });
|
|
||||||
await input.type(keyword);
|
|
||||||
await input.focus();
|
|
||||||
await page.keyboard.press("Enter");
|
|
||||||
|
|
||||||
if (event.sleep_range) {
|
|
||||||
await sfunctions.random_sleep(event);
|
|
||||||
}
|
|
||||||
|
|
||||||
await page.waitForFunction(`document.title.indexOf('${keyword}') !== -1`, { timeout: 5000 });
|
|
||||||
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
|
|
||||||
await sfunctions.sleep(500);
|
|
||||||
|
|
||||||
if (event.debug === true && event.is_local === true) {
|
|
||||||
await page.screenshot({path: `debug/${keyword}.png`});
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let html = await page.content();
|
return {
|
||||||
results[keyword] = parse(html);
|
time: (new Date()).toUTCString(),
|
||||||
|
no_results: no_results,
|
||||||
} catch (e) {
|
effective_query: effective_query,
|
||||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
num_results: '',
|
||||||
|
results: cleaned,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
async load_start_page() {
|
||||||
|
try {
|
||||||
|
this.all_videos = new Set();
|
||||||
|
await this.page.goto('https://www.youtube.com', {
|
||||||
|
referer: 'https://google.com'
|
||||||
|
});
|
||||||
|
await this.page.waitForSelector('input[id="search"]', { timeout: 5000 });
|
||||||
|
// before we do anything, parse the results of the front page of youtube
|
||||||
|
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
|
||||||
|
await this.sleep(500);
|
||||||
|
let html = await this.page.content();
|
||||||
|
this.results['frontpage'] = this.parse(html);
|
||||||
|
this.result_rank = 1;
|
||||||
|
} catch(e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
async search_keyword(keyword) {
|
||||||
|
const input = await this.page.$('input[id="search"]');
|
||||||
|
// overwrites last text in input
|
||||||
|
await input.click({ clickCount: 3 });
|
||||||
|
await input.type(keyword);
|
||||||
|
await input.focus();
|
||||||
|
await this.page.keyboard.press("Enter");
|
||||||
|
}
|
||||||
|
|
||||||
|
async next_page() {
|
||||||
|
// youtube needs scrolling
|
||||||
|
// TODO: implement scrolling, no priority right now
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
async wait_for_results() {
|
||||||
|
await this.page.waitForFunction(`document.title.indexOf('${this.keyword}') !== -1`, { timeout: 5000 });
|
||||||
|
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
|
||||||
|
await this.sleep(500);
|
||||||
|
}
|
||||||
|
|
||||||
|
async detected() {
|
||||||
|
const title = await this.page.title();
|
||||||
|
let html = await this.page.content();
|
||||||
|
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function parse(html) {
|
module.exports = {
|
||||||
// load the page source into cheerio
|
YoutubeScraper: YoutubeScraper,
|
||||||
const $ = cheerio.load(html);
|
};
|
||||||
|
|
||||||
// perform queries
|
|
||||||
const results = [];
|
|
||||||
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
|
|
||||||
results.push({
|
|
||||||
link: $(link).find('#video-title').attr('href'),
|
|
||||||
title: $(link).find('#video-title').text(),
|
|
||||||
snippet: $(link).find('#description-text').text(),
|
|
||||||
channel: $(link).find('#byline a').text(),
|
|
||||||
channel_link: $(link).find('#byline a').attr('href'),
|
|
||||||
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
|
|
||||||
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
let no_results = sfunctions.no_results(
|
|
||||||
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
|
|
||||||
$('yt-showing-results-for-renderer').text()
|
|
||||||
);
|
|
||||||
|
|
||||||
let effective_query = $('#corrected-link').text() || '';
|
|
||||||
|
|
||||||
const cleaned = [];
|
|
||||||
for (var i=0; i < results.length; i++) {
|
|
||||||
let res = results[i];
|
|
||||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
|
||||||
res.title = res.title.trim();
|
|
||||||
res.snippet = res.snippet.trim();
|
|
||||||
res.rank = i+1;
|
|
||||||
|
|
||||||
// check if this result has been used before
|
|
||||||
if (all_videos.has(res.title) === false) {
|
|
||||||
cleaned.push(res);
|
|
||||||
}
|
|
||||||
all_videos.add(res.title);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
time: (new Date()).toUTCString(),
|
|
||||||
no_results: no_results,
|
|
||||||
effective_query: effective_query,
|
|
||||||
num_results: '',
|
|
||||||
results: cleaned,
|
|
||||||
}
|
|
||||||
}
|
|
@ -129,6 +129,8 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var results = {};
|
||||||
|
|
||||||
Scraper = {
|
Scraper = {
|
||||||
google: google.GoogleScraper,
|
google: google.GoogleScraper,
|
||||||
google_news_old: google.GoogleNewsOldScraper,
|
google_news_old: google.GoogleNewsOldScraper,
|
||||||
@ -136,28 +138,32 @@ module.exports.handler = async function handler (event, context, callback) {
|
|||||||
google_image: google.GoogleImageScraper,
|
google_image: google.GoogleImageScraper,
|
||||||
bing: bing.BingScraper,
|
bing: bing.BingScraper,
|
||||||
bing_news: bing.BingNewsScraper,
|
bing_news: bing.BingNewsScraper,
|
||||||
|
duckduckgo: duckduckgo.DuckduckgoScraper,
|
||||||
|
duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
|
||||||
|
infospace: infospace.InfospaceScraper,
|
||||||
|
webcrawler: infospace.WebcrawlerNewsScraper,
|
||||||
|
baidu: baidu.BaiduScraper,
|
||||||
|
youtube: youtube.YoutubeScraper,
|
||||||
|
|
||||||
|
yahoo_news: tickersearch.not_implemented,
|
||||||
|
bloomberg: tickersearch.not_implemented,
|
||||||
|
reuters: tickersearch.not_implemented,
|
||||||
|
cnbc: tickersearch.not_implemented,
|
||||||
|
marketwatch: tickersearch.not_implemented,
|
||||||
|
|
||||||
infospace: infospace.scrape_infospace_pup,
|
|
||||||
webcrawler: infospace.scrape_webcrawler_news_pup,
|
|
||||||
baidu: baidu.scrape_baidu_pup,
|
|
||||||
youtube: youtube.scrape_youtube_pup,
|
|
||||||
duckduckgo_news: duckduckgo.scrape_duckduckgo_news_pup,
|
|
||||||
google_dr: google.scrape_google_pup_dr,
|
|
||||||
yahoo_news: tickersearch.scrape_yahoo_finance_pup,
|
|
||||||
bloomberg: tickersearch.scrape_bloomberg_finance_pup,
|
|
||||||
reuters: tickersearch.scrape_reuters_finance_pup,
|
|
||||||
cnbc: tickersearch.scrape_cnbc_finance_pup,
|
|
||||||
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
|
|
||||||
}[config.search_engine];
|
}[config.search_engine];
|
||||||
|
|
||||||
let scraper = new Scraper({
|
if (Scraper === undefined) {
|
||||||
browser: browser,
|
console.info('Currently not implemented search_engine: ', config.search_engine);
|
||||||
config: config,
|
} else {
|
||||||
context: context,
|
let scraper = new Scraper({
|
||||||
pluggable: pluggable,
|
browser: browser,
|
||||||
});
|
config: config,
|
||||||
|
context: context,
|
||||||
let results = await scraper.run();
|
pluggable: pluggable,
|
||||||
|
});
|
||||||
|
var results = await scraper.run();
|
||||||
|
}
|
||||||
|
|
||||||
if (pluggable.close_browser) {
|
if (pluggable.close_browser) {
|
||||||
await pluggable.close_browser();
|
await pluggable.close_browser();
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
const handler = require('./../src/node_scraper.js');
|
const handler = require('./../src/node_scraper.js');
|
||||||
|
|
||||||
var assert = require('chai').assert;
|
var assert = require('chai').assert;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -13,22 +12,26 @@ function sleep(ms) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
const search_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing', 'infospace', 'baidu'];
|
const search_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing', 'infospace', 'duckduckgo'];
|
||||||
|
const keywords = ['news', 'weather'];
|
||||||
|
|
||||||
async function tests() {
|
async function tests() {
|
||||||
|
|
||||||
const keywords = ['Google scraper NikolaiT', 'the idiot'];
|
|
||||||
|
|
||||||
event = {
|
event = {
|
||||||
search_engine: 'google',
|
search_engine: 'google',
|
||||||
compress: 'false',
|
compress: false,
|
||||||
debug: 'false',
|
debug: false,
|
||||||
verbose: 'false',
|
verbose: false,
|
||||||
keywords: keywords,
|
keywords: keywords,
|
||||||
|
keyword_file: '',
|
||||||
|
num_pages: 1,
|
||||||
|
headless: true,
|
||||||
|
output_file: '',
|
||||||
|
block_assets: true,
|
||||||
|
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||||
|
random_user_agent: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
for (var i = 0; i < search_engines.length; i++) {
|
for (let se of search_engines) {
|
||||||
se = search_engines[i];
|
|
||||||
console.log(`Testing ${se}...`);
|
console.log(`Testing ${se}...`);
|
||||||
event.search_engine = se;
|
event.search_engine = se;
|
||||||
await handler.handler(event, undefined, test_case);
|
await handler.handler(event, undefined, test_case);
|
||||||
@ -47,8 +50,7 @@ async function no_results_test() {
|
|||||||
keywords: keywords,
|
keywords: keywords,
|
||||||
};
|
};
|
||||||
|
|
||||||
for (var i = 0; i < search_engines.length; i++) {
|
for (let se of search_engines) {
|
||||||
se = search_engines[i];
|
|
||||||
console.log(`Testing ${se}...`);
|
console.log(`Testing ${se}...`);
|
||||||
event.search_engine = se;
|
event.search_engine = se;
|
||||||
await handler.handler(event, undefined, test_case_no_results);
|
await handler.handler(event, undefined, test_case_no_results);
|
||||||
@ -61,19 +63,18 @@ async function effective_query_test() {
|
|||||||
const keywords = ['mount evverrest'];
|
const keywords = ['mount evverrest'];
|
||||||
|
|
||||||
event = {
|
event = {
|
||||||
write_meta_data: 'true',
|
write_meta_data: true,
|
||||||
job_name: 'test-job',
|
job_name: 'test-job',
|
||||||
search_engine: '',
|
search_engine: '',
|
||||||
compress: 'false',
|
compress: false,
|
||||||
debug: 'false',
|
debug: false,
|
||||||
verbose: 'false',
|
verbose: false,
|
||||||
keywords: keywords,
|
keywords: keywords,
|
||||||
};
|
};
|
||||||
|
|
||||||
const effective_query_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing'];
|
const effective_query_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing'];
|
||||||
|
|
||||||
for (var i = 0; i < effective_query_engines.length; i++) {
|
for (let se of search_engines) {
|
||||||
se = effective_query_engines[i];
|
|
||||||
console.log(`Testing ${se}...`);
|
console.log(`Testing ${se}...`);
|
||||||
event.search_engine = se;
|
event.search_engine = se;
|
||||||
await handler.handler(event, undefined, test_case_effective_query);
|
await handler.handler(event, undefined, test_case_effective_query);
|
||||||
@ -90,27 +91,47 @@ function test_case(err, response) {
|
|||||||
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
|
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
|
||||||
assert.equal(response.statusCode, 200, 'status code must be 200');
|
assert.equal(response.statusCode, 200, 'status code must be 200');
|
||||||
|
|
||||||
for (key in response.results) {
|
let total_rank = 1;
|
||||||
kw = response.results[key];
|
|
||||||
// at least 6 results
|
|
||||||
assert.isAtLeast(kw.results.length, 6, 'results must have at least 6 links');
|
|
||||||
assert.equal(kw.no_results, false, 'no results should be false');
|
|
||||||
assert.typeOf(kw.num_results, 'string', 'num_results must be a string');
|
|
||||||
assert.isAtLeast(kw.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
|
||||||
assert.typeOf(Date.parse(kw.time), 'number', 'time should be a valid date');
|
|
||||||
|
|
||||||
for (let res of kw.results) {
|
for (query in response.results) {
|
||||||
assert.isOk(res.link, 'link must be ok');
|
|
||||||
assert.typeOf(res.link, 'string', 'link must be string');
|
|
||||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
|
||||||
|
|
||||||
assert.isOk(res.title, 'title must be ok');
|
assert.containsAllKeys(response.results, keywords, 'not all keywords were scraped.');
|
||||||
assert.typeOf(res.title, 'string', 'title must be string');
|
|
||||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
|
||||||
|
|
||||||
assert.isOk(res.snippet, 'snippet must be ok');
|
for (page_number in response.results[query]) {
|
||||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
|
||||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||||
|
|
||||||
|
let obj = response.results[query][page_number];
|
||||||
|
|
||||||
|
assert.containsAllKeys(obj, ['results', 'time',], 'not all keys are in the object');
|
||||||
|
|
||||||
|
// at least 6 results
|
||||||
|
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||||
|
// TODO: fix this
|
||||||
|
// assert.equal(obj.no_results, false, 'no results should be false');
|
||||||
|
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||||
|
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||||
|
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||||
|
|
||||||
|
for (let res of obj.results) {
|
||||||
|
|
||||||
|
assert.containsAllKeys(res, ['link', 'title', 'rank'], 'not all keys are in the SERP object');
|
||||||
|
|
||||||
|
assert.isOk(res.link, 'link must be ok');
|
||||||
|
assert.typeOf(res.link, 'string', 'link must be string');
|
||||||
|
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.title, 'title must be ok');
|
||||||
|
assert.typeOf(res.title, 'string', 'title must be string');
|
||||||
|
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||||
|
|
||||||
|
assert.isOk(res.snippet, 'snippet must be ok');
|
||||||
|
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||||
|
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||||
|
|
||||||
|
assert.isNumber(res.rank, 'rank must be integer');
|
||||||
|
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user