tested and works

This commit is contained in:
Nikolai Tschacher 2019-01-30 23:53:09 +01:00
parent 581568ff18
commit 987e3d7342
16 changed files with 608 additions and 599 deletions

View File

@ -26,6 +26,36 @@ Additionally **se-scraper** supports investment ticker search from the following
This module uses puppeteer. It was created by the developer of https://github.com/NikolaiT/GoogleScraper, a module with 1800 stars on GitHub.
### Quickstart
Install with
```bash
npm install se-scraper
```
then create a file with the following contents and start scraping.
```js
const se_scraper = require('se-scraper');
let config = {
search_engine: 'google',
debug: false,
verbose: false,
keywords: ['news', 'scraping scrapeulous.com'],
num_pages: 3,
output_file: 'data.json',
};
function callback(err, response) {
if (err) { console.error(err) }
console.dir(response, {depth: null, colors: true});
}
se_scraper.scrape(config, callback);
```
### Technical Notes
Scraping is done with a headless chromium browser using the automation library puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol.
@ -75,13 +105,7 @@ Consider the following resources:
* https://intoli.com/blog/making-chrome-headless-undetectable/
### Installation and Usage
Install with
```bash
npm install se-scraper
```
### Advanced Usage
Use se-scraper by calling it with a script such as the one below.
@ -162,9 +186,7 @@ Supported options for the `search_engine` config key:
'baidu'
'youtube'
'duckduckgo_news'
'google_dr'
'yahoo_news'
// ticker search
'bloomberg'
'reuters'
'cnbc'

File diff suppressed because one or more lines are too long

17
examples/quickstart.js Normal file
View File

@ -0,0 +1,17 @@
const se_scraper = require('./../index.js');
let config = {
search_engine: 'duckduckgo',
debug: false,
verbose: false,
keywords: ['news'],
num_pages: 2,
output_file: 'data.json',
};
/**
 * Node-style completion handler handed to se_scraper.scrape().
 * Reports the error (when there is one) and then always dumps the response.
 *
 * @param {*} err - Falsy on success; any truthy value is passed to console.error.
 * @param {*} response - Scrape output; printed at unlimited depth with colors.
 */
function callback(err, response) {
    const dumpOptions = {depth: null, colors: true};
    if (err) {
        console.error(err);
    }
    console.dir(response, dumpOptions);
}
se_scraper.scrape(config, callback);

View File

@ -8,11 +8,11 @@ exports.scrape = async function(config, callback) {
// the user agent to scrape with
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
// if random_user_agent is set to True, a random user agent is chosen
random_user_agent: false,
random_user_agent: true,
// whether to select manual settings in visible mode
set_manual_settings: false,
// get meta data of scraping in return object
write_meta_data: true,
write_meta_data: false,
log_http_headers: false,
// how long to sleep between requests. a random sleep interval within the range [a,b]
// is drawn before every request. empty string for no sleeping.

14
run.js
View File

@ -12,23 +12,23 @@ let config = {
// is drawn before every request. empty string for no sleeping.
sleep_range: '[1,2]',
// which search engine to scrape
search_engine: 'google_news',
search_engine: 'google',
// whether debug information should be printed
// debug info is useful for developers when debugging
debug: true,
debug: false,
// whether verbose program output should be printed
// this output is informational
verbose: true,
// an array of keywords to scrape
keywords: ['hacking', 'trump'],
keywords: ['news'],
// alternatively you can specify a keyword_file. this overwrites the keywords array
keyword_file: '',
// the number of pages to scrape for each keyword
num_pages: 1,
num_pages: 2,
// whether to start the browser in headless mode
headless: false,
headless: true,
// path to output file, data will be stored in JSON
output_file: 'data.json',
output_file: '',
// whether to prevent images, css, fonts from being loaded
// will speed up scraping a great deal
block_assets: true,
@ -41,7 +41,7 @@ let config = {
// use a proxy for all connections
// example: 'socks5://78.94.172.42:1080'
// example: 'http://118.174.233.10:48400'
//proxy: 'socks5://78.94.172.42:1080',
proxy: '',
};
function callback(err, response) {

View File

@ -1,109 +1,78 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');
module.exports = {
scrape_baidu_pup: scrape_baidu_pup,
};
class BaiduScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
async function scrape_baidu_pup(page, event, context, pluggable) {
await page.goto('https://www.baidu.com/');
// perform queries
const results = [];
$('#content_left .result').each((i, link) => {
results.push({
link: $(link).find('h3 a').attr('href'),
title: $(link).find('h3').text(),
snippet: $(link).find('.c-abstract').text(),
visible_link: $(link).find('.f13').text(),
})
});
try {
await page.waitForSelector('input[name="wd"]', { timeout: 5000 });
} catch (e) {
return results;
}
let keywords = event.keywords;
var results = {};
for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[name="wd"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
// in baidu we have a issue with waiting for a selector
// or waiting for navigation
// therefore, we just manually sleep
// issue in baidu: https://github.com/GoogleChrome/puppeteer/issues/609
// https://github.com/GoogleChrome/puppeteer/issues/2671
// await page.evaluate( () => {
// if ( ! window.Node ) {
// window.Node = {};
// }
// if ( ! Node.ELEMENT_NODE ) {
// Node.ELEMENT_NODE = 1;
// }
// } );
// await page.waitForSelector('.result', { timeout: 5000 });
// this should be reasonable for normal internet connections
await sfunctions.sleep(2000);
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
let html = await page.content();
results[keyword] = parse(html);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
return {
time: (new Date()).toUTCString(),
no_results: false,
num_results: $('.nums_text').text(),
results: cleaned,
}
}
return results;
async load_start_page() {
try {
await this.page.goto('https://www.baidu.com/');
await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="wd"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await this.page.keyboard.press("Enter");
}
/**
 * Tries to advance to the next result page by clicking a pagination link.
 *
 * @returns {Promise<boolean>} false when no element matches the selector,
 *     true after the link was clicked and the following navigation finished.
 */
// NOTE(review): '.sb_pagN' does not look like a Baidu-specific selector - verify it
// matches Baidu's "next page" link; if it never matches, this method always returns false.
// NOTE(review): page.$ is documented as taking only a selector, so the
// {timeout: 1000} argument is presumably ignored - confirm.
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
// waits without an explicit timeout option for the navigation triggered by the click
await this.page.waitForNavigation();
return true;
}
async wait_for_results() {
// TODO: very very bad, but nobody uses baidu, or does someone?
await this.sleep(2000);
}
async detected() {
}
}
function parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('#content_left .result').each((i, link) => {
results.push({
link: $(link).find('h3 a').attr('href'),
title: $(link).find('h3').text(),
snippet: $(link).find('.c-abstract').text(),
visible_link: $(link).find('.f13').text(),
})
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = i+1;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
no_results: false,
num_results: $('.nums_text').text(),
results: cleaned,
}
}
module.exports = {
BaiduScraper: BaiduScraper,
};

View File

@ -29,7 +29,7 @@ class BingScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}
@ -104,7 +104,7 @@ class BingNewsScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}

View File

@ -1,94 +1,148 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');
module.exports = {
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
};
class DuckduckgoScraper extends Scraper {
async function scrape_duckduckgo_news_pup(page, event, context, pluggable) {
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
try {
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return results;
}
let keywords = event.keywords;
var results = {};
for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
// perform queries
const results = [];
$('.result__body').each((i, link) => {
results.push({
link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(),
date: $(link).find('.result__timestamp').text(),
snippet: $(link).find('.result__snippet').text(),
visible_link: $(link).find('.result__url').attr('href'),
});
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
try {
const input = await page.$('input[name="q"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await sfunctions.sleep(150);
await input.type(keyword);
await sfunctions.sleep(150);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
// await page.waitForSelector('.result--news', { timeout: 5000 });
await page.waitForSelector('.serp__results', { timeout: 5000 });
await sfunctions.sleep(1500);
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
}
let html = await page.content();
results[keyword] = parse_duckduckgo_news_results(html, event.max_results);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
return results;
return {
time: (new Date()).toUTCString(),
results: cleaned
}
}
return results;
async load_start_page() {
try {
await this.page.goto('https://duckduckgo.com/');
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
//await this.page.waitForNavigation();
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
}
async detected() {
}
}
function parse_duckduckgo_news_results(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.result--news').each((i, link) => {
results.push({
link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(),
date: $(link).find('.result__timestamp').text(),
snippet: $(link).find('.result__snippet').text(),
class DuckduckgoNewsScraper extends Scraper {
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.result--news').each((i, link) => {
results.push({
link: $(link).find('.result__title .result__a').attr('href'),
title: $(link).find('.result__title .result__a').text(),
date: $(link).find('.result__timestamp').text(),
snippet: $(link).find('.result__snippet').text(),
});
});
});
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
cleaned.push(res);
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = this.result_rank++;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned
async load_start_page() {
try {
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
}
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value(`input[name="q"]`, keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
/**
 * Tries to advance to the next page of news results by clicking a pagination link.
 *
 * @returns {Promise<boolean>} false when no element matches the selector,
 *     true after the link was clicked and the following navigation finished.
 */
// NOTE(review): DuckduckgoScraper.next_page in this file clicks 'a.result--more__btn'
// and deliberately skips waitForNavigation. '.sb_pagN' here looks copied from another
// engine - verify it exists on the DuckDuckGo news page; if it never matches, only the
// first page is ever scraped.
// NOTE(review): page.$ is documented as taking only a selector, so the
// {timeout: 1000} argument is presumably ignored - confirm.
async next_page() {
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
return true;
}
async wait_for_results() {
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
await this.sleep(1500);
}
async detected() {
}
}
module.exports = {
DuckduckgoNewsScraper: DuckduckgoNewsScraper,
DuckduckgoScraper: DuckduckgoScraper,
};

View File

@ -1,40 +0,0 @@
module.exports = {
no_results: no_results,
effective_query: effective_query,
sleep: sleep,
random_sleep: random_sleep,
set_input_value: set_input_value,
};
/**
 * Waits for an element and then overwrites its `value` property inside the page.
 *
 * @param {object} page - Object exposing waitFor(selector) and evaluate(fn, ...args).
 * @param {string} selector - CSS selector of the input element.
 * @param {string} value - Text to assign to the element's `value`.
 * @returns {Promise<void>}
 */
async function set_input_value(page, selector, value) {
    await page.waitFor(selector);

    // Kept free of outer references: everything it needs arrives as arguments.
    const assignValue = (newValue, cssSelector) => {
        const element = document.querySelector(cssSelector);
        element.value = newValue;
        return newValue;
    };

    await page.evaluate(assignValue, value, selector);
}
/**
 * Tells whether any of the given marker phrases occurs in the text.
 *
 * @param {string[]} needles - Phrases that signal a "no results" page.
 * @param {string} html - Text to search in.
 * @returns {boolean} true when at least one needle is found, false otherwise
 *     (including when `needles` is empty).
 */
function no_results(needles, html) {
    return needles.some((needle) => html.indexOf(needle) !== -1);
}
/**
 * Placeholder: not implemented. Ignores both arguments.
 *
 * @param {string[]} needles - Unused.
 * @param {string} html - Unused.
 * @returns {undefined} Always undefined.
 */
function effective_query(needles, html) {
    return undefined;
}
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay in milliseconds, handed to setTimeout.
 * @returns {Promise<undefined>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    const waitForTimer = (done) => {
        setTimeout(done, ms);
    };
    return new Promise(waitForTimer);
}
/**
 * Sleeps for a random whole number of seconds drawn from config.sleep_range.
 *
 * @param {object} config
 * @param {Iterable<number>} config.sleep_range - Destructured as [min, max];
 *     the drawn value is an integer in the inclusive range min..max.
 *     NOTE(review): must already be an array-like of numbers here - a raw string
 *     such as '[1,2]' would be destructured character by character; confirm the
 *     caller parses it first.
 * @param {boolean} [config.debug] - When strictly true, the chosen duration is logged.
 * @returns {Promise<void>}
 */
async function random_sleep(config) {
    const [min, max] = config.sleep_range;
    const span = max - min + 1;
    const seconds = Math.floor(Math.random() * span + min);

    if (config.debug === true) {
        console.log(`Sleeping for ${seconds}s`);
    }

    await sleep(seconds * 1000);
}

View File

@ -1,5 +1,4 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');
class GoogleScraper extends Scraper {
@ -20,7 +19,7 @@ class GoogleScraper extends Scraper {
})
});
let no_results = sfunctions.no_results(
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for'],
$('#main').text()
@ -35,7 +34,7 @@ class GoogleScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}
@ -108,7 +107,7 @@ class GoogleNewsOldScraper extends Scraper {
})
});
let no_results = sfunctions.no_results(
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
$('#main').text()
@ -123,7 +122,7 @@ class GoogleNewsOldScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}
@ -161,7 +160,7 @@ class GoogleNewsOldScraper extends Scraper {
async wait_for_results() {
//await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
await this.sleep(200);
await this.sleep(500);
}
async detected() {
@ -190,7 +189,7 @@ class GoogleImageScraper extends Scraper {
})
});
let no_results = sfunctions.no_results(
let no_results = this.no_results(
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
'Showing results for', 'Ergebnisse für'],
$('#main').text()
@ -206,7 +205,7 @@ class GoogleImageScraper extends Scraper {
let res = results[i];
if (res.link && res.link.trim() && res.link.trim().length > 10) {
res.link = res.link.trim();
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}
@ -252,7 +251,7 @@ class GoogleImageScraper extends Scraper {
async wait_for_results() {
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
await this.sleep(100);
await this.sleep(500);
}
async detected() {
@ -296,7 +295,7 @@ class GoogleNewsScraper extends Scraper {
this.all_results.add(title);
});
let no_results = sfunctions.no_results(
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
$('body').text()
@ -308,7 +307,7 @@ class GoogleNewsScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}
@ -333,6 +332,7 @@ class GoogleNewsScraper extends Scraper {
// parse here front page results
let html = await this.page.content();
this.results['frontpage'] = this.parse(html);
this.result_rank = 1;
} catch(e) {
return false;
}
@ -367,7 +367,6 @@ class GoogleNewsScraper extends Scraper {
}
}
function clean_image_url(url) {
// Example:
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8

View File

@ -1,186 +1,157 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');
/**
 * Scraper implementation for infospace.com built on the shared Scraper base class.
 * Relies on this.page, this.result_rank, this.no_results, this.set_input_value and
 * this.sleep, which are not defined in this class (presumably supplied by Scraper).
 */
class InfospaceScraper extends Scraper {
/**
 * Extracts the organic results from a result page.
 *
 * @param {string} html - Page source to parse.
 * @returns {{time: string, no_results: *, num_results: string, results: object[]}}
 *     Results that have a non-blank link, each tagged with a `rank`.
 */
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.result').each((i, link) => {
results.push({
link: $(link).find('a.title').attr('href'),
title: $(link).find('a.title').text(),
snippet: $(link).find('.description').text(),
visible_link: $(link).find('.url').text(),
})
});
// keep only entries with a usable link
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
// rank comes from an instance counter, so numbering continues across
// successive parse() calls until the counter is reset elsewhere
res.rank = this.result_rank++;
cleaned.push(res);
}
}
// look for the "nothing found" phrase in the main column text
let no_results = this.no_results(
['No search results were found for'],
$('.layout__mainline').text()
);
return {
time: (new Date()).toUTCString(),
no_results: no_results,
// always the empty string here; no total count is extracted
num_results: '',
results: cleaned,
}
}
/**
 * Opens the start page and waits up to 5 seconds for the search input.
 *
 * @returns {Promise<boolean>} true on success, false if navigation or the wait throws.
 */
async load_start_page() {
try {
await this.page.goto('http://infospace.com/index.html');
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
/**
 * Fills the search box with the keyword and submits it with Enter.
 *
 * @param {string} keyword - Search term to submit.
 */
async search_keyword(keyword) {
// NOTE(review): load_start_page waits for input[name="q"] but this method
// addresses input[id="q"] - presumably the same element; confirm.
const input = await this.page.$('input[id="q"]');
await this.set_input_value('input[id="q"]', keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
/**
 * Clicks the 'a.next' pagination link when present.
 *
 * @returns {Promise<boolean>} false when no such link exists, true after the
 *     click and the following navigation completed.
 */
async next_page() {
// NOTE(review): page.$ is documented as taking only a selector, so the
// {timeout: 1000} argument is presumably ignored - confirm.
let next_page_link = await this.page.$('a.next', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
return true;
}
/**
 * Waits up to 5 seconds for the results container, then pauses another 250 ms.
 */
async wait_for_results() {
await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
await this.sleep(250);
}
/**
 * Empty hook: no detection logic is implemented, so this resolves to undefined.
 */
async detected() {
}
}
/**
 * Scraper implementation for the webcrawler.com news search, built on the shared
 * Scraper base class. Relies on this.page, this.result_rank, this.set_input_value
 * and this.sleep, which are not defined in this class (presumably supplied by Scraper).
 */
class WebcrawlerNewsScraper extends Scraper {
/**
 * Extracts news articles from a result page.
 *
 * @param {string} html - Page source to parse.
 * @returns {{time: string, results: object[]}} Articles that have both a
 *     non-blank link and a non-blank title, each tagged with a `rank`.
 */
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
// perform queries
const results = [];
$('.article').each((i, link) => {
let source = $(link).find('.source').text();
// date is whatever follows the first comma of the source text, or '' if there is none
let date = source.split(',')[1] || '';
results.push({
link: $(link).find('a').attr('href'),
title: $(link).find('.title').text(),
publisher: $(link).find('.source').text(),
date: date,
snippet: $(link).find('.description').text(),
});
});
// keep only entries with a usable link and title
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
// rank comes from an instance counter, so numbering continues across
// successive parse() calls until the counter is reset elsewhere
res.rank = this.result_rank++;
cleaned.push(res);
}
}
return {
time: (new Date()).toUTCString(),
results: cleaned
}
}
/**
 * Opens the news start page and waits up to 5 seconds for the search input.
 *
 * @returns {Promise<boolean>} true on success, false if navigation or the wait throws.
 */
async load_start_page() {
try {
await this.page.goto('https://www.webcrawler.com/?qc=news');
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
} catch (e) {
return false;
}
return true;
}
/**
 * Fills the search box with the keyword and submits it with Enter.
 *
 * @param {string} keyword - Search term to submit.
 */
async search_keyword(keyword) {
const input = await this.page.$('input[name="q"]');
await this.set_input_value('input[name="q"]', keyword);
await this.sleep(50);
await input.focus();
await this.page.keyboard.press("Enter");
}
/**
 * Clicks the '.pagination__num--next' link when present.
 *
 * @returns {Promise<boolean>} false when no such link exists, true after the
 *     click and the following navigation completed.
 */
async next_page() {
// NOTE(review): page.$ is documented as taking only a selector, so the
// {timeout: 1000} argument is presumably ignored - confirm.
let next_page_link = await this.page.$('.pagination__num--next', {timeout: 1000});
if (!next_page_link) {
return false;
}
await next_page_link.click();
await this.page.waitForNavigation();
return true;
}
/**
 * Waits up to 5 seconds for the results container, then pauses another 150 ms.
 */
async wait_for_results() {
await this.page.waitForSelector('.mainline-results', { timeout: 5000 });
await this.sleep(150);
}
/**
 * Empty hook: no detection logic is implemented, so this resolves to undefined.
 */
async detected() {
}
}
module.exports = {
scrape_infospace_pup: scrape_infospace_pup,
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
};
/**
 * Function-style Infospace scraper: submits every keyword through the search box
 * and collects the parsed result page per keyword.
 *
 * @param {object} page - Puppeteer-like page (goto, waitForSelector, $, keyboard,
 *     screenshot, content).
 * @param {object} event - Run configuration; reads `keywords`, `sleep_range`,
 *     `debug` and `is_local`.
 * @param {object} context - Passed through untouched to the pluggable hook.
 * @param {object} pluggable - May provide an async `before_keyword_scraped` hook.
 * @returns {Promise<object>} Map of keyword -> parse(html) output. Empty when the
 *     search box never appears; keywords whose scrape throws are logged and skipped.
 */
async function scrape_infospace_pup(page, event, context, pluggable) {
    // Created up front so the early return below hands back an empty object.
    // It used to be declared after that return and therefore yielded undefined.
    const results = {};

    await page.goto('http://infospace.com/index.html');

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }

    // `keyword` is block-scoped; the former bare assignment created an implicit
    // global (and throws a ReferenceError in strict mode / ES modules).
    for (const keyword of event.keywords) {
        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            const input = await page.$('input[id="q"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await input.type(keyword);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            await page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
            await sfunctions.sleep(250);

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            const html = await page.content();
            results[keyword] = parse(html);
        } catch (e) {
            // best effort: report and continue with the next keyword
            console.error(`Problem with scraping ${keyword}: ${e}`);
        }
    }

    return results;
}
/**
 * Extracts Infospace organic results from a result page.
 *
 * @param {string} html - Page source to parse.
 * @returns {{time: string, no_results: *, num_results: string, results: object[]}}
 *     Results that have a non-blank link; `rank` is the 1-based position among
 *     all '.result' elements found on the page.
 */
function parse(html) {
    const $ = cheerio.load(html);

    // collect one record per '.result' element
    const candidates = [];
    $('.result').each((index, element) => {
        const node = $(element);
        const titleAnchor = node.find('a.title');
        candidates.push({
            link: titleAnchor.attr('href'),
            title: titleAnchor.text(),
            snippet: node.find('.description').text(),
            visible_link: node.find('.url').text(),
        });
    });

    // drop records without a usable link; rank keeps the original position
    const cleaned = [];
    candidates.forEach((res, position) => {
        if (res.link && res.link.trim()) {
            res.rank = position + 1;
            cleaned.push(res);
        }
    });

    const mainColumnText = $('.layout__mainline').text();
    const no_results = sfunctions.no_results(
        ['No search results were found for'],
        mainColumnText
    );

    return {
        time: (new Date()).toUTCString(),
        no_results: no_results,
        num_results: '',
        results: cleaned,
    };
}
/**
 * Function-style Webcrawler news scraper: submits every keyword through the
 * search box and collects the parsed result page per keyword.
 *
 * @param {object} page - Puppeteer-like page (goto, waitForSelector, $, keyboard,
 *     screenshot, content).
 * @param {object} event - Run configuration; reads `keywords`, `sleep_range`,
 *     `debug`, `is_local` and `max_results`.
 * @param {object} context - Passed through untouched to the pluggable hook.
 * @param {object} pluggable - May provide an async `before_keyword_scraped` hook.
 * @returns {Promise<object>} Map of keyword -> parsed results. Empty when the
 *     search box never appears. The first keyword whose scrape throws is logged
 *     and ends the run, returning what was collected so far.
 */
async function scrape_webcrawler_news_pup(page, event, context, pluggable) {
    // Created up front so the early return below hands back an empty object.
    // It used to be declared after that return and therefore yielded undefined.
    const results = {};

    await page.goto('https://www.webcrawler.com/?qc=news');

    try {
        await page.waitForSelector('input[name="q"]', { timeout: 5000 });
    } catch (e) {
        return results;
    }

    // `keyword` is block-scoped; the former bare assignment created an implicit
    // global (and throws a ReferenceError in strict mode / ES modules).
    for (const keyword of event.keywords) {
        if (pluggable.before_keyword_scraped) {
            await pluggable.before_keyword_scraped({
                keyword: keyword,
                page: page,
                event: event,
                context: context,
            });
        }

        try {
            const input = await page.$('input[name="q"]');
            // overwrites last text in input
            await input.click({ clickCount: 3 });
            await sfunctions.sleep(150);
            await input.type(keyword);
            await sfunctions.sleep(150);
            await input.focus();
            await page.keyboard.press("Enter");

            if (event.sleep_range) {
                await sfunctions.random_sleep(event);
            }

            await page.waitForSelector('.mainline-results', { timeout: 5000 });

            if (event.debug === true && event.is_local === true) {
                await page.screenshot({path: `debug/${keyword}.png`});
            }

            const html = await page.content();
            results[keyword] = parse_webcrawler_news_results(html, event.max_results);
        } catch (e) {
            // unlike a best-effort loop, a failure here aborts the remaining keywords
            console.error(`Problem with scraping ${keyword}: ${e}`);
            return results;
        }
    }

    return results;
}
/**
 * Extracts Webcrawler news articles from a result page.
 *
 * @param {string} html - Page source to parse.
 * @returns {{time: string, results: object[]}} Articles that have both a
 *     non-blank link and a non-blank title; `rank` is the 1-based position among
 *     all '.article' elements found on the page.
 */
function parse_webcrawler_news_results(html) {
    const $ = cheerio.load(html);

    // collect one record per '.article' element
    const articles = [];
    $('.article').each((index, element) => {
        const article = $(element);
        const sourceText = article.find('.source').text();
        articles.push({
            link: article.find('a').attr('href'),
            title: article.find('.title').text(),
            publisher: sourceText,
            // text after the first comma of the source line, '' when absent
            date: sourceText.split(',')[1] || '',
            snippet: article.find('.description').text(),
        });
    });

    // drop incomplete records; rank keeps the original position
    const cleaned = [];
    articles.forEach((res, position) => {
        if (res.link && res.link.trim() && res.title && res.title.trim()) {
            res.rank = position + 1;
            cleaned.push(res);
        }
    });

    return {
        time: (new Date()).toUTCString(),
        results: cleaned,
    };
}
InfospaceScraper: InfospaceScraper,
WebcrawlerNewsScraper: WebcrawlerNewsScraper,
};

View File

@ -81,6 +81,8 @@ module.exports = class Scraper {
*/
async scraping_loop() {
this.result_rank = 1;
for (let keyword of this.config.keywords) {
this.keyword = keyword;
this.results[keyword] = {};
@ -121,7 +123,7 @@ module.exports = class Scraper {
break;
}
} while (page_num < event.num_pages);
} while (page_num <= event.num_pages);
} catch (e) {

View File

@ -1,5 +1,4 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
module.exports = {
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,
@ -7,9 +6,14 @@ module.exports = {
scrape_reuters_finance_pup: scrape_reuters_finance_pup,
scrape_cnbc_finance_pup: scrape_cnbc_finance_pup,
scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup,
not_implemented: undefined,
};
// https://www.google.com/search?q=MSFT&tbm=fin
/**
 * Promise-based delay used between page interactions.
 *
 * @param {number} ms - Delay in milliseconds, handed to setTimeout.
 * @returns {Promise<undefined>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
var results = {};
@ -40,7 +44,7 @@ async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}
await sfunctions.sleep(1000);
await sleep(1000);
let html = await page.content();
results[keyword] = parse(html);
@ -90,7 +94,7 @@ async function scrape_marketwatch_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}
await sfunctions.sleep(500);
await sleep(500);
let newsData = await page.evaluate(() => {
let results = [];
@ -150,7 +154,7 @@ async function scrape_bloomberg_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}
await sfunctions.sleep(1000);
await sleep(1000);
let news_items = await page.$x('//*[starts-with(@class,"newsItem")]');
for (let item of news_items) {
@ -189,7 +193,7 @@ async function scrape_reuters_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}
await sfunctions.sleep(500);
await sleep(500);
let newsData = await page.evaluate(() => {
let results = [];
@ -246,7 +250,7 @@ async function scrape_cnbc_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}
await sfunctions.sleep(500);
await sleep(500);
let newsData = await page.evaluate(() => {
let results = [];

View File

@ -1,121 +1,105 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');
module.exports = {
scrape_youtube_pup: scrape_youtube_pup,
};
class YoutubeScraper extends Scraper {
const all_videos = new Set();
parse(html) {
// load the page source into cheerio
const $ = cheerio.load(html);
async function scrape_youtube_pup(page, event, context, pluggable) {
await page.goto('https://www.youtube.com');
// perform queries
const results = [];
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
results.push({
link: $(link).find('#video-title').attr('href'),
title: $(link).find('#video-title').text(),
snippet: $(link).find('#description-text').text(),
channel: $(link).find('#byline a').text(),
channel_link: $(link).find('#byline a').attr('href'),
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
})
});
try {
await page.waitForSelector('input[id="search"]', { timeout: 5000 });
} catch (e) {
return results;
}
let no_results = this.no_results(
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
$('yt-showing-results-for-renderer').text()
);
let keywords = event.keywords;
var results = {};
let effective_query = $('#corrected-link').text() || '';
// before we do anything, parse the results of the front page of youtube
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
await sfunctions.sleep(500);
const cleaned = [];
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.title = res.title.trim();
res.snippet = res.snippet.trim();
res.rank = this.result_rank++;
let html = await page.content();
results['__frontpage__'] = parse(html);
for (var i = 0; i < keywords.length; i++) {
keyword = keywords[i];
if (pluggable.before_keyword_scraped) {
await pluggable.before_keyword_scraped({
keyword: keyword,
page: page,
event: event,
context: context,
});
}
try {
const input = await page.$('input[id="search"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await page.keyboard.press("Enter");
if (event.sleep_range) {
await sfunctions.random_sleep(event);
}
await page.waitForFunction(`document.title.indexOf('${keyword}') !== -1`, { timeout: 5000 });
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
await sfunctions.sleep(500);
if (event.debug === true && event.is_local === true) {
await page.screenshot({path: `debug/${keyword}.png`});
// check if this result has been used before
if (this.all_videos.has(res.title) === false) {
cleaned.push(res);
}
this.all_videos.add(res.title);
}
}
let html = await page.content();
results[keyword] = parse(html);
} catch (e) {
console.error(`Problem with scraping ${keyword}: ${e}`);
return {
time: (new Date()).toUTCString(),
no_results: no_results,
effective_query: effective_query,
num_results: '',
results: cleaned,
}
}
return results;
async load_start_page() {
try {
this.all_videos = new Set();
await this.page.goto('https://www.youtube.com', {
referer: 'https://google.com'
});
await this.page.waitForSelector('input[id="search"]', { timeout: 5000 });
// before we do anything, parse the results of the front page of youtube
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
await this.sleep(500);
let html = await this.page.content();
this.results['frontpage'] = this.parse(html);
this.result_rank = 1;
} catch(e) {
return false;
}
return true;
}
async search_keyword(keyword) {
const input = await this.page.$('input[id="search"]');
// overwrites last text in input
await input.click({ clickCount: 3 });
await input.type(keyword);
await input.focus();
await this.page.keyboard.press("Enter");
}
async next_page() {
// youtube needs scrolling
// TODO: implement scrolling, no priority right now
return false;
}
async wait_for_results() {
await this.page.waitForFunction(`document.title.indexOf('${this.keyword}') !== -1`, { timeout: 5000 });
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
await this.sleep(500);
}
async detected() {
const title = await this.page.title();
let html = await this.page.content();
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
}
}
/**
 * Extract the video entries from the HTML of a YouTube page.
 *
 * Entries without a non-blank link and title are dropped. Titles already
 * present in the `all_videos` collection are dropped as well, and every
 * accepted title is added to that collection. `rank` is the 1-based position
 * of the entry among all matched elements, counted before filtering.
 *
 * @param html - page source to load into cheerio.
 * @returns {{time: string, no_results: *, effective_query: string,
 *   num_results: string, results: Array<Object>}} parse result; `no_results`
 *   is whatever `sfunctions.no_results` returns.
 */
function parse(html) {
    // load the page source into cheerio
    const $ = cheerio.load(html);

    // one raw record per video element found in the content area
    const results = $('#contents ytd-video-renderer,#contents ytd-grid-video-renderer')
        .toArray()
        .map((element) => {
            const video = $(element);
            const title_node = video.find('#video-title');
            const channel_node = video.find('#byline a');
            return {
                link: title_node.attr('href'),
                title: title_node.text(),
                snippet: video.find('#description-text').text(),
                channel: channel_node.text(),
                channel_link: channel_node.attr('href'),
                num_views: video.find('#metadata-line span:nth-child(1)').text(),
                release_date: video.find('#metadata-line span:nth-child(2)').text(),
            };
        });

    const no_results = sfunctions.no_results(
        ['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
        $('yt-showing-results-for-renderer').text()
    );

    const effective_query = $('#corrected-link').text() || '';

    const cleaned = [];
    results.forEach((res, index) => {
        const has_link = res.link && res.link.trim();
        const has_title = res.title && res.title.trim();
        if (!has_link || !has_title) {
            return;
        }

        res.title = res.title.trim();
        res.snippet = res.snippet.trim();
        res.rank = index + 1;

        // only keep results whose title has not been used before
        const is_new = all_videos.has(res.title) === false;
        if (is_new) {
            cleaned.push(res);
        }
        all_videos.add(res.title);
    });

    return {
        time: (new Date()).toUTCString(),
        no_results: no_results,
        effective_query: effective_query,
        num_results: '',
        results: cleaned,
    };
}
// Public interface of this module: exposes YoutubeScraper to requiring code.
module.exports = { YoutubeScraper };

View File

@ -129,6 +129,8 @@ module.exports.handler = async function handler (event, context, callback) {
}
}
var results = {};
Scraper = {
google: google.GoogleScraper,
google_news_old: google.GoogleNewsOldScraper,
@ -136,28 +138,32 @@ module.exports.handler = async function handler (event, context, callback) {
google_image: google.GoogleImageScraper,
bing: bing.BingScraper,
bing_news: bing.BingNewsScraper,
duckduckgo: duckduckgo.DuckduckgoScraper,
duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
infospace: infospace.InfospaceScraper,
webcrawler: infospace.WebcrawlerNewsScraper,
baidu: baidu.BaiduScraper,
youtube: youtube.YoutubeScraper,
yahoo_news: tickersearch.not_implemented,
bloomberg: tickersearch.not_implemented,
reuters: tickersearch.not_implemented,
cnbc: tickersearch.not_implemented,
marketwatch: tickersearch.not_implemented,
infospace: infospace.scrape_infospace_pup,
webcrawler: infospace.scrape_webcrawler_news_pup,
baidu: baidu.scrape_baidu_pup,
youtube: youtube.scrape_youtube_pup,
duckduckgo_news: duckduckgo.scrape_duckduckgo_news_pup,
google_dr: google.scrape_google_pup_dr,
yahoo_news: tickersearch.scrape_yahoo_finance_pup,
bloomberg: tickersearch.scrape_bloomberg_finance_pup,
reuters: tickersearch.scrape_reuters_finance_pup,
cnbc: tickersearch.scrape_cnbc_finance_pup,
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[config.search_engine];
let scraper = new Scraper({
browser: browser,
config: config,
context: context,
pluggable: pluggable,
});
let results = await scraper.run();
if (Scraper === undefined) {
console.info('Currently not implemented search_engine: ', config.search_engine);
} else {
let scraper = new Scraper({
browser: browser,
config: config,
context: context,
pluggable: pluggable,
});
var results = await scraper.run();
}
if (pluggable.close_browser) {
await pluggable.close_browser();

View File

@ -1,5 +1,4 @@
const handler = require('./../src/node_scraper.js');
var assert = require('chai').assert;
/*
@ -13,22 +12,26 @@ function sleep(ms) {
})
}
const search_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing', 'infospace', 'baidu'];
const search_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing', 'infospace', 'duckduckgo'];
const keywords = ['news', 'weather'];
async function tests() {
const keywords = ['Google scraper NikolaiT', 'the idiot'];
event = {
search_engine: 'google',
compress: 'false',
debug: 'false',
verbose: 'false',
compress: false,
debug: false,
verbose: false,
keywords: keywords,
keyword_file: '',
num_pages: 1,
headless: true,
output_file: '',
block_assets: true,
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
random_user_agent: false,
};
for (var i = 0; i < search_engines.length; i++) {
se = search_engines[i];
for (let se of search_engines) {
console.log(`Testing ${se}...`);
event.search_engine = se;
await handler.handler(event, undefined, test_case);
@ -47,8 +50,7 @@ async function no_results_test() {
keywords: keywords,
};
for (var i = 0; i < search_engines.length; i++) {
se = search_engines[i];
for (let se of search_engines) {
console.log(`Testing ${se}...`);
event.search_engine = se;
await handler.handler(event, undefined, test_case_no_results);
@ -61,19 +63,18 @@ async function effective_query_test() {
const keywords = ['mount evverrest'];
event = {
write_meta_data: 'true',
write_meta_data: true,
job_name: 'test-job',
search_engine: '',
compress: 'false',
debug: 'false',
verbose: 'false',
compress: false,
debug: false,
verbose: false,
keywords: keywords,
};
const effective_query_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing'];
for (var i = 0; i < effective_query_engines.length; i++) {
se = effective_query_engines[i];
for (let se of search_engines) {
console.log(`Testing ${se}...`);
event.search_engine = se;
await handler.handler(event, undefined, test_case_effective_query);
@ -90,27 +91,47 @@ function test_case(err, response) {
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
assert.equal(response.statusCode, 200, 'status code must be 200');
for (key in response.results) {
kw = response.results[key];
// at least 6 results
assert.isAtLeast(kw.results.length, 6, 'results must have at least 6 links');
assert.equal(kw.no_results, false, 'no results should be false');
assert.typeOf(kw.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(kw.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(kw.time), 'number', 'time should be a valid date');
let total_rank = 1;
for (let res of kw.results) {
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
for (query in response.results) {
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.containsAllKeys(response.results, keywords, 'not all keywords were scraped.');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
for (page_number in response.results[query]) {
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
let obj = response.results[query][page_number];
assert.containsAllKeys(obj, ['results', 'time',], 'not all keys are in the object');
// at least 6 results
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
// TODO: fix this
// assert.equal(obj.no_results, false, 'no results should be false');
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
for (let res of obj.results) {
assert.containsAllKeys(res, ['link', 'title', 'rank'], 'not all keys are in the SERP object');
assert.isOk(res.link, 'link must be ok');
assert.typeOf(res.link, 'string', 'link must be string');
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
assert.isOk(res.title, 'title must be ok');
assert.typeOf(res.title, 'string', 'title must be string');
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
assert.isOk(res.snippet, 'snippet must be ok');
assert.typeOf(res.snippet, 'string', 'snippet must be string');
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
assert.isNumber(res.rank, 'rank must be integer');
assert.equal(res.rank, total_rank++, 'rank ist wrong');
}
}
}
}