mirror of https://github.com/NikolaiT/se-scraper.git
synced 2025-02-16 16:50:45 +01:00

tested and works

This commit is contained in:
parent 581568ff18
commit 987e3d7342

README.md (40 changed lines)
@@ -26,6 +26,36 @@ Additionally **se-scraper** supports investment ticker search from the following

This module uses puppeteer. It was created by the developer of https://github.com/NikolaiT/GoogleScraper, a module with 1,800 stars on GitHub.

### Quickstart

Install with

```bash
npm install se-scraper
```

Then create a file with the following contents and start scraping:

```js
const se_scraper = require('se-scraper');

let config = {
    search_engine: 'google',
    debug: false,
    verbose: false,
    keywords: ['news', 'scraping scrapeulous.com'],
    num_pages: 3,
    output_file: 'data.json',
};

function callback(err, response) {
    if (err) { console.error(err) }
    console.dir(response, {depth: null, colors: true});
}

se_scraper.scrape(config, callback);
```

### Technical Notes

Scraping is done with a headless Chromium browser using the automation library Puppeteer. Puppeteer is a Node library which provides a high-level API to control headless Chrome or Chromium over the DevTools Protocol; a minimal sketch of this kind of browser control follows.
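
As an illustration only (not code from this commit), driving a headless browser with Puppeteer roughly looks like this:

```js
// Minimal Puppeteer sketch of the browser control described above (illustrative only).
const puppeteer = require('puppeteer');

(async () => {
    // launch headless Chromium, controlled over the DevTools Protocol
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto('https://duckduckgo.com/');
    const html = await page.content(); // full page source, ready for parsing (e.g. with cheerio)
    console.log(html.length);
    await browser.close();
})();
```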
@@ -75,13 +105,7 @@ Consider the following resources:

* https://intoli.com/blog/making-chrome-headless-undetectable/

### Installation and Usage

Install with

```bash
npm install se-scraper
```

### Advanced Usage

Use se-scraper by calling it with a script such as the one below.

@@ -162,9 +186,7 @@ Supported options for the `search_engine` config key:

'baidu'
'youtube'
'duckduckgo_news'
'google_dr'
'yahoo_news'
// ticker search
'bloomberg'
'reuters'
'cnbc'
examples/quickstart.js (new file, 17 lines)
@@ -0,0 +1,17 @@
const se_scraper = require('./../index.js');

let config = {
    search_engine: 'duckduckgo',
    debug: false,
    verbose: false,
    keywords: ['news'],
    num_pages: 2,
    output_file: 'data.json',
};

function callback(err, response) {
    if (err) { console.error(err) }
    console.dir(response, {depth: null, colors: true});
}

se_scraper.scrape(config, callback);
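
The example file added above can be run directly with Node from the repository root (assuming dependencies have been installed with `npm install`):

```bash
node examples/quickstart.js
```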
index.js (4 changed lines)
@@ -8,11 +8,11 @@ exports.scrape = async function(config, callback) {
    // the user agent to scrape with
    user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    // if random_user_agent is set to True, a random user agent is chosen
    random_user_agent: false,
    random_user_agent: true,
    // whether to select manual settings in visible mode
    set_manual_settings: false,
    // get meta data of scraping in return object
    write_meta_data: true,
    write_meta_data: false,
    log_http_headers: false,
    // how long to sleep between requests. a random sleep interval within the range [a,b]
    // is drawn before every request. empty string for no sleeping.
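
The sleep behaviour referenced in the two comments above draws a random whole number of seconds from the configured `sleep_range` before each request. A sketch of that logic, mirroring the `random_sleep` helper that appears later in this diff (the exact current implementation may differ):

```js
// Sketch of the sleep_range behaviour described above (based on the random_sleep
// helper visible elsewhere in this diff; assumes sleep_range is [min, max] in seconds).
function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}

async function random_sleep(config) {
    const [min, max] = config.sleep_range;                           // e.g. [1, 2]
    const rand = Math.floor(Math.random() * (max - min + 1) + min);  // random integer in [min, max]
    if (config.debug === true) {
        console.log(`Sleeping for ${rand}s`);
    }
    await sleep(rand * 1000);                                        // seconds -> milliseconds
}
```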
run.js (14 changed lines)
@@ -12,23 +12,23 @@ let config = {
    // is drawn before every request. empty string for no sleeping.
    sleep_range: '[1,2]',
    // which search engine to scrape
    search_engine: 'google_news',
    search_engine: 'google',
    // whether debug information should be printed
    // debug info is useful for developers when debugging
    debug: true,
    debug: false,
    // whether verbose program output should be printed
    // this output is informational
    verbose: true,
    // an array of keywords to scrape
    keywords: ['hacking', 'trump'],
    keywords: ['news'],
    // alternatively you can specify a keyword_file. this overwrites the keywords array
    keyword_file: '',
    // the number of pages to scrape for each keyword
    num_pages: 1,
    num_pages: 2,
    // whether to start the browser in headless mode
    headless: false,
    headless: true,
    // path to output file, data will be stored in JSON
    output_file: 'data.json',
    output_file: '',
    // whether to prevent images, css, fonts from being loaded
    // will speed up scraping a great deal
    block_assets: true,

@@ -41,7 +41,7 @@ let config = {
    // use a proxy for all connections
    // example: 'socks5://78.94.172.42:1080'
    // example: 'http://118.174.233.10:48400'
    //proxy: 'socks5://78.94.172.42:1080',
    proxy: '',
};

function callback(err, response) {
@ -1,109 +1,78 @@
|
||||
const cheerio = require('cheerio');
|
||||
const sfunctions = require('./functions.js');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
module.exports = {
|
||||
scrape_baidu_pup: scrape_baidu_pup,
|
||||
};
|
||||
class BaiduScraper extends Scraper {
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
async function scrape_baidu_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.baidu.com/');
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#content_left .result').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('h3 a').attr('href'),
|
||||
title: $(link).find('h3').text(),
|
||||
snippet: $(link).find('.c-abstract').text(),
|
||||
visible_link: $(link).find('.f13').text(),
|
||||
})
|
||||
});
|
||||
|
||||
try {
|
||||
await page.waitForSelector('input[name="wd"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return results;
|
||||
}
|
||||
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="wd"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await input.focus();
|
||||
await page.keyboard.press("Enter");
|
||||
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
|
||||
// in baidu we have a issue with waiting for a selector
|
||||
// or waiting for navigation
|
||||
// therefore, we just manually sleep
|
||||
|
||||
// issue in baidu: https://github.com/GoogleChrome/puppeteer/issues/609
|
||||
// https://github.com/GoogleChrome/puppeteer/issues/2671
|
||||
// await page.evaluate( () => {
|
||||
// if ( ! window.Node ) {
|
||||
// window.Node = {};
|
||||
// }
|
||||
// if ( ! Node.ELEMENT_NODE ) {
|
||||
// Node.ELEMENT_NODE = 1;
|
||||
// }
|
||||
// } );
|
||||
// await page.waitForSelector('.result', { timeout: 5000 });
|
||||
|
||||
// this should be reasonable for normal internet connections
|
||||
await sfunctions.sleep(2000);
|
||||
|
||||
if (event.debug === true && event.is_local === true) {
|
||||
await page.screenshot({path: `debug/${keyword}.png`});
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
let html = await page.content();
|
||||
results[keyword] = parse(html);
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: false,
|
||||
num_results: $('.nums_text').text(),
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('https://www.baidu.com/');
|
||||
await this.page.waitForSelector('input[name="wd"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="wd"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
// TODO: very very bad, but nobody uses baidu, or does someone?
|
||||
await this.sleep(2000);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
function parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#content_left .result').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('h3 a').attr('href'),
|
||||
title: $(link).find('h3').text(),
|
||||
snippet: $(link).find('.c-abstract').text(),
|
||||
visible_link: $(link).find('.f13').text(),
|
||||
})
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim()) {
|
||||
res.rank = i+1;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: false,
|
||||
num_results: $('.nums_text').text(),
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
module.exports = {
|
||||
BaiduScraper: BaiduScraper,
|
||||
};
|
@@ -29,7 +29,7 @@ class BingScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}

@@ -104,7 +104,7 @@ class BingNewsScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}
@ -1,94 +1,148 @@
|
||||
const cheerio = require('cheerio');
|
||||
const sfunctions = require('./functions.js');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
module.exports = {
|
||||
scrape_duckduckgo_news_pup: scrape_duckduckgo_news_pup,
|
||||
};
|
||||
class DuckduckgoScraper extends Scraper {
|
||||
|
||||
async function scrape_duckduckgo_news_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
try {
|
||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return results;
|
||||
}
|
||||
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result__body').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('.result__title .result__a').attr('href'),
|
||||
title: $(link).find('.result__title .result__a').text(),
|
||||
date: $(link).find('.result__timestamp').text(),
|
||||
snippet: $(link).find('.result__snippet').text(),
|
||||
visible_link: $(link).find('.result__url').attr('href'),
|
||||
});
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="q"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await sfunctions.sleep(150);
|
||||
await input.type(keyword);
|
||||
await sfunctions.sleep(150);
|
||||
await input.focus();
|
||||
await page.keyboard.press("Enter");
|
||||
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
|
||||
// await page.waitForSelector('.result--news', { timeout: 5000 });
|
||||
await page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||
|
||||
await sfunctions.sleep(1500);
|
||||
|
||||
if (event.debug === true && event.is_local === true) {
|
||||
await page.screenshot({path: `debug/${keyword}.png`});
|
||||
}
|
||||
let html = await page.content();
|
||||
results[keyword] = parse_duckduckgo_news_results(html, event.max_results);
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
||||
return results;
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: cleaned
|
||||
}
|
||||
}
|
||||
return results;
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('https://duckduckgo.com/');
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value(`input[name="q"]`, keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('a.result--more__btn', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
//await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
function parse_duckduckgo_news_results(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result--news').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('.result__title .result__a').attr('href'),
|
||||
title: $(link).find('.result__title .result__a').text(),
|
||||
date: $(link).find('.result__timestamp').text(),
|
||||
snippet: $(link).find('.result__snippet').text(),
|
||||
class DuckduckgoNewsScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result--news').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('.result__title .result__a').attr('href'),
|
||||
title: $(link).find('.result__title .result__a').text(),
|
||||
date: $(link).find('.result__timestamp').text(),
|
||||
snippet: $(link).find('.result__snippet').text(),
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = i+1;
|
||||
cleaned.push(res);
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: cleaned
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: cleaned
|
||||
async load_start_page() {
|
||||
try {
|
||||
await page.goto('https://duckduckgo.com/?q=42&t=h_&iar=news&ia=news');
|
||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value(`input[name="q"]`, keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.serp__results', { timeout: 5000 });
|
||||
await this.sleep(1500);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
DuckduckgoNewsScraper: DuckduckgoNewsScraper,
|
||||
DuckduckgoScraper: DuckduckgoScraper,
|
||||
};
|
@@ -1,40 +0,0 @@
module.exports = {
no_results: no_results,
effective_query: effective_query,
sleep: sleep,
random_sleep: random_sleep,
set_input_value: set_input_value,

};

async function set_input_value(page, selector, value) {
await page.waitFor(selector);
await page.evaluate((value, selector) => {
return document.querySelector(selector).value = value;
}, value, selector);
}

function no_results(needles, html) {
return !needles.map((needle) => { return html.indexOf(needle)})
.every((res) => { return res == -1});
}

function effective_query(needles, html) {
return;
}

function sleep(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}

async function random_sleep(config) {
var min, max;
[min, max] = config.sleep_range;
var rand = Math.floor(Math.random() * (max - min + 1) + min); //Generate Random number
if (config.debug === true) {
console.log(`Sleeping for ${rand}s`);
}
await sleep(rand * 1000);
}
@@ -1,5 +1,4 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');
const Scraper = require('./se_scraper');

class GoogleScraper extends Scraper {

@@ -20,7 +19,7 @@ class GoogleScraper extends Scraper {
})
});

let no_results = sfunctions.no_results(
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for'],
$('#main').text()

@@ -35,7 +34,7 @@ class GoogleScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim() && res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}

@@ -108,7 +107,7 @@ class GoogleNewsOldScraper extends Scraper {
})
});

let no_results = sfunctions.no_results(
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
$('#main').text()

@@ -123,7 +122,7 @@ class GoogleNewsOldScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.link && res.link.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}

@@ -161,7 +160,7 @@ class GoogleNewsOldScraper extends Scraper {
async wait_for_results() {
//await this.page.waitForNavigation({ timeout: this.STANDARD_TIMEOUT });
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
await this.sleep(200);
await this.sleep(500);
}

async detected() {

@@ -190,7 +189,7 @@ class GoogleImageScraper extends Scraper {
})
});

let no_results = sfunctions.no_results(
let no_results = this.no_results(
['stimmt mit keinem Bildergebnis', 'Keine Ergebnisse für', 'not match any image results', 'No results found for',
'Showing results for', 'Ergebnisse für'],
$('#main').text()

@@ -206,7 +205,7 @@ class GoogleImageScraper extends Scraper {
let res = results[i];
if (res.link && res.link.trim() && res.link.trim().length > 10) {
res.link = res.link.trim();
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}

@@ -252,7 +251,7 @@ class GoogleImageScraper extends Scraper {

async wait_for_results() {
await this.page.waitForSelector('#main', { timeout: this.STANDARD_TIMEOUT });
await this.sleep(100);
await this.sleep(500);
}

async detected() {

@@ -296,7 +295,7 @@ class GoogleNewsScraper extends Scraper {
this.all_results.add(title);
});

let no_results = sfunctions.no_results(
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
'No results found for', 'Ergebnisse für', 'Showing results for', 'did not match any news results'],
$('body').text()

@@ -308,7 +307,7 @@ class GoogleNewsScraper extends Scraper {
for (var i=0; i < results.length; i++) {
let res = results[i];
if (res.title && res.title.trim()) {
res.rank = i+1;
res.rank = this.result_rank++;
cleaned.push(res);
}
}

@@ -333,6 +332,7 @@ class GoogleNewsScraper extends Scraper {
// parse here front page results
let html = await this.page.content();
this.results['frontpage'] = this.parse(html);
this.result_rank = 1;
} catch(e) {
return false;
}

@@ -367,7 +367,6 @@ class GoogleNewsScraper extends Scraper {
}
}


function clean_image_url(url) {
// Example:
// https://www.google.com/imgres?imgurl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fen%2Fthumb%2Ff%2Ffd%2F1928_Edward_Campbell.jpg%2F220px-1928_Edward_Campbell.jpg&imgrefurl=https%3A%2F%2Fwww.revolvy.com%2Fpage%2FSir-Edward-Campbell%252C-1st-Baronet&docid=BMkW_GerTIY4GM&tbnid=TmQapIxDCQbQhM%3A&vet=10ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ..i&w=220&h=290&bih=1696&biw=1280&q=John%20MacLeod%20Breadalbane%20Councillor%20Prince%20Edward%20Island&ved=0ahUKEwje_LLE_YXeAhXisaQKHVAEBSAQMwiNAShEMEQ&iact=mrc&uact=8
@ -1,186 +1,157 @@
|
||||
const cheerio = require('cheerio');
|
||||
const sfunctions = require('./functions.js');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
class InfospaceScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('a.title').attr('href'),
|
||||
title: $(link).find('a.title').text(),
|
||||
snippet: $(link).find('.description').text(),
|
||||
visible_link: $(link).find('.url').text(),
|
||||
})
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
let no_results = this.no_results(
|
||||
['No search results were found for'],
|
||||
$('.layout__mainline').text()
|
||||
);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: no_results,
|
||||
num_results: '',
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('http://infospace.com/index.html');
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[id="q"]');
|
||||
await this.set_input_value('input[id="q"]', keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('a.next', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
|
||||
await this.sleep(250);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
class WebcrawlerNewsScraper extends Scraper {
|
||||
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.article').each((i, link) => {
|
||||
let source = $(link).find('.source').text();
|
||||
let date = source.split(',')[1] || '';
|
||||
results.push({
|
||||
link: $(link).find('a').attr('href'),
|
||||
title: $(link).find('.title').text(),
|
||||
publisher: $(link).find('.source').text(),
|
||||
date: date,
|
||||
snippet: $(link).find('.description').text(),
|
||||
});
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = this.result_rank++;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: cleaned
|
||||
}
|
||||
}
|
||||
|
||||
async load_start_page() {
|
||||
try {
|
||||
await this.page.goto('https://www.webcrawler.com/?qc=news');
|
||||
await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[name="q"]');
|
||||
await this.set_input_value('input[name="q"]', keyword);
|
||||
await this.sleep(50);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
let next_page_link = await this.page.$('.pagination__num--next', {timeout: 1000});
|
||||
if (!next_page_link) {
|
||||
return false;
|
||||
}
|
||||
await next_page_link.click();
|
||||
await this.page.waitForNavigation();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForSelector('.mainline-results', { timeout: 5000 });
|
||||
await this.sleep(150);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
scrape_infospace_pup: scrape_infospace_pup,
|
||||
scrape_webcrawler_news_pup: scrape_webcrawler_news_pup,
|
||||
};
|
||||
|
||||
async function scrape_infospace_pup(page, event, context, pluggable) {
|
||||
await page.goto('http://infospace.com/index.html');
|
||||
|
||||
try {
|
||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return results;
|
||||
}
|
||||
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[id="q"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await input.focus();
|
||||
await page.keyboard.press("Enter");
|
||||
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
|
||||
await page.waitForSelector('.mainline-results', { timeout: 5000 }); // TODO: this is not the best selector.
|
||||
await sfunctions.sleep(250);
|
||||
if (event.debug === true && event.is_local === true) {
|
||||
await page.screenshot({path: `debug/${keyword}.png`});
|
||||
}
|
||||
|
||||
let html = await page.content();
|
||||
results[keyword] = parse(html);
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
function parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.result').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('a.title').attr('href'),
|
||||
title: $(link).find('a.title').text(),
|
||||
snippet: $(link).find('.description').text(),
|
||||
visible_link: $(link).find('.url').text(),
|
||||
})
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim()) {
|
||||
res.rank = i+1;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
let no_results = sfunctions.no_results(
|
||||
['No search results were found for'],
|
||||
$('.layout__mainline').text()
|
||||
);
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: no_results,
|
||||
num_results: '',
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
|
||||
async function scrape_webcrawler_news_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.webcrawler.com/?qc=news');
|
||||
|
||||
try {
|
||||
await page.waitForSelector('input[name="q"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return results;
|
||||
}
|
||||
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[name="q"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await sfunctions.sleep(150);
|
||||
await input.type(keyword);
|
||||
await sfunctions.sleep(150);
|
||||
await input.focus();
|
||||
await page.keyboard.press("Enter");
|
||||
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
|
||||
await page.waitForSelector('.mainline-results', { timeout: 5000 });
|
||||
|
||||
if (event.debug === true && event.is_local === true) {
|
||||
await page.screenshot({path: `debug/${keyword}.png`});
|
||||
}
|
||||
let html = await page.content();
|
||||
results[keyword] = parse_webcrawler_news_results(html, event.max_results);
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
||||
return results;
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
function parse_webcrawler_news_results(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('.article').each((i, link) => {
|
||||
let source = $(link).find('.source').text();
|
||||
let date = source.split(',')[1] || '';
|
||||
results.push({
|
||||
link: $(link).find('a').attr('href'),
|
||||
title: $(link).find('.title').text(),
|
||||
publisher: $(link).find('.source').text(),
|
||||
date: date,
|
||||
snippet: $(link).find('.description').text(),
|
||||
});
|
||||
});
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.rank = i+1;
|
||||
cleaned.push(res);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
results: cleaned
|
||||
}
|
||||
}
|
||||
InfospaceScraper: InfospaceScraper,
|
||||
WebcrawlerNewsScraper: WebcrawlerNewsScraper,
|
||||
};
|
@@ -81,6 +81,8 @@ module.exports = class Scraper {
*/
async scraping_loop() {

this.result_rank = 1;

for (let keyword of this.config.keywords) {
this.keyword = keyword;
this.results[keyword] = {};

@@ -121,7 +123,7 @@ module.exports = class Scraper {
break;
}

} while (page_num < event.num_pages);
} while (page_num <= event.num_pages);

} catch (e) {
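
The loop-condition change above controls how many result pages are visited per keyword. Assuming `page_num` starts at 1 and is incremented after each scraped page (the initialization is not shown in this hunk), the switch to `<=` makes the loop visit exactly `num_pages` pages; a rough sketch of that structure:

```js
// Rough sketch of the paging loop affected by the hunk above.
// Assumption (not visible in the hunk): page_num starts at 1 and is incremented per page.
let page_num = 1;
do {
    // ... scrape and parse the current results page, then navigate to the next one ...
    page_num += 1;
    // with '<' the loop stops one page early whenever num_pages > 1;
    // with '<=' it scrapes num_pages pages in total before moving to the next keyword
} while (page_num <= event.num_pages);
```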
@@ -1,5 +1,4 @@
const cheerio = require('cheerio');
const sfunctions = require('./functions.js');

module.exports = {
scrape_yahoo_finance_pup: scrape_yahoo_finance_pup,

@@ -7,9 +6,14 @@ module.exports = {
scrape_reuters_finance_pup: scrape_reuters_finance_pup,
scrape_cnbc_finance_pup: scrape_cnbc_finance_pup,
scrape_marketwatch_finance_pup: scrape_marketwatch_finance_pup,
not_implemented: undefined,
};

// https://www.google.com/search?q=MSFT&tbm=fin
function sleep(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms)
})
}

async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
var results = {};

@@ -40,7 +44,7 @@ async function scrape_yahoo_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}

await sfunctions.sleep(1000);
await sleep(1000);

let html = await page.content();
results[keyword] = parse(html);

@@ -90,7 +94,7 @@ async function scrape_marketwatch_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}

await sfunctions.sleep(500);
await sleep(500);

let newsData = await page.evaluate(() => {
let results = [];

@@ -150,7 +154,7 @@ async function scrape_bloomberg_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}

await sfunctions.sleep(1000);
await sleep(1000);

let news_items = await page.$x('//*[starts-with(@class,"newsItem")]');
for (let item of news_items) {

@@ -189,7 +193,7 @@ async function scrape_reuters_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}

await sfunctions.sleep(500);
await sleep(500);

let newsData = await page.evaluate(() => {
let results = [];

@@ -246,7 +250,7 @@ async function scrape_cnbc_finance_pup(page, event, context, pluggable) {
await page.screenshot({path: `debug/${keyword}.png`});
}

await sfunctions.sleep(500);
await sleep(500);

let newsData = await page.evaluate(() => {
let results = [];
@ -1,121 +1,105 @@
|
||||
const cheerio = require('cheerio');
|
||||
const sfunctions = require('./functions.js');
|
||||
const Scraper = require('./se_scraper');
|
||||
|
||||
module.exports = {
|
||||
scrape_youtube_pup: scrape_youtube_pup,
|
||||
};
|
||||
class YoutubeScraper extends Scraper {
|
||||
|
||||
const all_videos = new Set();
|
||||
parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
async function scrape_youtube_pup(page, event, context, pluggable) {
|
||||
await page.goto('https://www.youtube.com');
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('#video-title').attr('href'),
|
||||
title: $(link).find('#video-title').text(),
|
||||
snippet: $(link).find('#description-text').text(),
|
||||
channel: $(link).find('#byline a').text(),
|
||||
channel_link: $(link).find('#byline a').attr('href'),
|
||||
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
|
||||
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
|
||||
})
|
||||
});
|
||||
|
||||
try {
|
||||
await page.waitForSelector('input[id="search"]', { timeout: 5000 });
|
||||
} catch (e) {
|
||||
return results;
|
||||
}
|
||||
let no_results = this.no_results(
|
||||
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
|
||||
$('yt-showing-results-for-renderer').text()
|
||||
);
|
||||
|
||||
let keywords = event.keywords;
|
||||
var results = {};
|
||||
let effective_query = $('#corrected-link').text() || '';
|
||||
|
||||
// before we do anything, parse the results of the front page of youtube
|
||||
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
|
||||
await sfunctions.sleep(500);
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.title = res.title.trim();
|
||||
res.snippet = res.snippet.trim();
|
||||
res.rank = this.result_rank++;
|
||||
|
||||
let html = await page.content();
|
||||
results['__frontpage__'] = parse(html);
|
||||
|
||||
for (var i = 0; i < keywords.length; i++) {
|
||||
|
||||
keyword = keywords[i];
|
||||
|
||||
if (pluggable.before_keyword_scraped) {
|
||||
await pluggable.before_keyword_scraped({
|
||||
keyword: keyword,
|
||||
page: page,
|
||||
event: event,
|
||||
context: context,
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
const input = await page.$('input[id="search"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await input.focus();
|
||||
await page.keyboard.press("Enter");
|
||||
|
||||
if (event.sleep_range) {
|
||||
await sfunctions.random_sleep(event);
|
||||
}
|
||||
|
||||
await page.waitForFunction(`document.title.indexOf('${keyword}') !== -1`, { timeout: 5000 });
|
||||
await page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
|
||||
await sfunctions.sleep(500);
|
||||
|
||||
if (event.debug === true && event.is_local === true) {
|
||||
await page.screenshot({path: `debug/${keyword}.png`});
|
||||
// check if this result has been used before
|
||||
if (this.all_videos.has(res.title) === false) {
|
||||
cleaned.push(res);
|
||||
}
|
||||
this.all_videos.add(res.title);
|
||||
}
|
||||
}
|
||||
|
||||
let html = await page.content();
|
||||
results[keyword] = parse(html);
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Problem with scraping ${keyword}: ${e}`);
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: no_results,
|
||||
effective_query: effective_query,
|
||||
num_results: '',
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
async load_start_page() {
|
||||
try {
|
||||
this.all_videos = new Set();
|
||||
await this.page.goto('https://www.youtube.com', {
|
||||
referer: 'https://google.com'
|
||||
});
|
||||
await this.page.waitForSelector('input[id="search"]', { timeout: 5000 });
|
||||
// before we do anything, parse the results of the front page of youtube
|
||||
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 10000 });
|
||||
await this.sleep(500);
|
||||
let html = await this.page.content();
|
||||
this.results['frontpage'] = this.parse(html);
|
||||
this.result_rank = 1;
|
||||
} catch(e) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
async search_keyword(keyword) {
|
||||
const input = await this.page.$('input[id="search"]');
|
||||
// overwrites last text in input
|
||||
await input.click({ clickCount: 3 });
|
||||
await input.type(keyword);
|
||||
await input.focus();
|
||||
await this.page.keyboard.press("Enter");
|
||||
}
|
||||
|
||||
async next_page() {
|
||||
// youtube needs scrolling
|
||||
// TODO: implement scrolling, no priority right now
|
||||
return false;
|
||||
}
|
||||
|
||||
async wait_for_results() {
|
||||
await this.page.waitForFunction(`document.title.indexOf('${this.keyword}') !== -1`, { timeout: 5000 });
|
||||
await this.page.waitForSelector('ytd-video-renderer,ytd-grid-video-renderer', { timeout: 5000 });
|
||||
await this.sleep(500);
|
||||
}
|
||||
|
||||
async detected() {
|
||||
const title = await this.page.title();
|
||||
let html = await this.page.content();
|
||||
return html.indexOf('detected unusual traffic') !== -1 || title.indexOf('/sorry/') !== -1;
|
||||
}
|
||||
}
|
||||
|
||||
function parse(html) {
|
||||
// load the page source into cheerio
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// perform queries
|
||||
const results = [];
|
||||
$('#contents ytd-video-renderer,#contents ytd-grid-video-renderer').each((i, link) => {
|
||||
results.push({
|
||||
link: $(link).find('#video-title').attr('href'),
|
||||
title: $(link).find('#video-title').text(),
|
||||
snippet: $(link).find('#description-text').text(),
|
||||
channel: $(link).find('#byline a').text(),
|
||||
channel_link: $(link).find('#byline a').attr('href'),
|
||||
num_views: $(link).find('#metadata-line span:nth-child(1)').text(),
|
||||
release_date: $(link).find('#metadata-line span:nth-child(2)').text(),
|
||||
})
|
||||
});
|
||||
|
||||
let no_results = sfunctions.no_results(
|
||||
['No results found', 'Keine Ergebnisse', 'Es werden Ergebnisse angezeigt', 'Showing results for' ],
|
||||
$('yt-showing-results-for-renderer').text()
|
||||
);
|
||||
|
||||
let effective_query = $('#corrected-link').text() || '';
|
||||
|
||||
const cleaned = [];
|
||||
for (var i=0; i < results.length; i++) {
|
||||
let res = results[i];
|
||||
if (res.link && res.link.trim() && res.title && res.title.trim()) {
|
||||
res.title = res.title.trim();
|
||||
res.snippet = res.snippet.trim();
|
||||
res.rank = i+1;
|
||||
|
||||
// check if this result has been used before
|
||||
if (all_videos.has(res.title) === false) {
|
||||
cleaned.push(res);
|
||||
}
|
||||
all_videos.add(res.title);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
time: (new Date()).toUTCString(),
|
||||
no_results: no_results,
|
||||
effective_query: effective_query,
|
||||
num_results: '',
|
||||
results: cleaned,
|
||||
}
|
||||
}
|
||||
module.exports = {
|
||||
YoutubeScraper: YoutubeScraper,
|
||||
};
|
@@ -129,6 +129,8 @@ module.exports.handler = async function handler (event, context, callback) {
}
}

var results = {};

Scraper = {
google: google.GoogleScraper,
google_news_old: google.GoogleNewsOldScraper,

@@ -136,28 +138,32 @@ module.exports.handler = async function handler (event, context, callback) {
google_image: google.GoogleImageScraper,
bing: bing.BingScraper,
bing_news: bing.BingNewsScraper,
duckduckgo: duckduckgo.DuckduckgoScraper,
duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
infospace: infospace.InfospaceScraper,
webcrawler: infospace.WebcrawlerNewsScraper,
baidu: baidu.BaiduScraper,
youtube: youtube.YoutubeScraper,

yahoo_news: tickersearch.not_implemented,
bloomberg: tickersearch.not_implemented,
reuters: tickersearch.not_implemented,
cnbc: tickersearch.not_implemented,
marketwatch: tickersearch.not_implemented,

infospace: infospace.scrape_infospace_pup,
webcrawler: infospace.scrape_webcrawler_news_pup,
baidu: baidu.scrape_baidu_pup,
youtube: youtube.scrape_youtube_pup,
duckduckgo_news: duckduckgo.scrape_duckduckgo_news_pup,
google_dr: google.scrape_google_pup_dr,
yahoo_news: tickersearch.scrape_yahoo_finance_pup,
bloomberg: tickersearch.scrape_bloomberg_finance_pup,
reuters: tickersearch.scrape_reuters_finance_pup,
cnbc: tickersearch.scrape_cnbc_finance_pup,
marketwatch: tickersearch.scrape_marketwatch_finance_pup,
}[config.search_engine];

let scraper = new Scraper({
browser: browser,
config: config,
context: context,
pluggable: pluggable,
});

let results = await scraper.run();
if (Scraper === undefined) {
console.info('Currently not implemented search_engine: ', config.search_engine);
} else {
let scraper = new Scraper({
browser: browser,
config: config,
context: context,
pluggable: pluggable,
});
var results = await scraper.run();
}

if (pluggable.close_browser) {
await pluggable.close_browser();
@ -1,5 +1,4 @@
|
||||
const handler = require('./../src/node_scraper.js');
|
||||
|
||||
var assert = require('chai').assert;
|
||||
|
||||
/*
|
||||
@ -13,22 +12,26 @@ function sleep(ms) {
|
||||
})
|
||||
}
|
||||
|
||||
const search_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing', 'infospace', 'baidu'];
|
||||
const search_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing', 'infospace', 'duckduckgo'];
|
||||
const keywords = ['news', 'weather'];
|
||||
|
||||
async function tests() {
|
||||
|
||||
const keywords = ['Google scraper NikolaiT', 'the idiot'];
|
||||
|
||||
event = {
|
||||
search_engine: 'google',
|
||||
compress: 'false',
|
||||
debug: 'false',
|
||||
verbose: 'false',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: keywords,
|
||||
keyword_file: '',
|
||||
num_pages: 1,
|
||||
headless: true,
|
||||
output_file: '',
|
||||
block_assets: true,
|
||||
user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
|
||||
random_user_agent: false,
|
||||
};
|
||||
|
||||
for (var i = 0; i < search_engines.length; i++) {
|
||||
se = search_engines[i];
|
||||
for (let se of search_engines) {
|
||||
console.log(`Testing ${se}...`);
|
||||
event.search_engine = se;
|
||||
await handler.handler(event, undefined, test_case);
|
||||
@ -47,8 +50,7 @@ async function no_results_test() {
|
||||
keywords: keywords,
|
||||
};
|
||||
|
||||
for (var i = 0; i < search_engines.length; i++) {
|
||||
se = search_engines[i];
|
||||
for (let se of search_engines) {
|
||||
console.log(`Testing ${se}...`);
|
||||
event.search_engine = se;
|
||||
await handler.handler(event, undefined, test_case_no_results);
|
||||
@ -61,19 +63,18 @@ async function effective_query_test() {
|
||||
const keywords = ['mount evverrest'];
|
||||
|
||||
event = {
|
||||
write_meta_data: 'true',
|
||||
write_meta_data: true,
|
||||
job_name: 'test-job',
|
||||
search_engine: '',
|
||||
compress: 'false',
|
||||
debug: 'false',
|
||||
verbose: 'false',
|
||||
compress: false,
|
||||
debug: false,
|
||||
verbose: false,
|
||||
keywords: keywords,
|
||||
};
|
||||
|
||||
const effective_query_engines = ['google', 'google_image', 'google_news', 'youtube', 'bing'];
|
||||
|
||||
for (var i = 0; i < effective_query_engines.length; i++) {
|
||||
se = effective_query_engines[i];
|
||||
for (let se of search_engines) {
|
||||
console.log(`Testing ${se}...`);
|
||||
event.search_engine = se;
|
||||
await handler.handler(event, undefined, test_case_effective_query);
|
||||
@ -90,27 +91,47 @@ function test_case(err, response) {
|
||||
assert.equal(response.headers['Content-Type'], 'text/json', 'content type is not text/json');
|
||||
assert.equal(response.statusCode, 200, 'status code must be 200');
|
||||
|
||||
for (key in response.results) {
|
||||
kw = response.results[key];
|
||||
// at least 6 results
|
||||
assert.isAtLeast(kw.results.length, 6, 'results must have at least 6 links');
|
||||
assert.equal(kw.no_results, false, 'no results should be false');
|
||||
assert.typeOf(kw.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(kw.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(kw.time), 'number', 'time should be a valid date');
|
||||
let total_rank = 1;
|
||||
|
||||
for (let res of kw.results) {
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
for (query in response.results) {
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
assert.containsAllKeys(response.results, keywords, 'not all keywords were scraped.');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
for (page_number in response.results[query]) {
|
||||
|
||||
assert.isNumber(parseInt(page_number), 'page_number must be numeric');
|
||||
|
||||
let obj = response.results[query][page_number];
|
||||
|
||||
assert.containsAllKeys(obj, ['results', 'time',], 'not all keys are in the object');
|
||||
|
||||
// at least 6 results
|
||||
assert.isAtLeast(obj.results.length, 6, 'results must have at least 6 SERP objects');
|
||||
// TODO: fix this
|
||||
// assert.equal(obj.no_results, false, 'no results should be false');
|
||||
assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
|
||||
assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
|
||||
assert.typeOf(Date.parse(obj.time), 'number', 'time should be a valid date');
|
||||
|
||||
for (let res of obj.results) {
|
||||
|
||||
assert.containsAllKeys(res, ['link', 'title', 'rank'], 'not all keys are in the SERP object');
|
||||
|
||||
assert.isOk(res.link, 'link must be ok');
|
||||
assert.typeOf(res.link, 'string', 'link must be string');
|
||||
assert.isAtLeast(res.link.length, 5, 'link must have at least 5 chars');
|
||||
|
||||
assert.isOk(res.title, 'title must be ok');
|
||||
assert.typeOf(res.title, 'string', 'title must be string');
|
||||
assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');
|
||||
|
||||
assert.isOk(res.snippet, 'snippet must be ok');
|
||||
assert.typeOf(res.snippet, 'string', 'snippet must be string');
|
||||
assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
|
||||
|
||||
assert.isNumber(res.rank, 'rank must be integer');
|
||||
assert.equal(res.rank, total_rank++, 'rank ist wrong');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|