resolved issue #30, custom scrapers now possible. new npm version

commit 43d5732de7 (parent 06d500f75c)
README.md
@@ -13,6 +13,7 @@ If you don't have much technical experience or don't want to purchase proxies, y
 - [Minimal Example](#minimal-example)
 - [Quickstart](#quickstart)
 - [Using Proxies](#proxies)
+- [Custom Scrapers](#custom-scrapers)
 - [Examples](#examples)
 - [Scraping Model](#scraping-model)
 - [Technical Notes](#technical-notes)
@@ -170,6 +171,14 @@ socks4://51.11.23.22:22222

 This will scrape with **three** browser instances, each having its own IP address. Unfortunately, it is currently not possible to scrape with different proxies per tab; Chromium does not support that.

+## Custom Scrapers
+
+You can define your own scraper class and use it within se-scraper.
+
+[Check out this example](examples/custom_scraper.js), which defines a custom scraper for Ecosia.
+
 ## Examples

 * [Reuse existing browser](examples/multiple_search_engines.js) yields [these results](examples/results/multiple_search_engines.json)
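In code, the new README section boils down to subclassing `se_scraper.Scraper` and passing the class itself (rather than an engine name string) as `search_engine`. A minimal sketch of that pattern, with `MySearchEngineScraper` as a hypothetical subclass name:

```js
const se_scraper = require('se-scraper');

class MySearchEngineScraper extends se_scraper.Scraper {
    // override load_start_page(), search_keyword(), next_page(),
    // parse_async(), wait_for_results() and detected() as needed
}

(async () => {
    // pass the class itself instead of a string like 'google'
    var results = await se_scraper.scrape({headless: true}, {
        search_engine: MySearchEngineScraper,
        keywords: ['example'],
        num_pages: 1,
    });
    console.dir(results, {depth: null, colors: true});
})();
```

The new example file below shows the full version of this for Ecosia.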
examples/custom_scraper.js (new file, 119 lines)

const se_scraper = require('./../index.js');

/*
 * This example shows how you can define your own scraper class and use it
 * within se-scraper.
 */
class EcosiaScraper extends se_scraper.Scraper {

    constructor(...args) {
        super(...args);
    }

    async parse_async(html) {
        // In this example we use vanilla JavaScript to parse the
        // interesting information out of the search engine results.
        // You may also use an external library such as cheerio.
        return await this.page.evaluate(() => {
            var results = {
                num_results: '',
                no_results: false,
                effective_query: '',
                results: [],
            };

            document.querySelectorAll('.results .result').forEach((result) => {
                var serp = {};
                var title = result.querySelector('.result-title');
                if (title) {
                    serp.title = title.innerText;
                    serp.link = title.getAttribute('href');
                }

                var green = result.querySelector('.result-url');
                if (green) {
                    serp.green = green.getAttribute('href');
                }

                var snippet = result.querySelector('.result-snippet');
                if (snippet) {
                    serp.snippet = snippet.innerText;
                }

                results.results.push(serp);
            });

            var num_res = document.querySelector('.card-title-result-count');
            if (num_res) {
                results.num_results = num_res.innerText;
            }

            results.no_results = document.querySelector('.empty-result') != null;

            var effective = document.querySelector('.query-context-text .result-title');
            if (effective) {
                results.effective_query = effective.innerText;
            }

            return results;
        });
    }

    async load_start_page() {
        let startUrl = 'https://www.ecosia.org/';

        await this.page.goto(startUrl);

        try {
            await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
        } catch (e) {
            return false;
        }

        return true;
    }

    async search_keyword(keyword) {
        const input = await this.page.$('input[name="q"]');
        await this.set_input_value(`input[name="q"]`, keyword);
        await this.sleep(50);
        await input.focus();
        await this.page.keyboard.press("Enter");
    }

    async next_page() {
        let next_page_link = await this.page.$('.pagination-next', {timeout: 1000});
        if (!next_page_link) {
            return false;
        }
        await next_page_link.click();

        return true;
    }

    async wait_for_results() {
        await this.page.waitForSelector('.results .result', { timeout: this.STANDARD_TIMEOUT });
    }

    async detected() {
        // check whether scraping was detected
    }
}

(async () => {
    let scrape_job = {
        search_engine: EcosiaScraper,
        keywords: ['lets go boys'],
        num_pages: 2,
    };

    var results = await se_scraper.scrape({headless: true}, scrape_job);

    console.dir(results, {depth: null, colors: true});
})();
@@ -17,7 +17,7 @@ const se_scraper = require('./../src/node_scraper.js');
 let scrape_job = {
     search_engine: 'google',
-    keywords: ['news', 'scrapeulous.com', 'incolumitas.com', 'i work too much', 'what to do?', 'javascript is hard'],
+    keywords: ['news', 'some stuff', 'i work too much', 'what to do?', 'javascript is hard'],
     num_pages: 1,
 };
index.js
@@ -1,8 +1,11 @@
 const se_scraper = require('./src/node_scraper.js');
+var Scraper = require('./src/modules/se_scraper');

-async function scrape(user_config, scrape_config) {
+async function scrape(browser_config, scrape_config) {
+    // scrape config overwrites the browser_config
+    Object.assign(browser_config, scrape_config);

-    var scraper = new se_scraper.ScrapeManager(user_config);
+    var scraper = new se_scraper.ScrapeManager(browser_config);

     await scraper.start();
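A note on the merge above: `Object.assign` copies `scrape_config`'s own enumerable keys into `browser_config` in place, so scrape-level settings win on any key collision. A quick illustration (values here are made up):

```js
let browser_config = { headless: true, num_pages: 1 };
let scrape_job = { num_pages: 2, keywords: ['test'] };

// keys of scrape_job are copied into browser_config;
// on a collision, scrape_job's value wins
Object.assign(browser_config, scrape_job);

// browser_config is now { headless: true, num_pages: 2, keywords: ['test'] }
```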
@@ -16,4 +19,5 @@ async function scrape(user_config, scrape_config) {
 module.exports = {
     scrape: scrape,
     ScrapeManager: se_scraper.ScrapeManager,
+    Scraper: Scraper,
 };
package.json
@@ -1,6 +1,6 @@
 {
   "name": "se-scraper",
-  "version": "1.3.2",
+  "version": "1.3.4",
   "description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
   "homepage": "https://scrapeulous.com/",
   "main": "index.js",
src/modules/duckduckgo.js
@@ -59,12 +59,16 @@ class DuckduckgoScraper extends Scraper {
     }

     async next_page() {
-        let next_page_link = await this.page.$('a.result--more__btn', {timeout: 5000});
+        let next_page_link = await this.page.$('.result.result--more', {timeout: 5000});
         if (!next_page_link) {
             return false;
         }
         await next_page_link.click();
-        await this.page.waitForNavigation();
+        try {
+            await this.page.waitForNavigation({timeout: 5000});
+        } catch(e) {
+            return false;
+        }

         return true;
     }
@@ -135,7 +139,11 @@ class DuckduckgoNewsScraper extends Scraper {
             return false;
         }
         await next_page_link.click();
-        await this.page.waitForNavigation();
+        try {
+            await this.page.waitForNavigation({timeout: 5000});
+        } catch(e) {
+            return false;
+        }

         return true;
     }
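Both `next_page()` fixes follow the same shape: a bounded `waitForNavigation` whose timeout is treated as "no more pages" rather than a fatal error that aborts the whole keyword loop. The general pattern, sketched as a standalone helper assuming a puppeteer `page` object:

```js
async function goToNextPage(page, selector) {
    let link = await page.$(selector);
    if (!link) {
        return false;
    }
    await link.click();
    try {
        // cap the wait so a dead "next" link can't stall scraping
        await page.waitForNavigation({timeout: 5000});
    } catch (e) {
        return false;  // treat a navigation timeout as the last page
    }
    return true;
}
```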
src/modules/se_scraper.js
@@ -179,7 +179,7 @@ module.exports = class Scraper {

         do {

-            log(this.config, 1, `${this.config.search_engine} scrapes keyword "${keyword}" on page ${page_num}`);
+            log(this.config, 1, `${this.config.search_engine_name} scrapes keyword "${keyword}" on page ${page_num}`);

             await this.wait_for_results();
@@ -210,10 +210,10 @@ module.exports = class Scraper {

         } catch (e) {

-            console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine}: ${e}`);
+            console.error(`Problem with scraping ${keyword} in search engine ${this.config.search_engine_name}: ${e}`);

             if (await this.detected() === true) {
-                console.error(`${this.config.search_engine} detected the scraping!`);
+                console.error(`${this.config.search_engine_name} detected the scraping!`);

                 if (this.config.is_local === true) {
                     await this.sleep(this.SOLVE_CAPTCHA_TIME);
src/node_scraper.js
@@ -32,26 +32,32 @@ function read_keywords_from_file(fname) {
     return kws;
 }

-function getScraper(searchEngine, args) {
-    return new {
-        google: google.GoogleScraper,
-        google_news_old: google.GoogleNewsOldScraper,
-        google_news: google.GoogleNewsScraper,
-        google_image: google.GoogleImageScraper,
-        bing: bing.BingScraper,
-        bing_news: bing.BingNewsScraper,
-        amazon: amazon.AmazonScraper,
-        duckduckgo: duckduckgo.DuckduckgoScraper,
-        duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
-        infospace: infospace.InfospaceScraper,
-        webcrawler: infospace.WebcrawlerNewsScraper,
-        baidu: baidu.BaiduScraper,
-        youtube: youtube.YoutubeScraper,
-        yahoo_news: tickersearch.YahooFinanceScraper,
-        reuters: tickersearch.ReutersFinanceScraper,
-        cnbc: tickersearch.CnbcFinanceScraper,
-        marketwatch: tickersearch.MarketwatchFinanceScraper,
-    }[searchEngine](args);
+function getScraper(search_engine, args) {
+    if (typeof search_engine === 'string') {
+        return new {
+            google: google.GoogleScraper,
+            google_news_old: google.GoogleNewsOldScraper,
+            google_news: google.GoogleNewsScraper,
+            google_image: google.GoogleImageScraper,
+            bing: bing.BingScraper,
+            bing_news: bing.BingNewsScraper,
+            amazon: amazon.AmazonScraper,
+            duckduckgo: duckduckgo.DuckduckgoScraper,
+            duckduckgo_news: duckduckgo.DuckduckgoNewsScraper,
+            infospace: infospace.InfospaceScraper,
+            webcrawler: infospace.WebcrawlerNewsScraper,
+            baidu: baidu.BaiduScraper,
+            youtube: youtube.YoutubeScraper,
+            yahoo_news: tickersearch.YahooFinanceScraper,
+            reuters: tickersearch.ReutersFinanceScraper,
+            cnbc: tickersearch.CnbcFinanceScraper,
+            marketwatch: tickersearch.MarketwatchFinanceScraper,
+        }[search_engine](args);
+    } else if (typeof search_engine === 'function') {
+        return new search_engine(args);
+    } else {
+        throw new Error(`search_engine must either be a string or a class (function)`);
+    }
 }
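With this change `getScraper` accepts either form, and both calls below yield a `Scraper` instance (`EcosiaScraper` here is the class from the new example file):

```js
// built-in engine, resolved through the string-to-class map
let ddg = getScraper('duckduckgo', args);

// custom engine, instantiated directly from the passed class
let ecosia = getScraper(EcosiaScraper, args);
```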
@@ -137,6 +143,8 @@ class ScrapeManager {

         this.config = parseEventData(this.config);

+        this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine;
+
         if (fs.existsSync(this.config.keyword_file)) {
             this.config.keywords = read_keywords_from_file(this.config.keyword_file);
         }
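This works because `typeof` yields `'function'` for an ES6 class, and `Function.prototype.name` yields its declared name, so log messages stay readable for both built-in and custom engines:

```js
class EcosiaScraper extends se_scraper.Scraper { }

typeof EcosiaScraper;   // 'function'
EcosiaScraper.name;     // 'EcosiaScraper'
typeof 'duckduckgo';    // 'string'
```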
@@ -304,7 +312,8 @@ class ScrapeManager {
         var startTime = Date.now();

         if (this.config.keywords && this.config.search_engine) {
-            log(this.config, 1, `[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
+            log(this.config, 1,
+                `[se-scraper] started at [${(new Date()).toUTCString()}] and scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`)
         }

         if (this.config.do_work && this.pluggable) {