diff --git a/data.json b/data.json index cb7f202..04e0994 100644 --- a/data.json +++ b/data.json @@ -1 +1 @@ -{"scrapeulous.com":{"time":"Sun, 27 Jan 2019 18:07:33 GMT","num_results":"Ungefähr 101 Ergebnisse","no_results":false,"effective_query":"","results":[]}} \ No newline at end of file +{"incolumitas.com news":{"time":"Sun, 27 Jan 2019 19:07:41 GMT","num_results":"Ungefähr 691.000 Ergebnisse","no_results":false,"effective_query":"","results":[{"link":"/url?q=https://incolumitas.com/2018/10/29/youtube-puppeteer-scraping/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggUMAA&usg=AOvVaw1PmFDpPlIFYilxfQb1ym1W","title":"Coding, Learning and Business Ideas – Tutorial ... - Incolumitas","snippet":"29 Oct 2018 ... If you are a scientist, you might be interested in the spreading of fake news for \nexample. You want to monitor the sources of fake news.","visible_link":"","date":"","rank":1},{"link":"/url?q=https://incolumitas.com/2018/11/18/introduction-machine-learning-2019/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggaMAE&usg=AOvVaw37PW4eBuCGv2zkvXt0sZkw","title":"Coding, Learning and Business Ideas – Introduction to ... - Incolumitas","snippet":"18 Nov 2018 ... I want to recognize news articles soley based on one input: A link to the \ndocument. The algorithm should automatically recognize whether the ...","visible_link":"","date":"","rank":2},{"link":"/url?q=https://incolumitas.com/pages/about/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgggMAI&usg=AOvVaw2-XXZq6XPgh5orgEd_2rr6","title":"Coding, Learning and Business Ideas – About - Incolumitas","snippet":"As far as I know, this security vulnerability is still unfixed in late 2018. See my \noriginal blog post here. An news article from Heise and an article from ars \ntechnica.","visible_link":"","date":"","rank":3},{"link":"/url?q=https://incolumitas.com/uploads/2013/12/links.txt&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgglMAM&usg=AOvVaw0cAkft0dN7WcT6DCDbDYZM","title":"http://www.urbandictionary.com/define.php?term=holy%20shit http ...","snippet":"... http://www.tumblr.com/tagged/holy-shit http://pitchfork.com/news/52370-watch-\nholy-shit-perform-at-ryan-mcginley-gallery-opening-in-san-francisco/ ...","visible_link":"","date":"","rank":4},{"link":"/url?q=https://incolumitas.com/2014/11/12/scraping-and-extracting-links-from-any-major-search-engine-like-google-yandex-baidu-bing-and-duckduckgo/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggqMAQ&usg=AOvVaw0I45HMQSEjMeuKozexkwcI","title":"Coding, Learning and Business Ideas – Scraping and ... - Incolumitas","snippet":"12 Nov 2014 ... The supported search types. For instance, Google supports Video Search, Image \nSearch, News search search_types = [] def __init__(self, html, ...","visible_link":"","date":"","rank":5},{"link":"/url?q=https://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get-search-results/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFggwMAU&usg=AOvVaw39UsbJVwdj3lCe2hAGZlgq","title":"Coding, Learning and Business Ideas ... - Incolumitas","snippet":"18 Feb 2014 ... UPDATE on 18th February 2014: This python module has now its own github \nrepository! The plugin can extract All links Link titles The ...","visible_link":"","date":"","rank":6},{"link":"/url?q=https://www.npmjs.com/package/se-scraper&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg2MAY&usg=AOvVaw2qrTw_GOSWhggUYNgRdAPy","title":"se-scraper - npm","snippet":"3 days ago ... Google; Google News; Google News New (https://news.google.com) .... 'https://\nincolumitas.com/2018/10/29/youtube-puppeteer-scraping/',. title:.","visible_link":"","date":"","rank":7},{"link":"/url?q=https://pypi.org/project/CountryGoogleScraper/&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFgg8MAc&usg=AOvVaw039KcMaYiUHgES64a5tW9F","title":"CountryGoogleScraper · PyPI","snippet":"News GoogleScraper becomes **finally mature!** In the last months I didn't ..... [4\n]: http://incolumitas.com/2013/01/06/googlesearch-a-rapid-python-class-to-get- ...","visible_link":"","date":"","rank":8},{"link":"/url?q=https://pycoders.com/issues/69&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghBMAg&usg=AOvVaw1sJXhzn4R2ihjximQOaA_W","title":"PyCoder's Weekly | Issue #69","snippet":"7 Jun 2013 ... To the keep up with all the breaking Python news follow @pycoders. Support us \non Gittip -- ... News and Developments. ... (incolumitas.com).","visible_link":"","date":"","rank":9},{"link":"/url?q=https://news.ycombinator.com/item%3Fid%3D11925325&sa=U&ved=0ahUKEwi0hLex1I7gAhUImbQKHZQQD7cQFghHMAk&usg=AOvVaw1F_732iyOahhNsWaJyvYM2","title":"Typosquatting in Programming Language Package ... - Hacker News","snippet":"Typosquatting in Programming Language Package Managers [pdf] (incolumitas.\ncom). 3 points by henry_flower on June 17, 2016 | hide | past | web | favorite ...","visible_link":"","date":"","rank":10}]}} \ No newline at end of file diff --git a/examples/pluggable.js b/examples/pluggable.js index 3fda333..aea2e9b 100644 --- a/examples/pluggable.js +++ b/examples/pluggable.js @@ -27,7 +27,11 @@ module.exports = class Pluggable { } async handle_metadata(args) { - // silence + // store scraping metadata somewhere + } + + async handle_results(args) { + // store the results somewhere } async start_browser(args={}) { @@ -35,11 +39,16 @@ module.exports = class Pluggable { let launch_args = { args: args.chromeFlags || this.chromeFlags, - headless: args.headless || this.headless, + headless: args.headless, }; + if (launch_args.headless === undefined) { + launch_args.headless = this.headless; + } + this.browser = await puppeteer.launch(launch_args); console.log('Loaded custom function get_browser()'); + console.log(launch_args); return this.browser; } diff --git a/index.js b/index.js index cb85a76..2904cd3 100644 --- a/index.js +++ b/index.js @@ -33,7 +33,7 @@ exports.scrape = async function(config, callback) { // path to js module that extends functionality // this module should export the functions: // get_browser, handle_metadata, close_browser - custom_func: 'examples/pluggable.js', + custom_func: '', }; for (var key in config) { diff --git a/package.json b/package.json index 5a9942f..507ccd7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "se-scraper", - "version": "1.1.1", + "version": "1.1.3", "description": "A simple module which uses puppeteer to scrape several search engines.", "homepage": "https://scrapeulous.com/", "main": "index.js", diff --git a/run.js b/run.js index a2ca59e..338d239 100644 --- a/run.js +++ b/run.js @@ -5,7 +5,7 @@ let config = { // the user agent to scrape with user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', // if random_user_agent is set to True, a random user agent is chosen - random_user_agent: false, + random_user_agent: true, // get meta data of scraping in return object write_meta_data: false, // how long to sleep between requests. a random sleep interval within the range [a,b] @@ -20,7 +20,7 @@ let config = { // this output is informational verbose: false, // an array of keywords to scrape - keywords: ['scrapeulous.com', ], + keywords: ['incolumitas.com news', ], // alternatively you can specify a keyword_file. this overwrites the keywords array keyword_file: '', // whether to start the browser in headless mode @@ -38,7 +38,7 @@ let config = { custom_func: resolve('examples/pluggable.js'), }; -se_scraper.scrape(config, (err, response) => { +function callback(err, response) { if (err) { console.error(err) } /* response object has the following properties: @@ -49,4 +49,6 @@ se_scraper.scrape(config, (err, response) => { */ console.dir(response.results, {depth: null, colors: true}); -}); +} + +se_scraper.scrape(config, callback); diff --git a/src/modules/google.js b/src/modules/google.js index 637b25d..1c2a966 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -47,7 +47,7 @@ async function scrape_google_pup(page, event, context) { } await page.waitForSelector('#center_col', { timeout: STANDARD_TIMEOUT }); - await sfunctions.sleep(100); + await sfunctions.sleep(500); } catch (e) { console.error(`Problem with scraping ${keyword}.`); @@ -175,7 +175,7 @@ function parse_google_results(html) { $('#center_col .g').each((i, link) => { results.push({ link: $(link).find('.r a').attr('href'), - title: $(link).find('.r a h3').text(), + title: $(link).find('.r a').text(), snippet: $(link).find('span.st').text(), visible_link: $(link).find('.r cite').text(), date: $(link).find('span.f').text() || '', diff --git a/src/node_scraper.js b/src/node_scraper.js index 4584623..a7751ef 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -68,7 +68,7 @@ module.exports.handler = async function handler (config, context, callback) { let launch_args = { args: ADDITIONAL_CHROME_FLAGS, - headless: config.headless !== false, + headless: config.headless, }; if (config.debug === true) { @@ -149,6 +149,13 @@ module.exports.handler = async function handler (config, context, callback) { results = zlib.deflateSync(results).toString('base64'); } + if (pluggable && pluggable.handle_results) { + await pluggable.handle_results({ + config: config, + results: results, + }); + } + if (config.write_meta_data === true) { metadata.id = `${config.job_name} ${config.chunk_lines}`; metadata.chunk_lines = config.chunk_lines;