From 97f3671e2c649e9aef5a63ee4fd906bd37e6c371 Mon Sep 17 00:00:00 2001 From: Darren Schroeder <343840+fdncred@users.noreply.github.com> Date: Tue, 3 Nov 2020 15:46:42 -0600 Subject: [PATCH] web scraping with css selectors (#2725) * first step of making selector * wip * wip tests working * probably good enough for a first pass * oops, missed something. * and something else... * grrrr version errors --- Cargo.lock | 297 +++++++++++++++++++++- Cargo.toml | 9 +- crates/nu_plugin_selector/Cargo.toml | 20 ++ crates/nu_plugin_selector/src/lib.rs | 4 + crates/nu_plugin_selector/src/main.rs | 6 + crates/nu_plugin_selector/src/nu/mod.rs | 52 ++++ crates/nu_plugin_selector/src/selector.rs | 100 ++++++++ src/plugins/nu_plugin_extra_selector.rs | 6 + wix/main.wxs | 10 +- 9 files changed, 501 insertions(+), 3 deletions(-) create mode 100644 crates/nu_plugin_selector/Cargo.toml create mode 100644 crates/nu_plugin_selector/src/lib.rs create mode 100644 crates/nu_plugin_selector/src/main.rs create mode 100644 crates/nu_plugin_selector/src/nu/mod.rs create mode 100644 crates/nu_plugin_selector/src/selector.rs create mode 100644 src/plugins/nu_plugin_extra_selector.rs diff --git a/Cargo.lock b/Cargo.lock index f20a532ec..7ab26111a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -981,6 +981,33 @@ dependencies = [ "generic-array 0.8.3", ] +[[package]] +name = "cssparser" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "matches", + "phf", + "proc-macro2", + "quote", + "smallvec 1.4.2", + "syn", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "csv" version = "1.1.3" @@ -1096,6 +1123,17 @@ dependencies = [ "syn", ] +[[package]] +name = "derive_more" +version = "0.99.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "digest" version = "0.6.2" @@ -1205,6 +1243,15 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "134951f4028bdadb9b84baf4232681efbf277da25144b9b0ad65df75946c422b" +[[package]] +name = "dtoa-short" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2" +dependencies = [ + "dtoa", +] + [[package]] name = "dtparse" version = "1.2.0" @@ -1488,6 +1535,16 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" +[[package]] +name = "futf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.1.29" @@ -1704,6 +1761,15 @@ dependencies = [ "pin-project", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "gcc" version = "0.3.55" @@ -2035,6 +2101,20 @@ dependencies = [ "sha1", ] +[[package]] +name = "html5ever" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" +dependencies = [ + "log 0.4.11", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "htmlescape" version = "0.3.1" @@ -2566,6 +2646,12 @@ dependencies = [ "linked-hash-map 0.5.3", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "macaddr" version = "1.0.1" @@ -2596,6 +2682,23 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "markup5ever" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" +dependencies = [ + "log 0.4.11", + "phf", + "phf_codegen", + "serde 1.0.115", + "serde_derive", + "serde_json", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "matches" version = "0.1.8" @@ -2799,6 +2902,25 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "nipper" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576d0e437aa08b447a207584463febe639d00b26b63121a9c038eff8371e0050" +dependencies = [ + "cssparser", + "html5ever", + "markup5ever", + "selectors", + "tendril", +] + [[package]] name = "nix" version = "0.17.0" @@ -2885,6 +3007,7 @@ dependencies = [ "nu_plugin_post", "nu_plugin_ps", "nu_plugin_s3", + "nu_plugin_selector", "nu_plugin_start", "nu_plugin_sys", "nu_plugin_textview", @@ -3288,6 +3411,18 @@ dependencies = [ "s3handler", ] +[[package]] +name = "nu_plugin_selector" +version = "0.22.0" +dependencies = [ + "nipper", + "nu-errors", + "nu-plugin", + "nu-protocol", + "nu-source", + "nu-test-support", +] + [[package]] name = "nu_plugin_start" version = "0.22.0" @@ -3808,6 +3943,60 @@ dependencies = [ "indexmap", ] +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_macros", + "phf_shared", + "proc-macro-hack", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared", + "rand 0.7.3", +] + +[[package]] +name = "phf_macros" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "0.4.23" @@ -3896,6 +4085,12 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c36fa947111f5c62a733b652544dd0016a43ce89619538a8ef92724a6f501a20" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "pretty" version = "0.5.2" @@ -4142,7 +4337,7 @@ dependencies = [ "rand_isaac", "rand_jitter", "rand_os", - "rand_pcg", + "rand_pcg 0.1.2", "rand_xorshift", "winapi 0.3.9", ] @@ -4158,6 +4353,7 @@ dependencies = [ "rand_chacha 0.2.2", "rand_core 0.5.1", "rand_hc 0.2.0", + "rand_pcg 0.2.1", ] [[package]] @@ -4266,6 +4462,15 @@ dependencies = [ "rand_core 0.4.2", ] +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xorshift" version = "0.1.1" @@ -4697,6 +4902,26 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log 0.4.11", + "matches", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec 1.4.2", + "thin-slice", +] + [[package]] name = "semver" version = "0.9.0" @@ -4865,6 +5090,16 @@ dependencies = [ "yaml-rust", ] +[[package]] +name = "servo_arc" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +dependencies = [ + "nodrop", + "stable_deref_trait", +] + [[package]] name = "sha-1" version = "0.8.2" @@ -4945,6 +5180,12 @@ dependencies = [ "libc", ] +[[package]] +name = "siphasher" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" + [[package]] name = "slab" version = "0.3.0" @@ -5022,6 +5263,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -5043,6 +5290,31 @@ dependencies = [ "bytes 0.4.12", ] +[[package]] +name = "string_cache" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2940c75beb4e3bf3a494cef919a747a2cb81e52571e212bfbd185074add7208a" +dependencies = [ + "lazy_static 1.4.0", + "new_debug_unreachable", + "phf_shared", + "precomputed-hash", + "serde 1.0.115", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "strip-ansi-escapes" version = "0.1.0" @@ -5167,6 +5439,17 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "tendril" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "term" version = "0.6.1" @@ -5225,6 +5508,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "thin-slice" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + [[package]] name = "thiserror" version = "1.0.20" @@ -5718,6 +6007,12 @@ dependencies = [ "log 0.4.11", ] +[[package]] +name = "utf-8" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" + [[package]] name = "utf8-ranges" version = "1.0.4" diff --git a/Cargo.toml b/Cargo.toml index c6c1bb4b7..564c5000e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ nu_plugin_to_bson = {version = "0.22.0", path = "./crates/nu_plugin_to_bson", op nu_plugin_to_sqlite = {version = "0.22.0", path = "./crates/nu_plugin_to_sqlite", optional = true} nu_plugin_tree = {version = "0.22.0", path = "./crates/nu_plugin_tree", optional = true} nu_plugin_xpath = {version = "0.22.0", path = "./crates/nu_plugin_xpath", optional = true} +nu_plugin_selector = {version = "0.22.0", path = "./crates/nu_plugin_selector", optional = true} # Required to bootstrap the main binary clap = "2.33.3" @@ -88,7 +89,7 @@ default = [ "fetch", "rich-benchmark", ] -extra = ["default", "binaryview", "tree", "clipboard-cli", "trash-support", "start", "bson", "sqlite", "s3", "chart", "xpath"] +extra = ["default", "binaryview", "tree", "clipboard-cli", "trash-support", "start", "bson", "sqlite", "s3", "chart", "xpath", "selector"] stable = ["default"] wasi = ["inc", "match", "directories-support", "ptree-support", "match", "tree", "rustyline-support"] @@ -115,6 +116,7 @@ start = ["nu_plugin_start"] trash-support = ["nu-cli/trash-support"] tree = ["nu_plugin_tree"] xpath = ["nu_plugin_xpath"] +selector = ["nu_plugin_selector"] # Core plugins that ship with `cargo install nu` by default # Currently, Cargo limits us to installing only one binary @@ -191,6 +193,11 @@ name = "nu_plugin_extra_xpath" path = "src/plugins/nu_plugin_extra_xpath.rs" required-features = ["xpath"] +[[bin]] +name = "nu_plugin_extra_selector" +path = "src/plugins/nu_plugin_extra_selector.rs" +required-features = ["selector"] + [[bin]] name = "nu_plugin_extra_from_bson" path = "src/plugins/nu_plugin_extra_from_bson.rs" diff --git a/crates/nu_plugin_selector/Cargo.toml b/crates/nu_plugin_selector/Cargo.toml new file mode 100644 index 000000000..c54e6be08 --- /dev/null +++ b/crates/nu_plugin_selector/Cargo.toml @@ -0,0 +1,20 @@ +[package] +authors = ["The Nu Project Contributors"] +description = "web scraping using css selector" +edition = "2018" +license = "MIT" +name = "nu_plugin_selector" +version = "0.22.0" + +[lib] +doctest = false + +[dependencies] +nu-errors = {version = "0.22.0", path = "../nu-errors"} +nu-plugin = {version = "0.22.0", path = "../nu-plugin"} +nu-protocol = {version = "0.22.0", path = "../nu-protocol"} +nu-source = {version = "0.22.0", path = "../nu-source"} +nipper = "0.1.8" + +[dev-dependencies] +nu-test-support = {path = "../nu-test-support", version = "0.22.0"} diff --git a/crates/nu_plugin_selector/src/lib.rs b/crates/nu_plugin_selector/src/lib.rs new file mode 100644 index 000000000..4712609d6 --- /dev/null +++ b/crates/nu_plugin_selector/src/lib.rs @@ -0,0 +1,4 @@ +mod nu; +mod selector; + +pub use selector::Selector; diff --git a/crates/nu_plugin_selector/src/main.rs b/crates/nu_plugin_selector/src/main.rs new file mode 100644 index 000000000..3e1832f42 --- /dev/null +++ b/crates/nu_plugin_selector/src/main.rs @@ -0,0 +1,6 @@ +use nu_plugin::serve_plugin; +use nu_plugin_selector::Selector; + +fn main() { + serve_plugin(&mut Selector::new()); +} diff --git a/crates/nu_plugin_selector/src/nu/mod.rs b/crates/nu_plugin_selector/src/nu/mod.rs new file mode 100644 index 000000000..00b10ecda --- /dev/null +++ b/crates/nu_plugin_selector/src/nu/mod.rs @@ -0,0 +1,52 @@ +use nu_errors::ShellError; +use nu_plugin::Plugin; +use nu_protocol::{ + CallInfo, Primitive, ReturnSuccess, ReturnValue, Signature, SyntaxShape, UntaggedValue, Value, +}; +use nu_source::TaggedItem; + +use crate::{selector::begin_selector_query, Selector}; + +impl Plugin for Selector { + fn config(&mut self) -> Result { + Ok(Signature::build("selector") + .desc("execute selector query on html/web") + .required("query", SyntaxShape::String, "selector query") + .filter()) + } + + fn begin_filter(&mut self, call_info: CallInfo) -> Result, ShellError> { + let tag = call_info.name_tag; + let query = call_info.args.nth(0).ok_or_else(|| { + ShellError::labeled_error( + "selector query not passed", + "selector query not passed", + &tag, + ) + })?; + + self.query = query.as_string()?; + self.tag = tag; + + Ok(vec![]) + } + + fn filter(&mut self, input: Value) -> Result, ShellError> { + match input { + Value { + value: UntaggedValue::Primitive(Primitive::String(s)), + .. + } => Ok(begin_selector_query(s, (*self.query).tagged(&self.tag))? + .into_iter() + .map(ReturnSuccess::value) + .collect()), + Value { tag, .. } => Err(ShellError::labeled_error_with_secondary( + "Expected text from pipeline", + "requires text input", + &self.tag, + "value originates from here", + tag, + )), + } + } +} diff --git a/crates/nu_plugin_selector/src/selector.rs b/crates/nu_plugin_selector/src/selector.rs new file mode 100644 index 000000000..61fdeb0bc --- /dev/null +++ b/crates/nu_plugin_selector/src/selector.rs @@ -0,0 +1,100 @@ +use nipper::Document; +use nu_errors::ShellError; +use nu_protocol::{value::StringExt, Value}; +use nu_source::{Tag, Tagged}; + +pub struct Selector { + pub query: String, + pub tag: Tag, +} + +impl Selector { + pub fn new() -> Selector { + Selector { + query: String::new(), + tag: Tag::unknown(), + } + } +} + +impl Default for Selector { + fn default() -> Self { + Self::new() + } +} + +pub fn begin_selector_query(raw: String, query: Tagged<&str>) -> Result, ShellError> { + execute_selector_query(raw, query.item.to_string(), query.tag()) +} + +fn execute_selector_query( + input_string: String, + query_string: String, + tag: impl Into, +) -> Result, ShellError> { + let _tag = tag.into(); + let mut ret = vec![]; + let doc = Document::from(&input_string); + + // How to internally iterate + // doc.nip("tr.athing").iter().for_each(|athing| { + // let title = format!("{}", athing.select(".title a").text().to_string()); + // let href = athing + // .select(".storylink") + // .attr("href") + // .unwrap() + // .to_string(); + // let title_url = format!("{} - {}\n", title, href); + // ret.push(title_url.to_string_value_create_tag()); + // }); + + doc.nip(&query_string).iter().for_each(|athing| { + ret.push(athing.text().to_string().to_string_value_create_tag()); + }); + + Ok(ret) +} + +#[cfg(test)] +mod tests { + use nipper::Document; + use nu_errors::ShellError; + + #[test] + fn create_document_from_string() -> Result<(), ShellError> { + let html = r#"
"#; + let document = Document::from(html); + let shouldbe = + r#"
"#; + + assert_eq!(shouldbe.to_string(), document.html().to_string()); + + Ok(()) + } + + #[test] + fn modify_html_document() -> Result<(), ShellError> { + let html = r#"
"#; + let document = Document::from(html); + let mut input = document.select(r#"div[name="foo"]"#); + input.set_attr("id", "input"); + input.remove_attr("name"); + + let shouldbe = "bar".to_string(); + let actual = input.attr("value").unwrap().to_string(); + + assert_eq!(shouldbe, actual); + + Ok(()) + } + + // #[test] + // fn test_hacker_news() -> Result<(), ShellError> { + // let html = reqwest::blocking::get("https://news.ycombinator.com")?.text()?; + // let document = Document::from(&html); + // let result = query(html, ".hnname a".to_string(), Tag::unknown()); + // let shouldbe = Ok(vec!["Hacker News".to_str_value_create_tag()]); + // assert_eq!(shouldbe, result); + // Ok(()) + // } +} diff --git a/src/plugins/nu_plugin_extra_selector.rs b/src/plugins/nu_plugin_extra_selector.rs new file mode 100644 index 000000000..3e1832f42 --- /dev/null +++ b/src/plugins/nu_plugin_extra_selector.rs @@ -0,0 +1,6 @@ +use nu_plugin::serve_plugin; +use nu_plugin_selector::Selector; + +fn main() { + serve_plugin(&mut Selector::new()); +} diff --git a/wix/main.wxs b/wix/main.wxs index f80a7802c..988f45740 100644 --- a/wix/main.wxs +++ b/wix/main.wxs @@ -272,7 +272,14 @@ Source='target\$(var.Profile)\nu_plugin_to_sqlite.exe' KeyPath='yes'/> - + + + @@ -316,6 +323,7 @@ +