web scraping with css selectors (#2725)

* first step of making selector

* wip

* wip tests working

* probably good enough for a first pass

* oops, missed something.

* and something else...

* grrrr version errors
This commit is contained in:
Darren Schroeder 2020-11-03 15:46:42 -06:00 committed by GitHub
parent b674cee9d2
commit 97f3671e2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 501 additions and 3 deletions

297
Cargo.lock generated
View File

@ -981,6 +981,33 @@ dependencies = [
"generic-array 0.8.3",
]
[[package]]
name = "cssparser"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"matches",
"phf",
"proc-macro2",
"quote",
"smallvec 1.4.2",
"syn",
]
[[package]]
name = "cssparser-macros"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "csv"
version = "1.1.3"
@ -1096,6 +1123,17 @@ dependencies = [
"syn",
]
[[package]]
name = "derive_more"
version = "0.99.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "digest"
version = "0.6.2"
@ -1205,6 +1243,15 @@ version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "134951f4028bdadb9b84baf4232681efbf277da25144b9b0ad65df75946c422b"
[[package]]
name = "dtoa-short"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2"
dependencies = [
"dtoa",
]
[[package]]
name = "dtparse"
version = "1.2.0"
@ -1488,6 +1535,16 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
[[package]]
name = "futf"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.1.29"
@ -1704,6 +1761,15 @@ dependencies = [
"pin-project",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "gcc"
version = "0.3.55"
@ -2035,6 +2101,20 @@ dependencies = [
"sha1",
]
[[package]]
name = "html5ever"
version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b"
dependencies = [
"log 0.4.11",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "htmlescape"
version = "0.3.1"
@ -2566,6 +2646,12 @@ dependencies = [
"linked-hash-map 0.5.3",
]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "macaddr"
version = "1.0.1"
@ -2596,6 +2682,23 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "markup5ever"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
dependencies = [
"log 0.4.11",
"phf",
"phf_codegen",
"serde 1.0.115",
"serde_derive",
"serde_json",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "matches"
version = "0.1.8"
@ -2799,6 +2902,25 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "nipper"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "576d0e437aa08b447a207584463febe639d00b26b63121a9c038eff8371e0050"
dependencies = [
"cssparser",
"html5ever",
"markup5ever",
"selectors",
"tendril",
]
[[package]]
name = "nix"
version = "0.17.0"
@ -2885,6 +3007,7 @@ dependencies = [
"nu_plugin_post",
"nu_plugin_ps",
"nu_plugin_s3",
"nu_plugin_selector",
"nu_plugin_start",
"nu_plugin_sys",
"nu_plugin_textview",
@ -3288,6 +3411,18 @@ dependencies = [
"s3handler",
]
[[package]]
name = "nu_plugin_selector"
version = "0.22.0"
dependencies = [
"nipper",
"nu-errors",
"nu-plugin",
"nu-protocol",
"nu-source",
"nu-test-support",
]
[[package]]
name = "nu_plugin_start"
version = "0.22.0"
@ -3808,6 +3943,60 @@ dependencies = [
"indexmap",
]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_macros",
"phf_shared",
"proc-macro-hack",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
"rand 0.7.3",
]
[[package]]
name = "phf_macros"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro-hack",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "0.4.23"
@ -3896,6 +4085,12 @@ version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c36fa947111f5c62a733b652544dd0016a43ce89619538a8ef92724a6f501a20"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "pretty"
version = "0.5.2"
@ -4142,7 +4337,7 @@ dependencies = [
"rand_isaac",
"rand_jitter",
"rand_os",
"rand_pcg",
"rand_pcg 0.1.2",
"rand_xorshift",
"winapi 0.3.9",
]
@ -4158,6 +4353,7 @@ dependencies = [
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc 0.2.0",
"rand_pcg 0.2.1",
]
[[package]]
@ -4266,6 +4462,15 @@ dependencies = [
"rand_core 0.4.2",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rand_xorshift"
version = "0.1.1"
@ -4697,6 +4902,26 @@ dependencies = [
"libc",
]
[[package]]
name = "selectors"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
dependencies = [
"bitflags",
"cssparser",
"derive_more",
"fxhash",
"log 0.4.11",
"matches",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec 1.4.2",
"thin-slice",
]
[[package]]
name = "semver"
version = "0.9.0"
@ -4865,6 +5090,16 @@ dependencies = [
"yaml-rust",
]
[[package]]
name = "servo_arc"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
dependencies = [
"nodrop",
"stable_deref_trait",
]
[[package]]
name = "sha-1"
version = "0.8.2"
@ -4945,6 +5180,12 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
[[package]]
name = "slab"
version = "0.3.0"
@ -5022,6 +5263,12 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
@ -5043,6 +5290,31 @@ dependencies = [
"bytes 0.4.12",
]
[[package]]
name = "string_cache"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2940c75beb4e3bf3a494cef919a747a2cb81e52571e212bfbd185074add7208a"
dependencies = [
"lazy_static 1.4.0",
"new_debug_unreachable",
"phf_shared",
"precomputed-hash",
"serde 1.0.115",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]]
name = "strip-ansi-escapes"
version = "0.1.0"
@ -5167,6 +5439,17 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "tendril"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "term"
version = "0.6.1"
@ -5225,6 +5508,12 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "thin-slice"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]]
name = "thiserror"
version = "1.0.20"
@ -5718,6 +6007,12 @@ dependencies = [
"log 0.4.11",
]
[[package]]
name = "utf-8"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7"
[[package]]
name = "utf8-ranges"
version = "1.0.4"

View File

@ -44,6 +44,7 @@ nu_plugin_to_bson = {version = "0.22.0", path = "./crates/nu_plugin_to_bson", op
nu_plugin_to_sqlite = {version = "0.22.0", path = "./crates/nu_plugin_to_sqlite", optional = true}
nu_plugin_tree = {version = "0.22.0", path = "./crates/nu_plugin_tree", optional = true}
nu_plugin_xpath = {version = "0.22.0", path = "./crates/nu_plugin_xpath", optional = true}
nu_plugin_selector = {version = "0.22.0", path = "./crates/nu_plugin_selector", optional = true}
# Required to bootstrap the main binary
clap = "2.33.3"
@ -88,7 +89,7 @@ default = [
"fetch",
"rich-benchmark",
]
extra = ["default", "binaryview", "tree", "clipboard-cli", "trash-support", "start", "bson", "sqlite", "s3", "chart", "xpath"]
extra = ["default", "binaryview", "tree", "clipboard-cli", "trash-support", "start", "bson", "sqlite", "s3", "chart", "xpath", "selector"]
stable = ["default"]
wasi = ["inc", "match", "directories-support", "ptree-support", "match", "tree", "rustyline-support"]
@ -115,6 +116,7 @@ start = ["nu_plugin_start"]
trash-support = ["nu-cli/trash-support"]
tree = ["nu_plugin_tree"]
xpath = ["nu_plugin_xpath"]
selector = ["nu_plugin_selector"]
# Core plugins that ship with `cargo install nu` by default
# Currently, Cargo limits us to installing only one binary
@ -191,6 +193,11 @@ name = "nu_plugin_extra_xpath"
path = "src/plugins/nu_plugin_extra_xpath.rs"
required-features = ["xpath"]
[[bin]]
name = "nu_plugin_extra_selector"
path = "src/plugins/nu_plugin_extra_selector.rs"
required-features = ["selector"]
[[bin]]
name = "nu_plugin_extra_from_bson"
path = "src/plugins/nu_plugin_extra_from_bson.rs"

View File

@ -0,0 +1,20 @@
[package]
authors = ["The Nu Project Contributors"]
description = "web scraping using css selector"
edition = "2018"
license = "MIT"
name = "nu_plugin_selector"
version = "0.22.0"
[lib]
doctest = false
[dependencies]
nu-errors = {version = "0.22.0", path = "../nu-errors"}
nu-plugin = {version = "0.22.0", path = "../nu-plugin"}
nu-protocol = {version = "0.22.0", path = "../nu-protocol"}
nu-source = {version = "0.22.0", path = "../nu-source"}
nipper = "0.1.8"
[dev-dependencies]
nu-test-support = {path = "../nu-test-support", version = "0.22.0"}

View File

@ -0,0 +1,4 @@
// Crate layout: two private modules, `nu` and `selector`. `Selector` is
// re-exported at the crate root so that binaries can write
// `use nu_plugin_selector::Selector`.
mod nu;
mod selector;
pub use selector::Selector;

View File

@ -0,0 +1,6 @@
use nu_plugin::serve_plugin;
use nu_plugin_selector::Selector;
/// Entry point: builds a fresh `Selector` and hands it to `serve_plugin`.
fn main() {
    let mut plugin = Selector::new();
    serve_plugin(&mut plugin);
}

View File

@ -0,0 +1,52 @@
use nu_errors::ShellError;
use nu_plugin::Plugin;
use nu_protocol::{
CallInfo, Primitive, ReturnSuccess, ReturnValue, Signature, SyntaxShape, UntaggedValue, Value,
};
use nu_source::TaggedItem;
use crate::{selector::begin_selector_query, Selector};
/// `Plugin` implementation that exposes `Selector` as the `selector` filter.
impl Plugin for Selector {
    /// Builds the command signature: a filter named `selector` that takes one
    /// required string argument, `query`.
    fn config(&mut self) -> Result<Signature, ShellError> {
        let signature = Signature::build("selector")
            .desc("execute selector query on html/web")
            .required("query", SyntaxShape::String, "selector query")
            .filter();
        Ok(signature)
    }

    /// Reads the first positional argument as the query string and stores it,
    /// together with the command's name tag, on `self`.
    ///
    /// Returns a labeled error when no argument was passed, and propagates the
    /// error from `as_string` when the argument cannot be read as a string.
    /// Produces no output values of its own.
    fn begin_filter(&mut self, call_info: CallInfo) -> Result<Vec<ReturnValue>, ShellError> {
        let tag = call_info.name_tag;

        let query = match call_info.args.nth(0) {
            Some(value) => value,
            None => {
                return Err(ShellError::labeled_error(
                    "selector query not passed",
                    "selector query not passed",
                    &tag,
                ));
            }
        };

        self.query = query.as_string()?;
        self.tag = tag;
        Ok(vec![])
    }

    /// Runs the stored query against one pipeline value.
    ///
    /// String input is passed to `begin_selector_query` and every resulting
    /// value is wrapped with `ReturnSuccess::value`. Any other input is
    /// rejected with an error that points at both the command tag and the
    /// offending value's tag.
    fn filter(&mut self, input: Value) -> Result<Vec<ReturnValue>, ShellError> {
        let html = match input {
            Value {
                value: UntaggedValue::Primitive(Primitive::String(text)),
                ..
            } => text,
            Value { tag, .. } => {
                return Err(ShellError::labeled_error_with_secondary(
                    "Expected text from pipeline",
                    "requires text input",
                    &self.tag,
                    "value originates from here",
                    tag,
                ));
            }
        };

        let query = self.query.as_str().tagged(&self.tag);
        let matches = begin_selector_query(html, query)?;
        Ok(matches.into_iter().map(ReturnSuccess::value).collect())
    }
}

View File

@ -0,0 +1,100 @@
use nipper::Document;
use nu_errors::ShellError;
use nu_protocol::{value::StringExt, Value};
use nu_source::{Tag, Tagged};
/// State held by the `selector` plugin.
pub struct Selector {
    /// Query string; `Selector::new` initializes it to an empty string.
    pub query: String,
    /// Tag kept alongside the query; `Selector::new` initializes it to
    /// `Tag::unknown()`.
    pub tag: Tag,
}
impl Selector {
    /// Creates a `Selector` with an empty query and an unknown tag.
    pub fn new() -> Selector {
        let query = String::new();
        let tag = Tag::unknown();
        Self { query, tag }
    }
}
impl Default for Selector {
fn default() -> Self {
Self::new()
}
}
/// Public entry point for running a selector query.
///
/// Splits the tagged query into an owned query string and its tag, then
/// forwards both, together with the raw input text, to
/// `execute_selector_query`.
pub fn begin_selector_query(raw: String, query: Tagged<&str>) -> Result<Vec<Value>, ShellError> {
    let selector = query.item.to_string();
    let tag = query.tag();
    execute_selector_query(raw, selector, tag)
}
/// Parses `input_string` into a `Document`, applies `query_string` with
/// `nip`, and returns the text of every item yielded by the selection as a
/// string `Value`.
///
/// The `tag` argument is converted but not attached to the returned values;
/// each value gets its tag from `to_string_value_create_tag`.
///
/// Nested extraction is also possible on each item, e.g. calling
/// `.select(".title a").text()` or `.select(".storylink").attr("href")` on
/// the items of `doc.nip("tr.athing")`; only the plain text form is
/// implemented here.
fn execute_selector_query(
    input_string: String,
    query_string: String,
    tag: impl Into<Tag>,
) -> Result<Vec<Value>, ShellError> {
    // Converted for parity with the signature; currently unused.
    let _tag: Tag = tag.into();

    let doc = Document::from(&input_string);

    // Collected into a local first so the selection borrowed from `doc` is
    // released before `doc` goes out of scope.
    let values: Vec<Value> = doc
        .nip(&query_string)
        .iter()
        .map(|node| node.text().to_string().to_string_value_create_tag())
        .collect();

    Ok(values)
}
#[cfg(test)]
mod tests {
    use nipper::Document;
    use nu_errors::ShellError;

    /// A bare `<div>` fragment should render back wrapped in
    /// `<html><head></head><body>…</body></html>`.
    #[test]
    fn create_document_from_string() -> Result<(), ShellError> {
        let fragment = r#"<div name="foo" value="bar"></div>"#;
        let expected =
            r#"<html><head></head><body><div name="foo" value="bar"></div></body></html>"#;

        let rendered = Document::from(fragment).html().to_string();

        assert_eq!(expected.to_string(), rendered);
        Ok(())
    }

    /// After adding an `id` attribute and removing `name`, the `value`
    /// attribute of the selected element should still read `bar`.
    #[test]
    fn modify_html_document() -> Result<(), ShellError> {
        let fragment = r#"<div name="foo" value="bar"></div>"#;
        let document = Document::from(fragment);

        let mut div = document.select(r#"div[name="foo"]"#);
        div.set_attr("id", "input");
        div.remove_attr("name");

        let expected = "bar".to_string();
        let actual = div.attr("value").unwrap().to_string();
        assert_eq!(expected, actual);
        Ok(())
    }

    // Disabled test kept for reference (it fetches a live web page).
    // #[test]
    // fn test_hacker_news() -> Result<(), ShellError> {
    //     let html = reqwest::blocking::get("https://news.ycombinator.com")?.text()?;
    //     let document = Document::from(&html);
    //     let result = query(html, ".hnname a".to_string(), Tag::unknown());
    //     let shouldbe = Ok(vec!["Hacker News".to_str_value_create_tag()]);
    //     assert_eq!(shouldbe, result);
    //     Ok(())
    // }
}

View File

@ -0,0 +1,6 @@
use nu_plugin::serve_plugin;
use nu_plugin_selector::Selector;
/// Entry point: builds a fresh `Selector` and hands it to `serve_plugin`.
fn main() {
    let mut plugin = Selector::new();
    serve_plugin(&mut plugin);
}

View File

@ -272,7 +272,14 @@
Source='target\$(var.Profile)\nu_plugin_to_sqlite.exe'
KeyPath='yes'/>
</Component>
<Component Id='binary23' Guid='*' Win64='$(var.Win64)'>
<File
Id='exe23'
Name='nu_plugin_selector.exe'
DiskId='1'
Source='target\$(var.Profile)\nu_plugin_selector.exe'
KeyPath='yes'/>
</Component>
</Directory>
</Directory>
</Directory>
@ -316,6 +323,7 @@
<ComponentRef Id='binary20'/>
<ComponentRef Id='binary21'/>
<ComponentRef Id='binary22'/>
<ComponentRef Id='binary23'/>
<Feature
Id='Environment'