forked from extern/nushell
web scraping with css selectors (#2725)
* first step of making selector * wip * wip tests working * probably good enough for a first pass * oops, missed something. * and something else... * grrrr version errors
This commit is contained in:
20
crates/nu_plugin_selector/Cargo.toml
Normal file
20
crates/nu_plugin_selector/Cargo.toml
Normal file
@ -0,0 +1,20 @@
|
||||
[package]
|
||||
authors = ["The Nu Project Contributors"]
|
||||
description = "web scraping using css selector"
|
||||
edition = "2018"
|
||||
license = "MIT"
|
||||
name = "nu_plugin_selector"
|
||||
version = "0.22.0"
|
||||
|
||||
[lib]
|
||||
doctest = false
|
||||
|
||||
[dependencies]
|
||||
nu-errors = {version = "0.22.0", path = "../nu-errors"}
|
||||
nu-plugin = {version = "0.22.0", path = "../nu-plugin"}
|
||||
nu-protocol = {version = "0.22.0", path = "../nu-protocol"}
|
||||
nu-source = {version = "0.22.0", path = "../nu-source"}
|
||||
nipper = "0.1.8"
|
||||
|
||||
[dev-dependencies]
|
||||
nu-test-support = {path = "../nu-test-support", version = "0.22.0"}
|
4
crates/nu_plugin_selector/src/lib.rs
Normal file
4
crates/nu_plugin_selector/src/lib.rs
Normal file
@ -0,0 +1,4 @@
|
||||
mod nu;
|
||||
mod selector;
|
||||
|
||||
pub use selector::Selector;
|
6
crates/nu_plugin_selector/src/main.rs
Normal file
6
crates/nu_plugin_selector/src/main.rs
Normal file
@ -0,0 +1,6 @@
|
||||
use nu_plugin::serve_plugin;
|
||||
use nu_plugin_selector::Selector;
|
||||
|
||||
fn main() {
|
||||
serve_plugin(&mut Selector::new());
|
||||
}
|
52
crates/nu_plugin_selector/src/nu/mod.rs
Normal file
52
crates/nu_plugin_selector/src/nu/mod.rs
Normal file
@ -0,0 +1,52 @@
|
||||
use nu_errors::ShellError;
|
||||
use nu_plugin::Plugin;
|
||||
use nu_protocol::{
|
||||
CallInfo, Primitive, ReturnSuccess, ReturnValue, Signature, SyntaxShape, UntaggedValue, Value,
|
||||
};
|
||||
use nu_source::TaggedItem;
|
||||
|
||||
use crate::{selector::begin_selector_query, Selector};
|
||||
|
||||
impl Plugin for Selector {
|
||||
fn config(&mut self) -> Result<Signature, ShellError> {
|
||||
Ok(Signature::build("selector")
|
||||
.desc("execute selector query on html/web")
|
||||
.required("query", SyntaxShape::String, "selector query")
|
||||
.filter())
|
||||
}
|
||||
|
||||
fn begin_filter(&mut self, call_info: CallInfo) -> Result<Vec<ReturnValue>, ShellError> {
|
||||
let tag = call_info.name_tag;
|
||||
let query = call_info.args.nth(0).ok_or_else(|| {
|
||||
ShellError::labeled_error(
|
||||
"selector query not passed",
|
||||
"selector query not passed",
|
||||
&tag,
|
||||
)
|
||||
})?;
|
||||
|
||||
self.query = query.as_string()?;
|
||||
self.tag = tag;
|
||||
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
fn filter(&mut self, input: Value) -> Result<Vec<ReturnValue>, ShellError> {
|
||||
match input {
|
||||
Value {
|
||||
value: UntaggedValue::Primitive(Primitive::String(s)),
|
||||
..
|
||||
} => Ok(begin_selector_query(s, (*self.query).tagged(&self.tag))?
|
||||
.into_iter()
|
||||
.map(ReturnSuccess::value)
|
||||
.collect()),
|
||||
Value { tag, .. } => Err(ShellError::labeled_error_with_secondary(
|
||||
"Expected text from pipeline",
|
||||
"requires text input",
|
||||
&self.tag,
|
||||
"value originates from here",
|
||||
tag,
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
100
crates/nu_plugin_selector/src/selector.rs
Normal file
100
crates/nu_plugin_selector/src/selector.rs
Normal file
@ -0,0 +1,100 @@
|
||||
use nipper::Document;
|
||||
use nu_errors::ShellError;
|
||||
use nu_protocol::{value::StringExt, Value};
|
||||
use nu_source::{Tag, Tagged};
|
||||
|
||||
pub struct Selector {
|
||||
pub query: String,
|
||||
pub tag: Tag,
|
||||
}
|
||||
|
||||
impl Selector {
|
||||
pub fn new() -> Selector {
|
||||
Selector {
|
||||
query: String::new(),
|
||||
tag: Tag::unknown(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Selector {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn begin_selector_query(raw: String, query: Tagged<&str>) -> Result<Vec<Value>, ShellError> {
|
||||
execute_selector_query(raw, query.item.to_string(), query.tag())
|
||||
}
|
||||
|
||||
fn execute_selector_query(
|
||||
input_string: String,
|
||||
query_string: String,
|
||||
tag: impl Into<Tag>,
|
||||
) -> Result<Vec<Value>, ShellError> {
|
||||
let _tag = tag.into();
|
||||
let mut ret = vec![];
|
||||
let doc = Document::from(&input_string);
|
||||
|
||||
// How to internally iterate
|
||||
// doc.nip("tr.athing").iter().for_each(|athing| {
|
||||
// let title = format!("{}", athing.select(".title a").text().to_string());
|
||||
// let href = athing
|
||||
// .select(".storylink")
|
||||
// .attr("href")
|
||||
// .unwrap()
|
||||
// .to_string();
|
||||
// let title_url = format!("{} - {}\n", title, href);
|
||||
// ret.push(title_url.to_string_value_create_tag());
|
||||
// });
|
||||
|
||||
doc.nip(&query_string).iter().for_each(|athing| {
|
||||
ret.push(athing.text().to_string().to_string_value_create_tag());
|
||||
});
|
||||
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use nipper::Document;
    use nu_errors::ShellError;

    /// A bare `<div>` fragment is expected to render back wrapped in
    /// `<html><head></head><body>…</body></html>`.
    #[test]
    fn create_document_from_string() -> Result<(), ShellError> {
        let fragment = r#"<div name="foo" value="bar"></div>"#;
        let expected = String::from(
            r#"<html><head></head><body><div name="foo" value="bar"></div></body></html>"#,
        );

        let rendered = Document::from(fragment).html().to_string();

        assert_eq!(expected, rendered);

        Ok(())
    }

    /// After setting `id` and removing `name` on the selected `<div>`, its
    /// `value` attribute is still expected to read "bar".
    #[test]
    fn modify_html_document() -> Result<(), ShellError> {
        let document = Document::from(r#"<div name="foo" value="bar"></div>"#);
        let mut div = document.select(r#"div[name="foo"]"#);
        div.set_attr("id", "input");
        div.remove_attr("name");

        let value_attr = div.attr("value").unwrap().to_string();

        assert_eq!("bar".to_string(), value_attr);

        Ok(())
    }

    // NOTE: a test that fetched https://news.ycombinator.com over the network
    // and queried ".hnname a" (expecting "Hacker News") was left disabled here.
}
|
Reference in New Issue
Block a user