web scraping with css selectors (#2725)

* first step of making selector

* wip

* wip tests working

* probably good enough for a first pass

* oops, missed something.

* and something else...

* grrrr version errors
This commit is contained in:
Darren Schroeder
2020-11-03 15:46:42 -06:00
committed by GitHub
parent b674cee9d2
commit 97f3671e2c
9 changed files with 501 additions and 3 deletions

View File

@ -0,0 +1,20 @@
[package]
authors = ["The Nu Project Contributors"]
description = "web scraping using css selector"
edition = "2018"
license = "MIT"
name = "nu_plugin_selector"
version = "0.22.0"
[lib]
doctest = false
[dependencies]
nu-errors = {version = "0.22.0", path = "../nu-errors"}
nu-plugin = {version = "0.22.0", path = "../nu-plugin"}
nu-protocol = {version = "0.22.0", path = "../nu-protocol"}
nu-source = {version = "0.22.0", path = "../nu-source"}
nipper = "0.1.8"
[dev-dependencies]
nu-test-support = {path = "../nu-test-support", version = "0.22.0"}

View File

@ -0,0 +1,4 @@
mod nu;
mod selector;
pub use selector::Selector;

View File

@ -0,0 +1,6 @@
use nu_plugin::serve_plugin;
use nu_plugin_selector::Selector;
/// Plugin binary entry point: builds a fresh `Selector` and hands it to
/// `serve_plugin`.
fn main() {
    let mut plugin = Selector::new();
    serve_plugin(&mut plugin);
}

View File

@ -0,0 +1,52 @@
use nu_errors::ShellError;
use nu_plugin::Plugin;
use nu_protocol::{
CallInfo, Primitive, ReturnSuccess, ReturnValue, Signature, SyntaxShape, UntaggedValue, Value,
};
use nu_source::TaggedItem;
use crate::{selector::begin_selector_query, Selector};
impl Plugin for Selector {
    /// Describes the `selector` command: a filter taking one required string
    /// argument, the selector query.
    fn config(&mut self) -> Result<Signature, ShellError> {
        let signature = Signature::build("selector")
            .desc("execute selector query on html/web")
            .required("query", SyntaxShape::String, "selector query")
            .filter();
        Ok(signature)
    }

    /// Stores the query (first positional argument) and the call's name tag
    /// on `self` for later use by `filter`.
    ///
    /// # Errors
    ///
    /// Returns a labeled error when no positional argument was passed, or
    /// propagates the error from converting the argument to a string.
    fn begin_filter(&mut self, call_info: CallInfo) -> Result<Vec<ReturnValue>, ShellError> {
        let tag = call_info.name_tag;
        let query = match call_info.args.nth(0) {
            Some(value) => value,
            None => {
                return Err(ShellError::labeled_error(
                    "selector query not passed",
                    "selector query not passed",
                    &tag,
                ))
            }
        };
        self.query = query.as_string()?;
        self.tag = tag;
        Ok(Vec::new())
    }

    /// Runs the stored query against one pipeline value.
    ///
    /// String input is passed to `begin_selector_query` and every resulting
    /// value is wrapped with `ReturnSuccess::value`. Any other input yields
    /// an error labeled with both the command's tag and the input's tag.
    fn filter(&mut self, input: Value) -> Result<Vec<ReturnValue>, ShellError> {
        match input {
            Value {
                value: UntaggedValue::Primitive(Primitive::String(html)),
                ..
            } => {
                let query = self.query.as_str().tagged(&self.tag);
                let matches = begin_selector_query(html, query)?;
                Ok(matches.into_iter().map(ReturnSuccess::value).collect())
            }
            Value { tag: input_tag, .. } => Err(ShellError::labeled_error_with_secondary(
                "Expected text from pipeline",
                "requires text input",
                &self.tag,
                "value originates from here",
                input_tag,
            )),
        }
    }
}

View File

@ -0,0 +1,100 @@
use nipper::Document;
use nu_errors::ShellError;
use nu_protocol::{value::StringExt, Value};
use nu_source::{Tag, Tagged};
/// State held by the `selector` plugin between `begin_filter` and `filter`.
pub struct Selector {
    /// The selector query string; empty until `begin_filter` stores the
    /// first positional argument here.
    pub query: String,
    /// Tag of the command invocation; `begin_filter` stores the call's name
    /// tag here and `filter` uses it to label errors.
    pub tag: Tag,
}
impl Selector {
    /// Creates a selector with an empty query and an unknown tag.
    pub fn new() -> Selector {
        let query = String::new();
        let tag = Tag::unknown();
        Selector { query, tag }
    }
}
impl Default for Selector {
fn default() -> Self {
Self::new()
}
}
/// Splits the tagged query into an owned string and its tag, then runs it
/// against `raw` via `execute_selector_query`.
pub fn begin_selector_query(raw: String, query: Tagged<&str>) -> Result<Vec<Value>, ShellError> {
    let query_text = query.item.to_owned();
    let query_tag = query.tag();
    execute_selector_query(raw, query_text, query_tag)
}
/// Builds a `Document` from `input_string`, runs `query_string` through
/// `Document::nip`, and returns the text of each item yielded by the
/// selection as a string `Value`.
///
/// The supplied `tag` is converted but not otherwise used: every value is
/// produced with `to_string_value_create_tag`.
///
/// Each yielded item could also be queried further (for example with
/// `.select(..)` or `.attr(..)`) to build richer output; for now only its
/// text is returned.
fn execute_selector_query(
    input_string: String,
    query_string: String,
    tag: impl Into<Tag>,
) -> Result<Vec<Value>, ShellError> {
    let _tag: Tag = tag.into();
    let doc = Document::from(&input_string);

    let values: Vec<Value> = doc
        .nip(&query_string)
        .iter()
        .map(|node| node.text().to_string().to_string_value_create_tag())
        .collect();

    Ok(values)
}
#[cfg(test)]
mod tests {
    use nipper::Document;
    use nu_errors::ShellError;

    /// Parsing an HTML fragment and serializing it back yields the fragment
    /// wrapped in `html`/`head`/`body` elements.
    #[test]
    fn create_document_from_string() -> Result<(), ShellError> {
        let document = Document::from(r#"<div name="foo" value="bar"></div>"#);

        let expected = String::from(
            r#"<html><head></head><body><div name="foo" value="bar"></div></body></html>"#,
        );
        let rendered = document.html().to_string();

        assert_eq!(expected, rendered);
        Ok(())
    }

    /// After setting one attribute and removing another on a selection, the
    /// `value` attribute still reads back as `bar`.
    #[test]
    fn modify_html_document() -> Result<(), ShellError> {
        let document = Document::from(r#"<div name="foo" value="bar"></div>"#);

        let mut div = document.select(r#"div[name="foo"]"#);
        div.set_attr("id", "input");
        div.remove_attr("name");

        let value_attr = div.attr("value").unwrap().to_string();
        assert_eq!("bar".to_string(), value_attr);
        Ok(())
    }

    // Disabled sketch of a test that fetches a live page over HTTP.
    // #[test]
    // fn test_hacker_news() -> Result<(), ShellError> {
    // let html = reqwest::blocking::get("https://news.ycombinator.com")?.text()?;
    // let document = Document::from(&html);
    // let result = query(html, ".hnname a".to_string(), Tag::unknown());
    // let shouldbe = Ok(vec!["Hacker News".to_str_value_create_tag()]);
    // assert_eq!(shouldbe, result);
    // Ok(())
    // }
}