diff --git a/Cargo.lock b/Cargo.lock index e2652a4178..aad9bdd990 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1117,6 +1117,36 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "curl" +version = "0.4.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e2161dd6eba090ff1594084e95fd67aeccf04382ffea77999ea94ed42ec67b6" +dependencies = [ + "curl-sys", + "libc", + "openssl-probe", + "openssl-sys", + "schannel", + "socket2", + "windows-sys 0.52.0", +] + +[[package]] +name = "curl-sys" +version = "0.4.73+curl-8.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "450ab250ecf17227c39afb9a2dd9261dc0035cb80f2612472fc0c4aac2dcb84d" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", + "windows-sys 0.52.0", +] + [[package]] name = "deranged" version = "0.3.11" @@ -1883,12 +1913,26 @@ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" dependencies = [ "log", "mac", - "markup5ever", + "markup5ever 0.11.0", "proc-macro2", "quote", "syn 1.0.109", ] +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever 0.12.1", + "proc-macro2", + "quote", + "syn 2.0.60", +] + [[package]] name = "http" version = "0.2.12" @@ -2552,6 +2596,32 @@ dependencies = [ "tendril", ] +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf 0.11.2", + "phf_codegen 0.11.2", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18" +dependencies = [ + "html5ever 0.27.0", + "markup5ever 0.12.1", + "tendril", + "xml5ever", +] + [[package]] name = "md-5" version = "0.10.6" @@ -3463,8 +3533,11 @@ dependencies = [ "nu-plugin", "nu-protocol", "scraper", + "serde", + "serde_json", "sxd-document", "sxd-xpath", + "webpage", ] [[package]] @@ -5283,7 +5356,7 @@ dependencies = [ "ahash 0.8.11", "cssparser", "ego-tree", - "html5ever", + "html5ever 0.26.0", "once_cell", "selectors", "tendril", @@ -6797,6 +6870,20 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "webpage" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70862efc041d46e6bbaa82bb9c34ae0596d090e86cbd14bd9e93b36ee6802eac" +dependencies = [ + "curl", + "html5ever 0.27.0", + "markup5ever_rcdom", + "serde", + "serde_json", + "url", +] + [[package]] name = "which" version = "6.0.1" @@ -7209,6 +7296,17 @@ dependencies = [ "rustix", ] +[[package]] +name = "xml5ever" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bbb26405d8e919bc1547a5aa9abc95cbfa438f04844f5fdd9dc7596b748bf69" +dependencies = [ + "log", + "mac", + "markup5ever 0.12.1", +] + [[package]] name = "xxhash-rust" version = "0.8.10" diff --git a/crates/nu_plugin_query/Cargo.toml b/crates/nu_plugin_query/Cargo.toml index 4a54e9bf60..551610e390 100644 --- a/crates/nu_plugin_query/Cargo.toml +++ b/crates/nu_plugin_query/Cargo.toml @@ -22,4 +22,7 @@ nu-protocol = { path = "../nu-protocol", version = "0.95.1" } gjson = "0.8" scraper = { default-features = false, version = "0.19" } sxd-document = "0.3" -sxd-xpath = "0.4" \ No newline at end of file +sxd-xpath = "0.4" +webpage = { version = "2.0.1", features = ["serde"] } +serde_json.workspace = true +serde.workspace = true diff --git a/crates/nu_plugin_query/src/lib.rs b/crates/nu_plugin_query/src/lib.rs index 8027c67493..555144d9c7 100644 --- 
a/crates/nu_plugin_query/src/lib.rs
+++ b/crates/nu_plugin_query/src/lib.rs
@@ -1,6 +1,7 @@
 mod query;
 mod query_json;
 mod query_web;
+mod query_webpage_info;
 mod query_xml;
 mod web_tables;
 
diff --git a/crates/nu_plugin_query/src/query.rs b/crates/nu_plugin_query/src/query.rs
index c22339ab4a..273fc858c6 100644
--- a/crates/nu_plugin_query/src/query.rs
+++ b/crates/nu_plugin_query/src/query.rs
@@ -1,4 +1,7 @@
-use crate::{query_json::QueryJson, query_web::QueryWeb, query_xml::QueryXml};
+use crate::{
+    query_json::QueryJson, query_web::QueryWeb, query_webpage_info::QueryWebpageInfo,
+    query_xml::QueryXml,
+};
 use nu_plugin::{EvaluatedCall, Plugin, PluginCommand, SimplePluginCommand};
 use nu_protocol::{Category, LabeledError, Signature, Value};
 
@@ -26,6 +29,7 @@ impl Plugin for Query {
             Box::new(QueryJson),
             Box::new(QueryXml),
             Box::new(QueryWeb),
+            Box::new(QueryWebpageInfo),
         ]
     }
 }
diff --git a/crates/nu_plugin_query/src/query_webpage_info.rs b/crates/nu_plugin_query/src/query_webpage_info.rs
new file mode 100644
index 0000000000..8fed37dcd3
--- /dev/null
+++ b/crates/nu_plugin_query/src/query_webpage_info.rs
@@ -0,0 +1,501 @@
+use nu_plugin::{EngineInterface, EvaluatedCall, SimplePluginCommand};
+use nu_protocol::{Category, Example, LabeledError, Record, Signature, Span, Type, Value};
+use serde::Serialize;
+
+use crate::Query;
+
+/// Plugin command `query webpage-info`: extracts page metadata from an HTML string.
+pub struct QueryWebpageInfo;
+
+impl SimplePluginCommand for QueryWebpageInfo {
+    type Plugin = Query;
+
+    fn name(&self) -> &str {
+        "query webpage-info"
+    }
+
+    fn usage(&self) -> &str {
+        "uses the webpage crate to extract info from html: title, description, language, links, RSS feeds, Opengraph, Schema.org, and more"
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build(self.name())
+            .input_output_type(Type::String, Type::record())
+            .category(Category::Network)
+    }
+
+    fn examples(&self) -> Vec<Example> {
+        web_examples()
+    }
+
+    /// Runs the extraction on string input; any other input type is a labeled error.
+    fn run(
+        &self,
+        _plugin: &Query,
+        _engine: &EngineInterface,
+        _call: &EvaluatedCall,
+        input: &Value,
+    ) -> Result<Value, LabeledError> {
+        let span = input.span();
+        match input {
+            Value::String { val, .. } => execute_webpage(val, span),
+            _ => Err(LabeledError::new("Requires text input")
+                .with_label("expected text from pipeline", span)),
+        }
+    }
+}
+
+/// Examples listed for `query webpage-info`.
+pub fn web_examples() -> Vec<Example<'static>> {
+    vec![Example {
+        example: "http get https://phoronix.com | query webpage-info",
+        description: "extract detailed info from phoronix.com website",
+        result: None,
+    }]
+}
+
+/// Parses `html` with the `webpage` crate and converts the result into a [`Value`].
+///
+/// # Errors
+///
+/// Returns a [`LabeledError`] pointing at `span` when parsing or conversion fails.
+fn execute_webpage(html: &str, span: Span) -> Result<Value, LabeledError> {
+    let info = webpage::HTML::from_string(html.to_string(), None)
+        .map_err(|e| LabeledError::new(e.to_string()).with_label("error parsing html", span))?;
+
+    to_value(info, span).map_err(|e| {
+        LabeledError::new(e.to_string()).with_label("error convert Value::Record", span)
+    })
+}
+
+// revive nu-serde sketch
+
+/// Converts any [`serde::Serialize`] value into a [`Value`] whose parts all carry `span`.
+///
+/// # Errors
+///
+/// Fails, for example, on a `u64` above `i64::MAX` or on a map key that is not a string.
+pub fn to_value<T>(value: T, span: Span) -> Result<Value, Error>
+where
+    T: Serialize,
+{
+    value.serialize(&ValueSerializer { span })
+}
+
+/// Serializer that builds a [`Value`], tagging everything it creates with `span`.
+struct ValueSerializer {
+    span: Span,
+}
+
+/// Collects map entries or struct fields into a [`Record`].
+struct MapSerializer<'a> {
+    record: Record,
+    serializer: &'a ValueSerializer,
+    /// Key stored by `serialize_key` until the matching `serialize_value` call.
+    current_key: Option<String>,
+}
+
+impl<'a> serde::Serializer for &'a ValueSerializer {
+    type Ok = Value;
+    type Error = Error;
+
+    type SerializeSeq = SeqSerializer<'a>;
+    type SerializeTuple = SeqSerializer<'a>;
+    type SerializeTupleStruct = SeqSerializer<'a>;
+    type SerializeTupleVariant = SeqSerializer<'a>;
+
+    type SerializeMap = MapSerializer<'a>;
+    type SerializeStruct = MapSerializer<'a>;
+    type SerializeStructVariant = MapSerializer<'a>;
+
+    fn serialize_bool(self, v: bool) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::bool(v, self.span))
+    }
+
+    fn serialize_i8(self, v: i8) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::int(v.into(), self.span))
+    }
+
+    fn serialize_i16(self, v: i16) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::int(v.into(), self.span))
+    }
+
+    fn serialize_i32(self, v: i32) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::int(v.into(), self.span))
+    }
+
+    fn serialize_i64(self, v: i64) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::int(v, self.span))
+    }
+
+    fn serialize_u8(self, v: u8) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::int(v.into(), self.span))
+    }
+
+    fn serialize_u16(self, v: u16) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::int(v.into(), self.span))
+    }
+
+    fn serialize_u32(self, v: u32) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::int(v.into(), self.span))
+    }
+
+    fn serialize_u64(self, v: u64) -> Result<Self::Ok, Self::Error> {
+        // `Value::int` takes an `i64`, so only values above `i64::MAX` are rejected.
+        i64::try_from(v)
+            .map(|v| Value::int(v, self.span))
+            .map_err(|_| Error::new("the numbers are too big"))
+    }
+
+    fn serialize_f32(self, v: f32) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::float(v.into(), self.span))
+    }
+
+    fn serialize_f64(self, v: f64) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::float(v, self.span))
+    }
+
+    fn serialize_char(self, v: char) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::string(v, self.span))
+    }
+
+    fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::string(v, self.span))
+    }
+
+    fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::binary(v, self.span))
+    }
+
+    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::nothing(self.span))
+    }
+
+    fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
+    where
+        T: Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
+        // TODO: is this OK?
+        Ok(Value::nothing(self.span))
+    }
+
+    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
+        // TODO: is this OK?
+        Ok(Value::nothing(self.span))
+    }
+
+    fn serialize_unit_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+    ) -> Result<Self::Ok, Self::Error> {
+        // NOTE: the variant name is discarded. TODO: is this OK?
+        Ok(Value::nothing(self.span))
+    }
+
+    fn serialize_newtype_struct<T: ?Sized>(
+        self,
+        _name: &'static str,
+        value: &T,
+    ) -> Result<Self::Ok, Self::Error>
+    where
+        T: Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_newtype_variant<T: ?Sized>(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        value: &T,
+    ) -> Result<Self::Ok, Self::Error>
+    where
+        T: Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
+        Ok(SeqSerializer::new(self))
+    }
+
+    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
+        Ok(SeqSerializer::new(self))
+    }
+
+    fn serialize_tuple_struct(
+        self,
+        _name: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
+        Ok(SeqSerializer::new(self))
+    }
+
+    fn serialize_tuple_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
+        Ok(SeqSerializer::new(self))
+    }
+
+    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
+        Ok(MapSerializer::new(self))
+    }
+
+    fn serialize_struct(
+        self,
+        _name: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeStruct, Self::Error> {
+        Ok(MapSerializer::new(self))
+    }
+
+    fn serialize_struct_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeStructVariant, Self::Error> {
+        Ok(MapSerializer::new(self))
+    }
+}
+
+/// Error produced while converting a serializable value into a [`Value`].
+pub struct Error {
+    message: String,
+}
+
+impl Error {
+    /// Builds an error from any displayable message.
+    pub fn new<T: std::fmt::Display>(msg: T) -> Self {
+        Error {
+            message: msg.to_string(),
+        }
+    }
+}
+
+impl serde::ser::Error for Error {
+    fn custom<T: std::fmt::Display>(msg: T) -> Self {
+        Error::new(msg)
+    }
+}
+
+impl std::fmt::Debug for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)
+    }
+}
+
+impl std::fmt::Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)
+    }
+}
+
+impl std::error::Error for Error {}
+
+//
+// maps
+impl<'a> MapSerializer<'a> {
+    fn new(serializer: &'a ValueSerializer) -> Self {
+        Self {
+            record: Record::new(),
+            current_key: None,
+            serializer,
+        }
+    }
+}
+
+impl<'a> serde::ser::SerializeStruct for MapSerializer<'a> {
+    type Ok = Value;
+    type Error = Error;
+
+    fn serialize_field<T: ?Sized>(
+        &mut self,
+        key: &'static str,
+        value: &T,
+    ) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        self.record
+            .insert(key.to_owned(), value.serialize(self.serializer)?);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::record(self.record, self.serializer.span))
+    }
+}
+
+impl<'a> serde::ser::SerializeMap for MapSerializer<'a> {
+    type Ok = Value;
+    type Error = Error;
+
+    fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        // Record keys are strings: serialize the key on its own and accept only a JSON string.
+        let key = match serde_json::to_value(key).map_err(Error::new)? {
+            serde_json::Value::String(key) => key,
+            _ => return Err(Error::new("key must be a string")),
+        };
+        self.current_key = Some(key);
+        Ok(())
+    }
+
+    fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        let key = self.current_key.take().ok_or_else(|| Error::new("key expected"))?;
+        self.record.insert(key, value.serialize(self.serializer)?);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::record(self.record, self.serializer.span))
+    }
+}
+
+impl<'a> serde::ser::SerializeStructVariant for MapSerializer<'a> {
+    type Ok = Value;
+    type Error = Error;
+
+    fn serialize_field<T: ?Sized>(
+        &mut self,
+        key: &'static str,
+        value: &T,
+    ) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        self.record
+            .insert(key.to_owned(), value.serialize(self.serializer)?);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::record(self.record, self.serializer.span))
+    }
+}
+
+//
+// sequences
+/// Collects sequence or tuple elements into a list [`Value`].
+struct SeqSerializer<'a> {
+    seq: Vec<Value>,
+    serializer: &'a ValueSerializer,
+}
+
+impl<'a> SeqSerializer<'a> {
+    fn new(serializer: &'a ValueSerializer) -> Self {
+        Self {
+            seq: Vec::new(),
+            serializer,
+        }
+    }
+}
+
+impl<'a> serde::ser::SerializeSeq for SeqSerializer<'a> {
+    type Ok = Value;
+    type Error = Error;
+
+    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        self.seq.push(value.serialize(self.serializer)?);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::list(self.seq, self.serializer.span))
+    }
+}
+
+impl<'a> serde::ser::SerializeTuple for SeqSerializer<'a> {
+    type Ok = Value;
+    type Error = Error;
+
+    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        self.seq.push(value.serialize(self.serializer)?);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::list(self.seq, self.serializer.span))
+    }
+}
+
+impl<'a> serde::ser::SerializeTupleStruct for SeqSerializer<'a> {
+    type Ok = Value;
+    type Error = Error;
+
+    fn serialize_field<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        self.seq.push(value.serialize(self.serializer)?);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::list(self.seq, self.serializer.span))
+    }
+}
+
+impl<'a> serde::ser::SerializeTupleVariant for SeqSerializer<'a> {
+    type Ok = Value;
+    type Error = Error;
+
+    fn serialize_field<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        self.seq.push(value.serialize(self.serializer)?);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(Value::list(self.seq, self.serializer.span))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const HTML: &str = r#"
+         <html><head><meta><title>My Title</title></head></html>
+     "#;
+
+    #[test]
+    fn test_basics() {
+        let info = execute_webpage(HTML, Span::test_data()).unwrap();
+        let record = info.as_record().unwrap();
+        assert_eq!(record.get("title").unwrap().as_str().unwrap(), "My Title");
+    }
+
+    #[test]
+    fn test_u64() {
+        let span = Span::test_data();
+        assert_eq!(to_value(42_u64, span).unwrap().as_int().unwrap(), 42);
+        assert!(to_value(u64::MAX, span).is_err());
+    }
+}