mirror of
https://github.com/nushell/nushell.git
synced 2025-05-05 18:44:24 +02:00
232 lines
6.8 KiB
Rust
232 lines
6.8 KiB
Rust
use crate::Table;
|
|
use nu_protocol::{value::StringExt, Primitive, TaggedDictBuilder, UntaggedValue, Value};
|
|
use nu_source::Tag;
|
|
use scraper::{Html, Selector as ScraperSelector};
|
|
|
|
pub struct Selector {
|
|
pub query: String,
|
|
pub tag: Tag,
|
|
pub as_html: bool,
|
|
pub attribute: String,
|
|
pub as_table: Value,
|
|
pub inspect: bool,
|
|
}
|
|
|
|
impl Selector {
|
|
pub fn new() -> Selector {
|
|
Selector {
|
|
query: String::new(),
|
|
tag: Tag::unknown(),
|
|
as_html: false,
|
|
attribute: String::new(),
|
|
as_table: Value::new(
|
|
UntaggedValue::Primitive(Primitive::String("".to_string())),
|
|
Tag::unknown(),
|
|
),
|
|
inspect: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Default for Selector {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
pub fn begin_selector_query(input_html: String, selector: &Selector) -> Vec<Value> {
|
|
if !selector.as_table.value.is_string() {
|
|
retrieve_tables(input_html.as_str(), &selector.as_table, selector.inspect)
|
|
} else {
|
|
match selector.attribute.is_empty() {
|
|
true => execute_selector_query(
|
|
input_html.as_str(),
|
|
selector.query.as_str(),
|
|
selector.as_html,
|
|
),
|
|
false => execute_selector_query_with_attribute(
|
|
input_html.as_str(),
|
|
selector.query.as_str(),
|
|
selector.attribute.as_str(),
|
|
),
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool) -> Vec<Value> {
|
|
let html = input_string;
|
|
let mut cols = Vec::new();
|
|
if let UntaggedValue::Table(t) = &columns.value {
|
|
for x in t {
|
|
cols.push(x.convert_to_string());
|
|
}
|
|
}
|
|
|
|
if inspect_mode {
|
|
eprintln!("Passed in Column Headers = {:#?}", &cols,);
|
|
}
|
|
|
|
let tables = match Table::find_by_headers(html, &cols) {
|
|
Some(t) => {
|
|
if inspect_mode {
|
|
eprintln!("Table Found = {:#?}", &t);
|
|
}
|
|
t
|
|
}
|
|
None => vec![Table::empty()],
|
|
};
|
|
if tables.len() == 1 {
|
|
return retrieve_table(
|
|
tables
|
|
.into_iter()
|
|
.next()
|
|
.expect("This should never trigger"),
|
|
columns,
|
|
);
|
|
}
|
|
tables
|
|
.into_iter()
|
|
.map(move |table| {
|
|
UntaggedValue::Table(retrieve_table(table, columns)).into_value(Tag::unknown())
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
fn retrieve_table(mut table: Table, columns: &Value) -> Vec<Value> {
|
|
let mut cols = Vec::new();
|
|
if let UntaggedValue::Table(t) = &columns.value {
|
|
for x in t {
|
|
cols.push(x.convert_to_string());
|
|
}
|
|
}
|
|
|
|
if cols.is_empty() && !table.headers().is_empty() {
|
|
for col in table.headers().keys() {
|
|
cols.push(col.to_string());
|
|
}
|
|
}
|
|
|
|
let mut table_out = Vec::new();
|
|
// sometimes there are tables where the first column is the headers, kind of like
|
|
// a table has ben rotated ccw 90 degrees, in these cases all columns will be missing
|
|
// we keep track of this with this variable so we can deal with it later
|
|
let mut at_least_one_row_filled = false;
|
|
// if columns are still empty, let's just make a single column table with the data
|
|
if cols.is_empty() {
|
|
at_least_one_row_filled = true;
|
|
let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect();
|
|
|
|
for row in &table_with_no_empties {
|
|
let mut dict = TaggedDictBuilder::new(Tag::unknown());
|
|
for (counter, cell) in row.iter().enumerate() {
|
|
let col_name = format!("Column{}", counter);
|
|
dict.insert_value(
|
|
col_name,
|
|
UntaggedValue::Primitive(Primitive::String(cell.to_string()))
|
|
.into_value(Tag::unknown()),
|
|
);
|
|
}
|
|
table_out.push(dict.into_value());
|
|
}
|
|
} else {
|
|
for row in &table {
|
|
let mut dict = TaggedDictBuilder::new(Tag::unknown());
|
|
// eprintln!("row={:?}", &row);
|
|
for col in &cols {
|
|
//eprintln!("col={:?}", &col);
|
|
let key = col.to_string();
|
|
let val = row
|
|
.get(col)
|
|
.unwrap_or(&format!("Missing column: '{}'", &col))
|
|
.to_string();
|
|
if !at_least_one_row_filled && val != format!("Missing column: '{}'", &col) {
|
|
at_least_one_row_filled = true;
|
|
}
|
|
dict.insert_value(
|
|
key,
|
|
UntaggedValue::Primitive(Primitive::String(val)).into_value(Tag::unknown()),
|
|
);
|
|
}
|
|
table_out.push(dict.into_value());
|
|
}
|
|
}
|
|
if !at_least_one_row_filled {
|
|
let mut data2 = Vec::new();
|
|
for x in &table.data {
|
|
data2.push(x.join(", "));
|
|
}
|
|
table.data = vec![data2];
|
|
return retrieve_table(table, columns);
|
|
}
|
|
table_out
|
|
}
|
|
|
|
fn execute_selector_query_with_attribute(
|
|
input_string: &str,
|
|
query_string: &str,
|
|
attribute: &str,
|
|
) -> Vec<Value> {
|
|
let doc = Html::parse_fragment(input_string);
|
|
|
|
doc.select(&css(query_string))
|
|
.map(|selection| {
|
|
selection
|
|
.value()
|
|
.attr(attribute)
|
|
.unwrap_or("")
|
|
.to_string()
|
|
.to_string_value_create_tag()
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
fn execute_selector_query(input_string: &str, query_string: &str, as_html: bool) -> Vec<Value> {
|
|
let doc = Html::parse_fragment(input_string);
|
|
|
|
match as_html {
|
|
true => doc
|
|
.select(&css(query_string))
|
|
.map(|selection| selection.html().to_string_value_create_tag())
|
|
.collect(),
|
|
false => doc
|
|
.select(&css(query_string))
|
|
.map(|selection| {
|
|
selection
|
|
.text()
|
|
.fold("".to_string(), |acc, x| format!("{}{}", acc, x))
|
|
.to_string_value_create_tag()
|
|
})
|
|
.collect(),
|
|
}
|
|
}
|
|
|
|
pub fn css(selector: &str) -> ScraperSelector {
|
|
ScraperSelector::parse(selector).expect("this should never trigger")
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal fixture: an unordered list with three items.
    const SIMPLE_LIST: &str = r#"
<ul>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
"#;

    /// Selecting the first list item yields at least one value.
    #[test]
    fn test_first_child_is_not_empty() {
        let matches = execute_selector_query(SIMPLE_LIST, "li:first-child", false);
        assert!(!matches.is_empty())
    }

    /// The text of the first list item is exactly "Coffee".
    #[test]
    fn test_first_child() {
        let expected = vec!["Coffee".to_string().to_string_value_create_tag()];
        let actual = execute_selector_query(SIMPLE_LIST, "li:first-child", false);
        assert_eq!(expected, actual)
    }
}
|