mirror of
https://github.com/nushell/nushell.git
synced 2025-04-11 14:58:21 +02:00
Scraping multiple tables (#4036)
* Output error when ls into a file without permission * math sqrt * added test to check fails when ls into prohibited dir * fix lint * math sqrt with tests and doc * trigger wasm build * Update filesystem_shell.rs * Fix Running echo .. starts printing integers forever * Allow for multiple table scraping * linting * Fix clippy * linting Co-authored-by: Jonathan Turner <jonathandturner@users.noreply.github.com>
This commit is contained in:
parent
962b258cc6
commit
1de7c3d033
@ -66,18 +66,39 @@ pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool)
|
||||
eprintln!("Passed in Column Headers = {:#?}", &cols,);
|
||||
}
|
||||
|
||||
let mut table = match Table::find_by_headers(html, &cols) {
|
||||
let tables = match Table::find_by_headers(html, &cols) {
|
||||
Some(t) => {
|
||||
if inspect_mode {
|
||||
eprintln!("Table Found = {:#?}", &t);
|
||||
}
|
||||
t
|
||||
}
|
||||
None => Table::empty(),
|
||||
None => vec![Table::empty()],
|
||||
};
|
||||
if tables.len() == 1 {
|
||||
return retrieve_table(
|
||||
tables
|
||||
.into_iter()
|
||||
.next()
|
||||
.expect("This should never trigger"),
|
||||
columns,
|
||||
);
|
||||
}
|
||||
tables
|
||||
.into_iter()
|
||||
.map(move |table| {
|
||||
UntaggedValue::Table(retrieve_table(table, columns)).into_value(Tag::unknown())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
let mut table_out = Vec::new();
|
||||
|
||||
fn retrieve_table(mut table: Table, columns: &Value) -> Vec<Value> {
|
||||
let mut cols = Vec::new();
|
||||
if let UntaggedValue::Table(t) = &columns.value {
|
||||
for x in t {
|
||||
cols.push(x.convert_to_string());
|
||||
}
|
||||
}
|
||||
// since cols was empty and headers is not, it means that headers were manually populated
|
||||
// so let's fake the data in order to build a proper table. this situation happens when
|
||||
// there are tables where the first column is actually the headers. kind of like a table
|
||||
@ -95,6 +116,7 @@ pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool)
|
||||
table.data = vec![data2];
|
||||
}
|
||||
|
||||
let mut table_out = Vec::new();
|
||||
// if columns are still empty, let's just make a single column table with the data
|
||||
if cols.is_empty() {
|
||||
let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect();
|
||||
|
@ -18,9 +18,13 @@ impl Table {
|
||||
html.select(&css("table")).next().map(Table::new)
|
||||
}
|
||||
|
||||
pub fn find_all_tables(html: &str) -> Vec<Table> {
|
||||
pub fn find_all_tables(html: &str) -> Option<Vec<Table>> {
|
||||
let html = Html::parse_fragment(html);
|
||||
html.select(&css("table")).map(Table::new).collect()
|
||||
let iter: Vec<Table> = html.select(&css("table")).map(Table::new).collect();
|
||||
if iter.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(iter)
|
||||
}
|
||||
|
||||
/// Finds the table in `html` with an id of `id`.
|
||||
@ -40,12 +44,12 @@ impl Table {
|
||||
///
|
||||
/// If `headers` is empty, this is the same as
|
||||
/// [`find_first`](#method.find_first).
|
||||
pub fn find_by_headers<T>(html: &str, headers: &[T]) -> Option<Table>
|
||||
pub fn find_by_headers<T>(html: &str, headers: &[T]) -> Option<Vec<Table>>
|
||||
where
|
||||
T: AsRef<str>,
|
||||
{
|
||||
if headers.is_empty() {
|
||||
return Table::find_first(html);
|
||||
return Table::find_all_tables(html);
|
||||
}
|
||||
|
||||
let sel_table = css("table");
|
||||
@ -53,14 +57,17 @@ impl Table {
|
||||
let sel_th = css("th");
|
||||
|
||||
let html = Html::parse_fragment(html);
|
||||
html.select(&sel_table)
|
||||
.find(|table| {
|
||||
let mut tables = html
|
||||
.select(&sel_table)
|
||||
.filter(|table| {
|
||||
table.select(&sel_tr).next().map_or(false, |tr| {
|
||||
let cells = select_cells(tr, &sel_th, true);
|
||||
headers.iter().all(|h| contains_str(&cells, h.as_ref()))
|
||||
})
|
||||
})
|
||||
.map(Table::new)
|
||||
.peekable();
|
||||
tables.peek()?;
|
||||
Some(tables.map(Table::new).collect())
|
||||
}
|
||||
|
||||
/// Returns the headers of the table.
|
||||
@ -350,6 +357,15 @@ mod tests {
|
||||
<table>
|
||||
<tr><td>Name</td><td>Age</td></tr>
|
||||
</table>
|
||||
"#;
|
||||
|
||||
const TWO_TABLES_TD: &'static str = r#"
|
||||
<table>
|
||||
<tr><td>Name</td><td>Age</td></tr>
|
||||
</table>
|
||||
<table>
|
||||
<tr><td>Profession</td><td>Civil State</td></tr>
|
||||
</table>
|
||||
"#;
|
||||
|
||||
const TABLE_TH_TD: &'static str = r#"
|
||||
@ -357,6 +373,17 @@ mod tests {
|
||||
<tr><th>Name</th><th>Age</th></tr>
|
||||
<tr><td>John</td><td>20</td></tr>
|
||||
</table>
|
||||
"#;
|
||||
|
||||
const TWO_TABLES_TH_TD: &'static str = r#"
|
||||
<table>
|
||||
<tr><th>Name</th><th>Age</th></tr>
|
||||
<tr><td>John</td><td>20</td></tr>
|
||||
</table>
|
||||
<table>
|
||||
<tr><th>Profession</th><th>Civil State</th></tr>
|
||||
<tr><td>Mechanic</td><td>Single</td></tr>
|
||||
</table>
|
||||
"#;
|
||||
|
||||
const TABLE_TD_TD: &'static str = r#"
|
||||
@ -381,6 +408,29 @@ mod tests {
|
||||
<tr></tr>
|
||||
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
|
||||
</table>
|
||||
"#;
|
||||
|
||||
const TWO_TABLES_COMPLEX: &'static str = r#"
|
||||
<!doctype HTML>
|
||||
<html>
|
||||
<head><title>foo</title></head>
|
||||
<body>
|
||||
<table>
|
||||
<tr><th>Name</th><th>Age</th><th>Extra</th></tr>
|
||||
<tr><td>John</td><td>20</td></tr>
|
||||
<tr><td>May</td><td>30</td><td>foo</td></tr>
|
||||
<tr></tr>
|
||||
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
|
||||
</table>
|
||||
<table>
|
||||
<tr><th>Profession</th><th>Civil State</th><th>Extra</th></tr>
|
||||
<tr><td>Carpenter</td><td>Single</td></tr>
|
||||
<tr><td>Mechanic</td><td>Married</td><td>bar</td></tr>
|
||||
<tr></tr>
|
||||
<tr><td>e</td><td>f</td><td>g</td><td>h</td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
"#;
|
||||
|
||||
const HTML_NO_TABLE: &'static str = r#"
|
||||
@ -775,6 +825,29 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_len_two_tables() {
|
||||
let tables = Table::find_all_tables(HTML_TWO_TABLES).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let table_1 = tables_iter.next().unwrap();
|
||||
let table_2 = tables_iter.next().unwrap();
|
||||
assert_eq!(vec![2], table_1.iter().map(|r| r.len()).collect::<Vec<_>>());
|
||||
assert_eq!(vec![2], table_2.iter().map(|r| r.len()).collect::<Vec<_>>());
|
||||
|
||||
let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let table_1 = tables_iter.next().unwrap();
|
||||
let table_2 = tables_iter.next().unwrap();
|
||||
assert_eq!(
|
||||
vec![2, 3, 0, 4],
|
||||
table_1.iter().map(|r| r.len()).collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
vec![2, 3, 0, 4],
|
||||
table_2.iter().map(|r| r.len()).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_get_without_headers() {
|
||||
let table = Table::find_first(TABLE_TD).unwrap();
|
||||
@ -831,6 +904,55 @@ mod tests {
|
||||
assert_eq!(None, iter.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_two_tables_row_get_complex() {
|
||||
let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let table_1 = tables_iter.next().unwrap();
|
||||
let table_2 = tables_iter.next().unwrap();
|
||||
let mut iter_1 = table_1.iter();
|
||||
let mut iter_2 = table_2.iter();
|
||||
|
||||
let row_table_1 = iter_1.next().unwrap();
|
||||
let row_table_2 = iter_2.next().unwrap();
|
||||
assert_eq!(Some("John"), row_table_1.get("Name"));
|
||||
assert_eq!(Some("20"), row_table_1.get("Age"));
|
||||
assert_eq!(None, row_table_1.get("Extra"));
|
||||
assert_eq!(Some("Carpenter"), row_table_2.get("Profession"));
|
||||
assert_eq!(Some("Single"), row_table_2.get("Civil State"));
|
||||
assert_eq!(None, row_table_2.get("Extra"));
|
||||
|
||||
let row_table_1 = iter_1.next().unwrap();
|
||||
let row_table_2 = iter_2.next().unwrap();
|
||||
assert_eq!(Some("May"), row_table_1.get("Name"));
|
||||
assert_eq!(Some("30"), row_table_1.get("Age"));
|
||||
assert_eq!(Some("foo"), row_table_1.get("Extra"));
|
||||
assert_eq!(Some("Mechanic"), row_table_2.get("Profession"));
|
||||
assert_eq!(Some("Married"), row_table_2.get("Civil State"));
|
||||
assert_eq!(Some("bar"), row_table_2.get("Extra"));
|
||||
|
||||
let row_table_1 = iter_1.next().unwrap();
|
||||
let row_table_2 = iter_2.next().unwrap();
|
||||
assert_eq!(None, row_table_1.get("Name"));
|
||||
assert_eq!(None, row_table_1.get("Age"));
|
||||
assert_eq!(None, row_table_1.get("Extra"));
|
||||
assert_eq!(None, row_table_2.get("Name"));
|
||||
assert_eq!(None, row_table_2.get("Age"));
|
||||
assert_eq!(None, row_table_2.get("Extra"));
|
||||
|
||||
let row_table_1 = iter_1.next().unwrap();
|
||||
let row_table_2 = iter_2.next().unwrap();
|
||||
assert_eq!(Some("a"), row_table_1.get("Name"));
|
||||
assert_eq!(Some("b"), row_table_1.get("Age"));
|
||||
assert_eq!(Some("c"), row_table_1.get("Extra"));
|
||||
assert_eq!(Some("e"), row_table_2.get("Profession"));
|
||||
assert_eq!(Some("f"), row_table_2.get("Civil State"));
|
||||
assert_eq!(Some("g"), row_table_2.get("Extra"));
|
||||
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(None, iter_2.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_as_slice_without_headers() {
|
||||
let table = Table::find_first(TABLE_TD).unwrap();
|
||||
@ -840,6 +962,24 @@ mod tests {
|
||||
assert_eq!(None, iter.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_as_slice_without_headers_two_tables() {
|
||||
let tables = Table::find_all_tables(TWO_TABLES_TD).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let table_1 = tables_iter.next().unwrap();
|
||||
let table_2 = tables_iter.next().unwrap();
|
||||
let mut iter_1 = table_1.iter();
|
||||
let mut iter_2 = table_2.iter();
|
||||
|
||||
assert_eq!(&["Name", "Age"], iter_1.next().unwrap().as_slice());
|
||||
assert_eq!(
|
||||
&["Profession", "Civil State"],
|
||||
iter_2.next().unwrap().as_slice()
|
||||
);
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(None, iter_2.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_as_slice_with_headers() {
|
||||
let table = Table::find_first(TABLE_TH_TD).unwrap();
|
||||
@ -849,6 +989,21 @@ mod tests {
|
||||
assert_eq!(None, iter.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_as_slice_with_headers_two_tables() {
|
||||
let tables = Table::find_all_tables(TWO_TABLES_TH_TD).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let table_1 = tables_iter.next().unwrap();
|
||||
let table_2 = tables_iter.next().unwrap();
|
||||
let mut iter_1 = table_1.iter();
|
||||
let mut iter_2 = table_2.iter();
|
||||
|
||||
assert_eq!(&["John", "20"], iter_1.next().unwrap().as_slice());
|
||||
assert_eq!(&["Mechanic", "Single"], iter_2.next().unwrap().as_slice());
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(None, iter_2.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_as_slice_complex() {
|
||||
let table = Table::find_first(TABLE_COMPLEX).unwrap();
|
||||
@ -862,6 +1017,31 @@ mod tests {
|
||||
assert_eq!(None, iter.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_as_slice_complex_two_tables() {
|
||||
let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let table_1 = tables_iter.next().unwrap();
|
||||
let table_2 = tables_iter.next().unwrap();
|
||||
let mut iter_1 = table_1.iter();
|
||||
let mut iter_2 = table_2.iter();
|
||||
let empty: [&str; 0] = [];
|
||||
|
||||
assert_eq!(&["John", "20"], iter_1.next().unwrap().as_slice());
|
||||
assert_eq!(&["May", "30", "foo"], iter_1.next().unwrap().as_slice());
|
||||
assert_eq!(&empty, iter_1.next().unwrap().as_slice());
|
||||
assert_eq!(&["a", "b", "c", "d"], iter_1.next().unwrap().as_slice());
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(&["Carpenter", "Single"], iter_2.next().unwrap().as_slice());
|
||||
assert_eq!(
|
||||
&["Mechanic", "Married", "bar"],
|
||||
iter_2.next().unwrap().as_slice()
|
||||
);
|
||||
assert_eq!(&empty, iter_2.next().unwrap().as_slice());
|
||||
assert_eq!(&["e", "f", "g", "h"], iter_2.next().unwrap().as_slice());
|
||||
assert_eq!(None, iter_2.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_iter_simple() {
|
||||
let table = Table::find_first(TABLE_TD).unwrap();
|
||||
@ -873,6 +1053,25 @@ mod tests {
|
||||
assert_eq!(None, iter.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_iter_simple_two_tables() {
|
||||
let tables = Table::find_all_tables(TWO_TABLES_TD).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let table_1 = tables_iter.next().unwrap();
|
||||
let table_2 = tables_iter.next().unwrap();
|
||||
let row_1 = table_1.iter().next().unwrap();
|
||||
let row_2 = table_2.iter().next().unwrap();
|
||||
let mut iter_1 = row_1.iter();
|
||||
let mut iter_2 = row_2.iter();
|
||||
|
||||
assert_eq!(Some("Name"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(Some("Age"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(Some("Profession"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(Some("Civil State"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(None, iter_2.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_iter_complex() {
|
||||
let table = Table::find_first(TABLE_COMPLEX).unwrap();
|
||||
@ -904,6 +1103,60 @@ mod tests {
|
||||
assert_eq!(None, iter.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_row_iter_complex_two_tables() {
|
||||
let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
|
||||
let mut tables_iter = tables.iter();
|
||||
let mut table_1 = tables_iter.next().unwrap().iter();
|
||||
let mut table_2 = tables_iter.next().unwrap().iter();
|
||||
|
||||
let row_1 = table_1.next().unwrap();
|
||||
let row_2 = table_2.next().unwrap();
|
||||
let mut iter_1 = row_1.iter();
|
||||
let mut iter_2 = row_2.iter();
|
||||
assert_eq!(Some("John"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(Some("20"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(Some("Carpenter"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(Some("Single"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(None, iter_2.next());
|
||||
|
||||
let row_1 = table_1.next().unwrap();
|
||||
let row_2 = table_2.next().unwrap();
|
||||
let mut iter_1 = row_1.iter();
|
||||
let mut iter_2 = row_2.iter();
|
||||
assert_eq!(Some("May"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(Some("30"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(Some("foo"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(Some("Mechanic"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(Some("Married"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(Some("bar"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(None, iter_2.next());
|
||||
|
||||
let row_1 = table_1.next().unwrap();
|
||||
let row_2 = table_2.next().unwrap();
|
||||
let mut iter_1 = row_1.iter();
|
||||
let mut iter_2 = row_2.iter();
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(None, iter_2.next());
|
||||
|
||||
let row_1 = table_1.next().unwrap();
|
||||
let row_2 = table_2.next().unwrap();
|
||||
let mut iter_1 = row_1.iter();
|
||||
let mut iter_2 = row_2.iter();
|
||||
assert_eq!(Some("a"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(Some("b"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(Some("c"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(Some("d"), iter_1.next().map(String::as_str));
|
||||
assert_eq!(None, iter_1.next());
|
||||
assert_eq!(Some("e"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(Some("f"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(Some("g"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(Some("h"), iter_2.next().map(String::as_str));
|
||||
assert_eq!(None, iter_2.next());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wikipedia_swapped_rows_columns() {
|
||||
// empty columns
|
||||
|
Loading…
Reference in New Issue
Block a user