Scraping multiple tables (#4036)

* Output error when ls into a file without permission

* math sqrt

* added test to check fails when ls into prohibited dir

* fix lint

* math sqrt with tests and doc

* trigger wasm build

* Update filesystem_shell.rs

* Fix Running echo .. starts printing integers forever

* Allow for multiple table scraping

* linting

* Fix clippy

* linting

Co-authored-by: Jonathan Turner <jonathandturner@users.noreply.github.com>
This commit is contained in:
Luccas Mateus 2021-09-24 10:08:13 -03:00 committed by GitHub
parent 962b258cc6
commit 1de7c3d033
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 286 additions and 11 deletions

View File

@ -66,18 +66,39 @@ pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool)
eprintln!("Passed in Column Headers = {:#?}", &cols,);
}
let mut table = match Table::find_by_headers(html, &cols) {
let tables = match Table::find_by_headers(html, &cols) {
Some(t) => {
if inspect_mode {
eprintln!("Table Found = {:#?}", &t);
}
t
}
None => Table::empty(),
None => vec![Table::empty()],
};
if tables.len() == 1 {
return retrieve_table(
tables
.into_iter()
.next()
.expect("This should never trigger"),
columns,
);
}
tables
.into_iter()
.map(move |table| {
UntaggedValue::Table(retrieve_table(table, columns)).into_value(Tag::unknown())
})
.collect()
}
let mut table_out = Vec::new();
fn retrieve_table(mut table: Table, columns: &Value) -> Vec<Value> {
let mut cols = Vec::new();
if let UntaggedValue::Table(t) = &columns.value {
for x in t {
cols.push(x.convert_to_string());
}
}
// since cols was empty and headers is not, it means that headers were manually populated
// so let's fake the data in order to build a proper table. this situation happens when
// there are tables where the first column is actually the headers. kind of like a table
@ -95,6 +116,7 @@ pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool)
table.data = vec![data2];
}
let mut table_out = Vec::new();
// if columns are still empty, let's just make a single column table with the data
if cols.is_empty() {
let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect();

View File

@ -18,9 +18,13 @@ impl Table {
html.select(&css("table")).next().map(Table::new)
}
pub fn find_all_tables(html: &str) -> Vec<Table> {
pub fn find_all_tables(html: &str) -> Option<Vec<Table>> {
let html = Html::parse_fragment(html);
html.select(&css("table")).map(Table::new).collect()
let iter: Vec<Table> = html.select(&css("table")).map(Table::new).collect();
if iter.is_empty() {
return None;
}
Some(iter)
}
/// Finds the table in `html` with an id of `id`.
@ -40,12 +44,12 @@ impl Table {
///
/// If `headers` is empty, this is the same as
/// [`find_first`](#method.find_first).
pub fn find_by_headers<T>(html: &str, headers: &[T]) -> Option<Table>
pub fn find_by_headers<T>(html: &str, headers: &[T]) -> Option<Vec<Table>>
where
T: AsRef<str>,
{
if headers.is_empty() {
return Table::find_first(html);
return Table::find_all_tables(html);
}
let sel_table = css("table");
@ -53,14 +57,17 @@ impl Table {
let sel_th = css("th");
let html = Html::parse_fragment(html);
html.select(&sel_table)
.find(|table| {
let mut tables = html
.select(&sel_table)
.filter(|table| {
table.select(&sel_tr).next().map_or(false, |tr| {
let cells = select_cells(tr, &sel_th, true);
headers.iter().all(|h| contains_str(&cells, h.as_ref()))
})
})
.map(Table::new)
.peekable();
tables.peek()?;
Some(tables.map(Table::new).collect())
}
/// Returns the headers of the table.
@ -350,6 +357,15 @@ mod tests {
<table>
<tr><td>Name</td><td>Age</td></tr>
</table>
"#;
// Fixture: two sibling tables, each holding a single row of `<td>` cells and
// no `<th>` header row.
const TWO_TABLES_TD: &str = r#"
<table>
<tr><td>Name</td><td>Age</td></tr>
</table>
<table>
<tr><td>Profession</td><td>Civil State</td></tr>
</table>
"#;
const TABLE_TH_TD: &'static str = r#"
@ -357,6 +373,17 @@ mod tests {
<tr><th>Name</th><th>Age</th></tr>
<tr><td>John</td><td>20</td></tr>
</table>
"#;
// Fixture: two sibling tables, each with a `<th>` header row followed by one
// `<td>` data row.
const TWO_TABLES_TH_TD: &str = r#"
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>John</td><td>20</td></tr>
</table>
<table>
<tr><th>Profession</th><th>Civil State</th></tr>
<tr><td>Mechanic</td><td>Single</td></tr>
</table>
"#;
const TABLE_TD_TD: &'static str = r#"
@ -381,6 +408,29 @@ mod tests {
<tr></tr>
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
</table>
"#;
// Fixture: a full HTML document containing two tables with three headers each
// and ragged data rows: a short row (2 cells), a full row (3 cells), an empty
// row, and an over-long row (4 cells).
const TWO_TABLES_COMPLEX: &str = r#"
<!doctype HTML>
<html>
<head><title>foo</title></head>
<body>
<table>
<tr><th>Name</th><th>Age</th><th>Extra</th></tr>
<tr><td>John</td><td>20</td></tr>
<tr><td>May</td><td>30</td><td>foo</td></tr>
<tr></tr>
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
</table>
<table>
<tr><th>Profession</th><th>Civil State</th><th>Extra</th></tr>
<tr><td>Carpenter</td><td>Single</td></tr>
<tr><td>Mechanic</td><td>Married</td><td>bar</td></tr>
<tr></tr>
<tr><td>e</td><td>f</td><td>g</td><td>h</td></tr>
</table>
</body>
</html>
"#;
const HTML_NO_TABLE: &'static str = r#"
@ -775,6 +825,29 @@ mod tests {
);
}
#[test]
fn test_row_len_two_tables() {
    // Cell count of every data row of a table, in row order.
    let row_lens = |table: &Table| table.iter().map(|row| row.len()).collect::<Vec<_>>();

    // Simple fixture: each of the first two tables is expected to yield a
    // single two-cell row.
    let simple = Table::find_all_tables(HTML_TWO_TABLES).unwrap();
    let (first, second) = (&simple[0], &simple[1]);
    assert_eq!(vec![2], row_lens(first));
    assert_eq!(vec![2], row_lens(second));

    // Complex fixture: both tables share the same ragged row shape
    // (short, full, empty, over-long).
    let complex = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
    let (first, second) = (&complex[0], &complex[1]);
    assert_eq!(vec![2, 3, 0, 4], row_lens(first));
    assert_eq!(vec![2, 3, 0, 4], row_lens(second));
}
#[test]
fn test_row_get_without_headers() {
let table = Table::find_first(TABLE_TD).unwrap();
@ -831,6 +904,55 @@ mod tests {
assert_eq!(None, iter.next());
}
#[test]
fn test_two_tables_row_get_complex() {
    // The test expects both tables of the fixture to be found, with the
    // "Name/Age/Extra" table first and the "Profession/Civil State/Extra"
    // table second.
    let tables = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
    let mut tables_iter = tables.iter();
    let table_1 = tables_iter.next().unwrap();
    let table_2 = tables_iter.next().unwrap();
    let mut iter_1 = table_1.iter();
    let mut iter_2 = table_2.iter();

    // Row 1: only two cells, so the third header has no value.
    let row_table_1 = iter_1.next().unwrap();
    let row_table_2 = iter_2.next().unwrap();
    assert_eq!(Some("John"), row_table_1.get("Name"));
    assert_eq!(Some("20"), row_table_1.get("Age"));
    assert_eq!(None, row_table_1.get("Extra"));
    assert_eq!(Some("Carpenter"), row_table_2.get("Profession"));
    assert_eq!(Some("Single"), row_table_2.get("Civil State"));
    assert_eq!(None, row_table_2.get("Extra"));

    // Row 2: one cell per header.
    let row_table_1 = iter_1.next().unwrap();
    let row_table_2 = iter_2.next().unwrap();
    assert_eq!(Some("May"), row_table_1.get("Name"));
    assert_eq!(Some("30"), row_table_1.get("Age"));
    assert_eq!(Some("foo"), row_table_1.get("Extra"));
    assert_eq!(Some("Mechanic"), row_table_2.get("Profession"));
    assert_eq!(Some("Married"), row_table_2.get("Civil State"));
    assert_eq!(Some("bar"), row_table_2.get("Extra"));

    // Row 3: `<tr></tr>` has no cells, so every header lookup misses.
    // Each table is queried with its OWN headers; asking table 2 for
    // table 1's "Name"/"Age" would return `None` regardless of the row
    // contents and would not exercise the empty-row case at all.
    let row_table_1 = iter_1.next().unwrap();
    let row_table_2 = iter_2.next().unwrap();
    assert_eq!(None, row_table_1.get("Name"));
    assert_eq!(None, row_table_1.get("Age"));
    assert_eq!(None, row_table_1.get("Extra"));
    assert_eq!(None, row_table_2.get("Profession"));
    assert_eq!(None, row_table_2.get("Civil State"));
    assert_eq!(None, row_table_2.get("Extra"));

    // Row 4: four cells against three headers; only the three
    // header-addressable cells are checked here.
    let row_table_1 = iter_1.next().unwrap();
    let row_table_2 = iter_2.next().unwrap();
    assert_eq!(Some("a"), row_table_1.get("Name"));
    assert_eq!(Some("b"), row_table_1.get("Age"));
    assert_eq!(Some("c"), row_table_1.get("Extra"));
    assert_eq!(Some("e"), row_table_2.get("Profession"));
    assert_eq!(Some("f"), row_table_2.get("Civil State"));
    assert_eq!(Some("g"), row_table_2.get("Extra"));

    // No further rows in either table.
    assert_eq!(None, iter_1.next());
    assert_eq!(None, iter_2.next());
}
#[test]
fn test_row_as_slice_without_headers() {
let table = Table::find_first(TABLE_TD).unwrap();
@ -840,6 +962,24 @@ mod tests {
assert_eq!(None, iter.next());
}
#[test]
fn test_row_as_slice_without_headers_two_tables() {
    let found = Table::find_all_tables(TWO_TABLES_TD).unwrap();
    let (first, second) = (&found[0], &found[1]);
    let mut rows_a = first.iter();
    let mut rows_b = second.iter();

    // The fixture has no `<th>` cells; the test expects the lone `<td>` row
    // of each table to come back as a data row.
    let row_a = rows_a.next().unwrap();
    assert_eq!(&["Name", "Age"], row_a.as_slice());
    let row_b = rows_b.next().unwrap();
    assert_eq!(&["Profession", "Civil State"], row_b.as_slice());

    // Exactly one row per table.
    assert_eq!(None, rows_a.next());
    assert_eq!(None, rows_b.next());
}
#[test]
fn test_row_as_slice_with_headers() {
let table = Table::find_first(TABLE_TH_TD).unwrap();
@ -849,6 +989,21 @@ mod tests {
assert_eq!(None, iter.next());
}
#[test]
fn test_row_as_slice_with_headers_two_tables() {
    let found = Table::find_all_tables(TWO_TABLES_TH_TD).unwrap();
    let (first, second) = (&found[0], &found[1]);
    let mut rows_a = first.iter();
    let mut rows_b = second.iter();

    // The `<th>` row is expected to be consumed as headers, leaving only the
    // `<td>` row as data in each table.
    let row_a = rows_a.next().unwrap();
    assert_eq!(&["John", "20"], row_a.as_slice());
    let row_b = rows_b.next().unwrap();
    assert_eq!(&["Mechanic", "Single"], row_b.as_slice());

    assert_eq!(None, rows_a.next());
    assert_eq!(None, rows_b.next());
}
#[test]
fn test_row_as_slice_complex() {
let table = Table::find_first(TABLE_COMPLEX).unwrap();
@ -862,6 +1017,31 @@ mod tests {
assert_eq!(None, iter.next());
}
#[test]
fn test_row_as_slice_complex_two_tables() {
    let found = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
    let (first, second) = (&found[0], &found[1]);
    let mut rows_a = first.iter();
    let mut rows_b = second.iter();

    // Expected data rows of the first table, in order: short, full, empty,
    // over-long.
    let want_a: [&[&str]; 4] = [
        &["John", "20"],
        &["May", "30", "foo"],
        &[],
        &["a", "b", "c", "d"],
    ];
    for want in want_a.iter() {
        let row = rows_a.next().unwrap();
        assert_eq!(*want, row.as_slice());
    }
    assert_eq!(None, rows_a.next());

    // Same shape for the second table.
    let want_b: [&[&str]; 4] = [
        &["Carpenter", "Single"],
        &["Mechanic", "Married", "bar"],
        &[],
        &["e", "f", "g", "h"],
    ];
    for want in want_b.iter() {
        let row = rows_b.next().unwrap();
        assert_eq!(*want, row.as_slice());
    }
    assert_eq!(None, rows_b.next());
}
#[test]
fn test_row_iter_simple() {
let table = Table::find_first(TABLE_TD).unwrap();
@ -873,6 +1053,25 @@ mod tests {
assert_eq!(None, iter.next());
}
#[test]
fn test_row_iter_simple_two_tables() {
    let found = Table::find_all_tables(TWO_TABLES_TD).unwrap();
    let (first, second) = (&found[0], &found[1]);

    // Only the first row of each table is inspected.
    let row_a = first.iter().next().unwrap();
    let row_b = second.iter().next().unwrap();

    // Walking a row cell by cell must yield exactly these values, in order,
    // and then stop.
    let cells_a: Vec<&str> = row_a.iter().map(String::as_str).collect();
    assert_eq!(vec!["Name", "Age"], cells_a);

    let cells_b: Vec<&str> = row_b.iter().map(String::as_str).collect();
    assert_eq!(vec!["Profession", "Civil State"], cells_b);
}
#[test]
fn test_row_iter_complex() {
let table = Table::find_first(TABLE_COMPLEX).unwrap();
@ -904,6 +1103,60 @@ mod tests {
assert_eq!(None, iter.next());
}
#[test]
fn test_row_iter_complex_two_tables() {
    let found = Table::find_all_tables(TWO_TABLES_COMPLEX).unwrap();
    let (first, second) = (&found[0], &found[1]);
    let mut rows_a = first.iter();
    let mut rows_b = second.iter();

    // Expected cells of the four data rows of each table: short, full,
    // empty, over-long. Rows of the two tables are checked pairwise.
    let want_a: [&[&str]; 4] = [
        &["John", "20"],
        &["May", "30", "foo"],
        &[],
        &["a", "b", "c", "d"],
    ];
    let want_b: [&[&str]; 4] = [
        &["Carpenter", "Single"],
        &["Mechanic", "Married", "bar"],
        &[],
        &["e", "f", "g", "h"],
    ];

    for (cells_a, cells_b) in want_a.iter().zip(want_b.iter()) {
        let row_a = rows_a.next().unwrap();
        let row_b = rows_b.next().unwrap();

        // Iterating a row must yield exactly its cells and then end.
        let got_a: Vec<&str> = row_a.iter().map(String::as_str).collect();
        assert_eq!(cells_a.to_vec(), got_a);

        let got_b: Vec<&str> = row_b.iter().map(String::as_str).collect();
        assert_eq!(cells_b.to_vec(), got_b);
    }
}
#[test]
fn test_wikipedia_swapped_rows_columns() {
// empty columns