From 430b2746b805b06a9c7d3c392ac90761e6de4ab8 Mon Sep 17 00:00:00 2001 From: 132ikl <132@ikl.sh> Date: Wed, 12 Mar 2025 09:09:55 -0400 Subject: [PATCH] Parse XML documents with DTDs by default, and add `--disallow-dtd` flag (#15272) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This PR allows `from xml` to parse XML documents with [document type declarations](https://en.wikipedia.org/wiki/Document_type_declaration) by default. This is especially notable since many HTML documents start with ``, and `roxmltree` should be able to parse some simple HTML documents. The security concerns with DTDs are [XXE attacks](https://en.wikipedia.org/wiki/XML_external_entity_attack), and [exponential entity expansion attacks](https://en.wikipedia.org/wiki/Billion_laughs_attack). `roxmltree` [doesn't support](https://github.com/RazrFalcon/roxmltree/blob/d2c7801624757d14f83ba543484479bcd56c3251/src/tokenizer.rs#L535-L547) external entities (it parses them, but doesn't do anything with them), so it is not vulnerable to XXE attacks. Additionally, `roxmltree` has [some safeguards](https://github.com/RazrFalcon/roxmltree/blob/d2c7801624757d14f83ba543484479bcd56c3251/src/parse.rs#L424-L452) in place to prevent exponential entity expansion, so enabling DTDs by default is relatively safe. The worst case is no worse than running `loop {}`, so I think allowing DTDs by default is best, and DTDs can still be disabled with `--disallow-dtd` if needed. # User-Facing Changes * Allows `from xml` to parse XML documents with [document type declarations](https://en.wikipedia.org/wiki/Document_type_declaration) by default, and adds a `--disallow-dtd` flag to disallow parsing documents with DTDs. This PR also improves the errors in `from xml` by pointing at the issue in the XML source. Example: ``` $ open --raw foo.xml | from xml Error: × Failed to parse XML ╭─[2:7] 1 │ 2 │ hi

· ▲ · ╰── Unexpected character <, expected a whitespace 3 │ ╰──── ``` # Tests + Formatting N/A # After Submitting N/A --- .../src/env/config/config_flatten.rs | 22 +-- crates/nu-command/src/formats/from/json.rs | 24 +-- crates/nu-command/src/formats/from/xml.rs | 163 +++++++++++------- crates/nu-protocol/src/span.rs | 22 +++ 4 files changed, 123 insertions(+), 108 deletions(-) diff --git a/crates/nu-command/src/env/config/config_flatten.rs b/crates/nu-command/src/env/config/config_flatten.rs index ee41897466..c720b27f70 100644 --- a/crates/nu-command/src/env/config/config_flatten.rs +++ b/crates/nu-command/src/env/config/config_flatten.rs @@ -72,7 +72,7 @@ fn convert_string_to_value( Err(x) => match x { nu_json::Error::Syntax(_, row, col) => { let label = x.to_string(); - let label_span = convert_row_column_to_span(row, col, string_input); + let label_span = Span::from_row_column(row, col, string_input); Err(ShellError::GenericError { error: "Error while parsing JSON text".into(), msg: "error parsing JSON text".into(), @@ -173,23 +173,3 @@ fn expand_closure( _ => None, } } - -// Converts row+column to a Span, assuming bytes (1-based rows) -fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span { - let mut cur_row = 1; - let mut cur_col = 1; - - for (offset, curr_byte) in contents.bytes().enumerate() { - if curr_byte == b'\n' { - cur_row += 1; - cur_col = 1; - } - if cur_row >= row && cur_col >= col { - return Span::new(offset, offset); - } else { - cur_col += 1; - } - } - - Span::new(contents.len(), contents.len()) -} diff --git a/crates/nu-command/src/formats/from/json.rs b/crates/nu-command/src/formats/from/json.rs index 17a632d949..81a8175cca 100644 --- a/crates/nu-command/src/formats/from/json.rs +++ b/crates/nu-command/src/formats/from/json.rs @@ -184,26 +184,6 @@ fn convert_nujson_to_value(value: nu_json::Value, span: Span) -> Value { } } -// Converts row+column to a Span, assuming bytes (1-based rows) -fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span { - let mut cur_row = 1; - let mut cur_col = 1; - - for (offset, curr_byte) in contents.bytes().enumerate() { - if curr_byte == b'\n' { - cur_row += 1; - cur_col = 1; - } - if cur_row >= row && cur_col >= col { - return Span::new(offset, offset); - } else { - cur_col += 1; - } - } - - Span::new(contents.len(), contents.len()) -} - fn convert_string_to_value(string_input: &str, span: Span) -> Result { match nu_json::from_str(string_input) { Ok(value) => Ok(convert_nujson_to_value(value, span)), @@ -211,7 +191,7 @@ fn convert_string_to_value(string_input: &str, span: Span) -> Result match x { nu_json::Error::Syntax(_, row, col) => { let label = x.to_string(); - let label_span = convert_row_column_to_span(row, col, string_input); + let label_span = Span::from_row_column(row, col, string_input); Err(ShellError::GenericError { error: "Error while parsing JSON text".into(), msg: "error parsing JSON text".into(), @@ -240,7 +220,7 @@ fn convert_string_to_value_strict(string_input: &str, span: Span) -> Result Ok(convert_nujson_to_value(value, span)), Err(err) => Err(if err.is_syntax() { let label = err.to_string(); - let label_span = convert_row_column_to_span(err.line(), err.column(), string_input); + let label_span = Span::from_row_column(err.line(), err.column(), string_input); ShellError::GenericError { error: "Error while parsing JSON text".into(), msg: "error parsing JSON text".into(), diff --git a/crates/nu-command/src/formats/from/xml.rs b/crates/nu-command/src/formats/from/xml.rs index 70c1048972..0c63d97c93 100644 --- a/crates/nu-command/src/formats/from/xml.rs +++ b/crates/nu-command/src/formats/from/xml.rs @@ -2,7 +2,7 @@ use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLU use indexmap::IndexMap; use nu_engine::command_prelude::*; -use roxmltree::NodeType; +use roxmltree::{NodeType, ParsingOptions, TextPos}; #[derive(Clone)] pub struct FromXml; @@ -16,6 +16,11 @@ impl Command for FromXml { Signature::build("from xml") .input_output_types(vec![(Type::String, Type::record())]) .switch("keep-comments", "add comment nodes to result", None) + .switch( + "disallow-dtd", + "disallow parsing documents with DTDs (prevents exponential entity expansion attacks)", + None, + ) .switch( "keep-pi", "add processing instruction nodes to result", @@ -50,10 +55,12 @@ string. This way content of every tag is always a table and is easier to parse"# let head = call.head; let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?; let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?; + let allow_dtd = !call.has_flag(engine_state, stack, "disallow-dtd")?; let info = ParsingInfo { span: head, keep_comments, keep_processing_instructions, + allow_dtd, }; from_xml(input, &info) } @@ -90,6 +97,7 @@ struct ParsingInfo { span: Span, keep_comments: bool, keep_processing_instructions: bool, + allow_dtd: bool, } fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value { @@ -198,7 +206,12 @@ fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value } fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result { - let parsed = roxmltree::Document::parse(s)?; + let options = ParsingOptions { + allow_dtd: info.allow_dtd, + ..Default::default() + }; + + let parsed = roxmltree::Document::parse_with_options(s, options)?; Ok(from_document_to_value(&parsed, info)) } @@ -209,116 +222,135 @@ fn from_xml(input: PipelineData, info: &ParsingInfo) -> Result { Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None)))) } - Err(err) => Err(process_xml_parse_error(err, span)), + Err(err) => Err(process_xml_parse_error(concat_string, err, span)), } } -fn process_xml_parse_error(err: roxmltree::Error, span: Span) -> ShellError { +fn process_xml_parse_error(source: String, err: roxmltree::Error, span: Span) -> ShellError { match err { - roxmltree::Error::InvalidXmlPrefixUri(_) => make_cant_convert_error( + roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_error_spanned( "The `xmlns:xml` attribute must have an URI.", - span, + source, pos, ), - roxmltree::Error::UnexpectedXmlUri(_) => make_cant_convert_error( + roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_error_spanned( "Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace URI.", - span, + source, pos, ), - roxmltree::Error::UnexpectedXmlnsUri(_) => make_cant_convert_error( + roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_error_spanned( "The http://www.w3.org/2000/xmlns/ URI must not be declared.", - span, + source, pos, ), - roxmltree::Error::InvalidElementNamePrefix(_) => { - make_cant_convert_error("xmlns can't be used as an element prefix.", span) + roxmltree::Error::InvalidElementNamePrefix(pos) => { + make_xml_error_spanned("xmlns can't be used as an element prefix.", source, pos) } - roxmltree::Error::DuplicatedNamespace(_, _) => { - make_cant_convert_error("A namespace was already defined on this element.", span) + roxmltree::Error::DuplicatedNamespace(namespace, pos) => { + make_xml_error_spanned(format!("Namespace {namespace} was already defined on this element."), source, pos) } - roxmltree::Error::UnknownNamespace(prefix, _) => { - make_cant_convert_error(format!("Unknown prefix {}", prefix), span) + roxmltree::Error::UnknownNamespace(prefix, pos) => { + make_xml_error_spanned(format!("Unknown prefix {}", prefix), source, pos) } - roxmltree::Error::UnexpectedCloseTag { .. } => { - make_cant_convert_error("Unexpected close tag", span) + roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => { + make_xml_error_spanned(format!("Unexpected close tag {actual}, expected {expected}"), source, pos) } - roxmltree::Error::UnexpectedEntityCloseTag(_) => { - make_cant_convert_error("Entity value starts with a close tag.", span) + roxmltree::Error::UnexpectedEntityCloseTag(pos) => { + make_xml_error_spanned("Entity value starts with a close tag.", source, pos) } - roxmltree::Error::UnknownEntityReference(_, _) => make_cant_convert_error( - "A reference to an entity that was not defined in the DTD.", - span, + roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_error_spanned( + format!("Reference to unknown entity {entity} (was not defined in the DTD)"), + source, pos, ), - roxmltree::Error::MalformedEntityReference(_) => { - make_cant_convert_error("A malformed entity reference.", span) + roxmltree::Error::MalformedEntityReference(pos) => { + make_xml_error_spanned("Malformed entity reference.", source, pos) } - roxmltree::Error::EntityReferenceLoop(_) => { - make_cant_convert_error("A possible entity reference loop.", span) + roxmltree::Error::EntityReferenceLoop(pos) => { + make_xml_error_spanned("Possible entity reference loop.", source, pos) } - roxmltree::Error::InvalidAttributeValue(_) => { - make_cant_convert_error("Attribute value cannot have a < character.", span) + roxmltree::Error::InvalidAttributeValue(pos) => { + make_xml_error_spanned("Attribute value cannot have a < character.", source, pos) } - roxmltree::Error::DuplicatedAttribute(_, _) => { - make_cant_convert_error("An element has a duplicated attributes.", span) + roxmltree::Error::DuplicatedAttribute(attribute, pos) => { + make_xml_error_spanned(format!("Element has a duplicated attribute: {attribute}"), source, pos) } roxmltree::Error::NoRootNode => { - make_cant_convert_error("The XML document must have at least one element.", span) + make_xml_error("The XML document must have at least one element.", span) } roxmltree::Error::UnclosedRootNode => { - make_cant_convert_error("The root node was opened but never closed.", span) + make_xml_error("The root node was opened but never closed.", span) } - roxmltree::Error::DtdDetected => make_cant_convert_error( - "An XML with DTD detected. DTDs are currently disabled due to security reasons.", - span, + roxmltree::Error::DtdDetected => make_xml_error( + "XML document with DTD detected.", + span ), roxmltree::Error::NodesLimitReached => { - make_cant_convert_error("Node limit was reached.", span) + make_xml_error("Node limit was reached.", span) } roxmltree::Error::AttributesLimitReached => { - make_cant_convert_error("Attribute limit reached", span) + make_xml_error("Attribute limit reached", span) } roxmltree::Error::NamespacesLimitReached => { - make_cant_convert_error("Namespace limit reached", span) + make_xml_error("Namespace limit reached", span) } - roxmltree::Error::UnexpectedDeclaration(_) => { - make_cant_convert_error("An XML document can have only one XML declaration and it must be at the start of the document.", span) + roxmltree::Error::UnexpectedDeclaration(pos) => { + make_xml_error_spanned("An XML document can have only one XML declaration and it must be at the start of the document.", source, pos) } - roxmltree::Error::InvalidName(_) => { - make_cant_convert_error("Invalid name found.", span) + roxmltree::Error::InvalidName(pos) => { + make_xml_error_spanned("Invalid name.", source, pos) } - roxmltree::Error::NonXmlChar(_, _) => { - make_cant_convert_error("A non-XML character has occurred. Valid characters are: ", span) + roxmltree::Error::NonXmlChar(_, pos) => { + make_xml_error_spanned("Non-XML character found. Valid characters are: ", source, pos) } - roxmltree::Error::InvalidChar(_, _, _) => { - make_cant_convert_error("An invalid/unexpected character in XML.", span) + roxmltree::Error::InvalidChar(expected, actual, pos) => { + make_xml_error_spanned( + format!("Unexpected character {}, expected {}", actual as char, expected as char), + source, + pos + ) } - roxmltree::Error::InvalidChar2(_, _, _) => { - make_cant_convert_error("An invalid/unexpected character in XML.", span) + roxmltree::Error::InvalidChar2(expected, actual, pos) => { + make_xml_error_spanned( + format!("Unexpected character {}, expected {}", actual as char, expected), + source, + pos + ) } - roxmltree::Error::InvalidString(_, _) => { - make_cant_convert_error("An invalid/unexpected string in XML.", span) + roxmltree::Error::InvalidString(_, pos) => { + make_xml_error_spanned("Invalid/unexpected string in XML.", source, pos) } - roxmltree::Error::InvalidExternalID(_) => { - make_cant_convert_error("An invalid ExternalID in the DTD.", span) + roxmltree::Error::InvalidExternalID(pos) => { + make_xml_error_spanned("Invalid ExternalID in the DTD.", source, pos) } - roxmltree::Error::InvalidComment(_) => { - make_cant_convert_error("A comment cannot contain `--` or end with `-`.", span) + roxmltree::Error::InvalidComment(pos) => { + make_xml_error_spanned("A comment cannot contain `--` or end with `-`.", source, pos) } - roxmltree::Error::InvalidCharacterData(_) => { - make_cant_convert_error("A Character Data node contains an invalid data. Currently, only `]]>` is not allowed.", span) + roxmltree::Error::InvalidCharacterData(pos) => { + make_xml_error_spanned("Character Data node contains an invalid data. Currently, only `]]>` is not allowed.", source, pos) } - roxmltree::Error::UnknownToken(_) => { - make_cant_convert_error("Unknown token in XML.", span) + roxmltree::Error::UnknownToken(pos) => { + make_xml_error_spanned("Unknown token in XML.", source, pos) } roxmltree::Error::UnexpectedEndOfStream => { - make_cant_convert_error("Unexpected end of stream while parsing XML.", span) + make_xml_error("Unexpected end of stream while parsing XML.", span) } } } -fn make_cant_convert_error(help: impl Into, span: Span) -> ShellError { - ShellError::CantConvert { - from_type: Type::String.to_string(), - to_type: "XML".to_string(), +fn make_xml_error(msg: impl Into, span: Span) -> ShellError { + ShellError::GenericError { + error: "Failed to parse XML".into(), + msg: msg.into(), + help: None, + span: Some(span), + inner: vec![], + } +} + +fn make_xml_error_spanned(msg: impl Into, src: String, pos: TextPos) -> ShellError { + let span = Span::from_row_column(pos.row as usize, pos.col as usize, &src); + ShellError::OutsideSpannedLabeledError { + src, + error: "Failed to parse XML".into(), + msg: msg.into(), span, - help: Some(help.into()), } } @@ -375,6 +407,7 @@ mod tests { span: Span::test_data(), keep_comments: false, keep_processing_instructions: false, + allow_dtd: false, }; from_xml_string_to_value(xml, &info) } diff --git a/crates/nu-protocol/src/span.rs b/crates/nu-protocol/src/span.rs index 77e5bab681..7eeaffe04b 100644 --- a/crates/nu-protocol/src/span.rs +++ b/crates/nu-protocol/src/span.rs @@ -148,6 +148,28 @@ impl Span { } } + /// Converts row and column in a String to a Span, assuming bytes (1-based rows) + pub fn from_row_column(row: usize, col: usize, contents: &str) -> Span { + let mut cur_row = 1; + let mut cur_col = 1; + + for (offset, curr_byte) in contents.bytes().enumerate() { + if curr_byte == b'\n' { + cur_row += 1; + cur_col = 1; + } else if cur_row >= row && cur_col >= col { + return Span::new(offset, offset); + } else { + cur_col += 1; + } + } + + Self { + start: contents.len(), + end: contents.len(), + } + } + /// Returns the minimal [`Span`] that encompasses both of the given spans. /// /// The two `Spans` can overlap in the middle,