Parse XML documents with DTDs by default, and add --disallow-dtd flag (#15272)

<!--
if this PR closes one or more issues, you can automatically link the PR
with
them by using one of the [*linking
keywords*](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword),
e.g.
- this PR should close #xxxx
- fixes #xxxx

you can also mention related issues, PRs or discussions!
-->

# Description
<!--
Thank you for improving Nushell. Please, check our [contributing
guide](../CONTRIBUTING.md) and talk to the core team before making major
changes.

Description of your pull request goes here. **Provide examples and/or
screenshots** if your changes affect the user experience.
-->
This PR allows `from xml` to parse XML documents with [document type
declarations](https://en.wikipedia.org/wiki/Document_type_declaration)
by default. This is especially notable since many HTML documents start
with `<!DOCTYPE html>`, and `roxmltree` should be able to parse some
simple HTML documents. The security concerns with DTDs are [XXE
attacks](https://en.wikipedia.org/wiki/XML_external_entity_attack), and
[exponential entity expansion
attacks](https://en.wikipedia.org/wiki/Billion_laughs_attack).
`roxmltree` [doesn't
support](d2c7801624/src/tokenizer.rs (L535-L547))
external entities (it parses them, but doesn't do anything with them),
so it is not vulnerable to XXE attacks. Additionally, `roxmltree` has
[some
safeguards](d2c7801624/src/parse.rs (L424-L452))
in place to prevent exponential entity expansion, so enabling DTDs by
default is relatively safe. The worst case is no worse than running
`loop {}`, so I think allowing DTDs by default is best, and DTDs can
still be disabled with `--disallow-dtd` if needed.

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
* Allows `from xml` to parse XML documents with [document type
declarations](https://en.wikipedia.org/wiki/Document_type_declaration)
by default, and adds a `--disallow-dtd` flag to disallow parsing
documents with DTDs.

This PR also improves the errors in `from xml` by pointing at the issue
in the XML source. Example:

```
$ open --raw foo.xml | from xml 
Error:   × Failed to parse XML
   ╭─[2:7]
 1 │ <html>
 2 │     <p<>hi</p>
   ·       ▲
   ·       ╰── Unexpected character <, expected a whitespace
 3 │ </html>
   ╰────
```

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
N/A

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
N/A
This commit is contained in:
132ikl 2025-03-12 09:09:55 -04:00 committed by GitHub
parent 1e566adcfc
commit 430b2746b8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 123 additions and 108 deletions

View File

@ -72,7 +72,7 @@ fn convert_string_to_value(
Err(x) => match x {
nu_json::Error::Syntax(_, row, col) => {
let label = x.to_string();
let label_span = convert_row_column_to_span(row, col, string_input);
let label_span = Span::from_row_column(row, col, string_input);
Err(ShellError::GenericError {
error: "Error while parsing JSON text".into(),
msg: "error parsing JSON text".into(),
@ -173,23 +173,3 @@ fn expand_closure(
_ => None,
}
}
// Converts row+column to a Span, assuming bytes (1-based rows)
fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span {
let mut cur_row = 1;
let mut cur_col = 1;
for (offset, curr_byte) in contents.bytes().enumerate() {
if curr_byte == b'\n' {
cur_row += 1;
cur_col = 1;
}
if cur_row >= row && cur_col >= col {
return Span::new(offset, offset);
} else {
cur_col += 1;
}
}
Span::new(contents.len(), contents.len())
}

View File

@ -184,26 +184,6 @@ fn convert_nujson_to_value(value: nu_json::Value, span: Span) -> Value {
}
}
// Converts row+column to a Span, assuming bytes (1-based rows)
fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span {
let mut cur_row = 1;
let mut cur_col = 1;
for (offset, curr_byte) in contents.bytes().enumerate() {
if curr_byte == b'\n' {
cur_row += 1;
cur_col = 1;
}
if cur_row >= row && cur_col >= col {
return Span::new(offset, offset);
} else {
cur_col += 1;
}
}
Span::new(contents.len(), contents.len())
}
fn convert_string_to_value(string_input: &str, span: Span) -> Result<Value, ShellError> {
match nu_json::from_str(string_input) {
Ok(value) => Ok(convert_nujson_to_value(value, span)),
@ -211,7 +191,7 @@ fn convert_string_to_value(string_input: &str, span: Span) -> Result<Value, Shel
Err(x) => match x {
nu_json::Error::Syntax(_, row, col) => {
let label = x.to_string();
let label_span = convert_row_column_to_span(row, col, string_input);
let label_span = Span::from_row_column(row, col, string_input);
Err(ShellError::GenericError {
error: "Error while parsing JSON text".into(),
msg: "error parsing JSON text".into(),
@ -240,7 +220,7 @@ fn convert_string_to_value_strict(string_input: &str, span: Span) -> Result<Valu
Ok(value) => Ok(convert_nujson_to_value(value, span)),
Err(err) => Err(if err.is_syntax() {
let label = err.to_string();
let label_span = convert_row_column_to_span(err.line(), err.column(), string_input);
let label_span = Span::from_row_column(err.line(), err.column(), string_input);
ShellError::GenericError {
error: "Error while parsing JSON text".into(),
msg: "error parsing JSON text".into(),

View File

@ -2,7 +2,7 @@ use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLU
use indexmap::IndexMap;
use nu_engine::command_prelude::*;
use roxmltree::NodeType;
use roxmltree::{NodeType, ParsingOptions, TextPos};
#[derive(Clone)]
pub struct FromXml;
@ -16,6 +16,11 @@ impl Command for FromXml {
Signature::build("from xml")
.input_output_types(vec![(Type::String, Type::record())])
.switch("keep-comments", "add comment nodes to result", None)
.switch(
"disallow-dtd",
"disallow parsing documents with DTDs (prevents exponential entity expansion attacks)",
None,
)
.switch(
"keep-pi",
"add processing instruction nodes to result",
@ -50,10 +55,12 @@ string. This way content of every tag is always a table and is easier to parse"#
let head = call.head;
let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
let allow_dtd = !call.has_flag(engine_state, stack, "disallow-dtd")?;
let info = ParsingInfo {
span: head,
keep_comments,
keep_processing_instructions,
allow_dtd,
};
from_xml(input, &info)
}
@ -90,6 +97,7 @@ struct ParsingInfo {
span: Span,
keep_comments: bool,
keep_processing_instructions: bool,
allow_dtd: bool,
}
fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
@ -198,7 +206,12 @@ fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value
}
fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
let parsed = roxmltree::Document::parse(s)?;
let options = ParsingOptions {
allow_dtd: info.allow_dtd,
..Default::default()
};
let parsed = roxmltree::Document::parse_with_options(s, options)?;
Ok(from_document_to_value(&parsed, info))
}
@ -209,116 +222,135 @@ fn from_xml(input: PipelineData, info: &ParsingInfo) -> Result<PipelineData, She
Ok(x) => {
Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
}
Err(err) => Err(process_xml_parse_error(err, span)),
Err(err) => Err(process_xml_parse_error(concat_string, err, span)),
}
}
fn process_xml_parse_error(err: roxmltree::Error, span: Span) -> ShellError {
fn process_xml_parse_error(source: String, err: roxmltree::Error, span: Span) -> ShellError {
match err {
roxmltree::Error::InvalidXmlPrefixUri(_) => make_cant_convert_error(
roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_error_spanned(
"The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
span,
source, pos,
),
roxmltree::Error::UnexpectedXmlUri(_) => make_cant_convert_error(
roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_error_spanned(
"Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace URI.",
span,
source, pos,
),
roxmltree::Error::UnexpectedXmlnsUri(_) => make_cant_convert_error(
roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_error_spanned(
"The http://www.w3.org/2000/xmlns/ URI must not be declared.",
span,
source, pos,
),
roxmltree::Error::InvalidElementNamePrefix(_) => {
make_cant_convert_error("xmlns can't be used as an element prefix.", span)
roxmltree::Error::InvalidElementNamePrefix(pos) => {
make_xml_error_spanned("xmlns can't be used as an element prefix.", source, pos)
}
roxmltree::Error::DuplicatedNamespace(_, _) => {
make_cant_convert_error("A namespace was already defined on this element.", span)
roxmltree::Error::DuplicatedNamespace(namespace, pos) => {
make_xml_error_spanned(format!("Namespace {namespace} was already defined on this element."), source, pos)
}
roxmltree::Error::UnknownNamespace(prefix, _) => {
make_cant_convert_error(format!("Unknown prefix {}", prefix), span)
roxmltree::Error::UnknownNamespace(prefix, pos) => {
make_xml_error_spanned(format!("Unknown prefix {}", prefix), source, pos)
}
roxmltree::Error::UnexpectedCloseTag { .. } => {
make_cant_convert_error("Unexpected close tag", span)
roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => {
make_xml_error_spanned(format!("Unexpected close tag {actual}, expected {expected}"), source, pos)
}
roxmltree::Error::UnexpectedEntityCloseTag(_) => {
make_cant_convert_error("Entity value starts with a close tag.", span)
roxmltree::Error::UnexpectedEntityCloseTag(pos) => {
make_xml_error_spanned("Entity value starts with a close tag.", source, pos)
}
roxmltree::Error::UnknownEntityReference(_, _) => make_cant_convert_error(
"A reference to an entity that was not defined in the DTD.",
span,
roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_error_spanned(
format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
source, pos,
),
roxmltree::Error::MalformedEntityReference(_) => {
make_cant_convert_error("A malformed entity reference.", span)
roxmltree::Error::MalformedEntityReference(pos) => {
make_xml_error_spanned("Malformed entity reference.", source, pos)
}
roxmltree::Error::EntityReferenceLoop(_) => {
make_cant_convert_error("A possible entity reference loop.", span)
roxmltree::Error::EntityReferenceLoop(pos) => {
make_xml_error_spanned("Possible entity reference loop.", source, pos)
}
roxmltree::Error::InvalidAttributeValue(_) => {
make_cant_convert_error("Attribute value cannot have a < character.", span)
roxmltree::Error::InvalidAttributeValue(pos) => {
make_xml_error_spanned("Attribute value cannot have a < character.", source, pos)
}
roxmltree::Error::DuplicatedAttribute(_, _) => {
make_cant_convert_error("An element has a duplicated attributes.", span)
roxmltree::Error::DuplicatedAttribute(attribute, pos) => {
make_xml_error_spanned(format!("Element has a duplicated attribute: {attribute}"), source, pos)
}
roxmltree::Error::NoRootNode => {
make_cant_convert_error("The XML document must have at least one element.", span)
make_xml_error("The XML document must have at least one element.", span)
}
roxmltree::Error::UnclosedRootNode => {
make_cant_convert_error("The root node was opened but never closed.", span)
make_xml_error("The root node was opened but never closed.", span)
}
roxmltree::Error::DtdDetected => make_cant_convert_error(
"An XML with DTD detected. DTDs are currently disabled due to security reasons.",
span,
roxmltree::Error::DtdDetected => make_xml_error(
"XML document with DTD detected.",
span
),
roxmltree::Error::NodesLimitReached => {
make_cant_convert_error("Node limit was reached.", span)
make_xml_error("Node limit was reached.", span)
}
roxmltree::Error::AttributesLimitReached => {
make_cant_convert_error("Attribute limit reached", span)
make_xml_error("Attribute limit reached", span)
}
roxmltree::Error::NamespacesLimitReached => {
make_cant_convert_error("Namespace limit reached", span)
make_xml_error("Namespace limit reached", span)
}
roxmltree::Error::UnexpectedDeclaration(_) => {
make_cant_convert_error("An XML document can have only one XML declaration and it must be at the start of the document.", span)
roxmltree::Error::UnexpectedDeclaration(pos) => {
make_xml_error_spanned("An XML document can have only one XML declaration and it must be at the start of the document.", source, pos)
}
roxmltree::Error::InvalidName(_) => {
make_cant_convert_error("Invalid name found.", span)
roxmltree::Error::InvalidName(pos) => {
make_xml_error_spanned("Invalid name.", source, pos)
}
roxmltree::Error::NonXmlChar(_, _) => {
make_cant_convert_error("A non-XML character has occurred. Valid characters are: <https://www.w3.org/TR/xml/#char32>", span)
roxmltree::Error::NonXmlChar(_, pos) => {
make_xml_error_spanned("Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>", source, pos)
}
roxmltree::Error::InvalidChar(_, _, _) => {
make_cant_convert_error("An invalid/unexpected character in XML.", span)
roxmltree::Error::InvalidChar(expected, actual, pos) => {
make_xml_error_spanned(
format!("Unexpected character {}, expected {}", actual as char, expected as char),
source,
pos
)
}
roxmltree::Error::InvalidChar2(_, _, _) => {
make_cant_convert_error("An invalid/unexpected character in XML.", span)
roxmltree::Error::InvalidChar2(expected, actual, pos) => {
make_xml_error_spanned(
format!("Unexpected character {}, expected {}", actual as char, expected),
source,
pos
)
}
roxmltree::Error::InvalidString(_, _) => {
make_cant_convert_error("An invalid/unexpected string in XML.", span)
roxmltree::Error::InvalidString(_, pos) => {
make_xml_error_spanned("Invalid/unexpected string in XML.", source, pos)
}
roxmltree::Error::InvalidExternalID(_) => {
make_cant_convert_error("An invalid ExternalID in the DTD.", span)
roxmltree::Error::InvalidExternalID(pos) => {
make_xml_error_spanned("Invalid ExternalID in the DTD.", source, pos)
}
roxmltree::Error::InvalidComment(_) => {
make_cant_convert_error("A comment cannot contain `--` or end with `-`.", span)
roxmltree::Error::InvalidComment(pos) => {
make_xml_error_spanned("A comment cannot contain `--` or end with `-`.", source, pos)
}
roxmltree::Error::InvalidCharacterData(_) => {
make_cant_convert_error("A Character Data node contains an invalid data. Currently, only `]]>` is not allowed.", span)
roxmltree::Error::InvalidCharacterData(pos) => {
make_xml_error_spanned("Character Data node contains an invalid data. Currently, only `]]>` is not allowed.", source, pos)
}
roxmltree::Error::UnknownToken(_) => {
make_cant_convert_error("Unknown token in XML.", span)
roxmltree::Error::UnknownToken(pos) => {
make_xml_error_spanned("Unknown token in XML.", source, pos)
}
roxmltree::Error::UnexpectedEndOfStream => {
make_cant_convert_error("Unexpected end of stream while parsing XML.", span)
make_xml_error("Unexpected end of stream while parsing XML.", span)
}
}
}
fn make_cant_convert_error(help: impl Into<String>, span: Span) -> ShellError {
ShellError::CantConvert {
from_type: Type::String.to_string(),
to_type: "XML".to_string(),
fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
ShellError::GenericError {
error: "Failed to parse XML".into(),
msg: msg.into(),
help: None,
span: Some(span),
inner: vec![],
}
}
fn make_xml_error_spanned(msg: impl Into<String>, src: String, pos: TextPos) -> ShellError {
let span = Span::from_row_column(pos.row as usize, pos.col as usize, &src);
ShellError::OutsideSpannedLabeledError {
src,
error: "Failed to parse XML".into(),
msg: msg.into(),
span,
help: Some(help.into()),
}
}
@ -375,6 +407,7 @@ mod tests {
span: Span::test_data(),
keep_comments: false,
keep_processing_instructions: false,
allow_dtd: false,
};
from_xml_string_to_value(xml, &info)
}

View File

@ -148,6 +148,28 @@ impl Span {
}
}
/// Converts row and column in a String to a Span, assuming bytes (1-based rows)
pub fn from_row_column(row: usize, col: usize, contents: &str) -> Span {
let mut cur_row = 1;
let mut cur_col = 1;
for (offset, curr_byte) in contents.bytes().enumerate() {
if curr_byte == b'\n' {
cur_row += 1;
cur_col = 1;
} else if cur_row >= row && cur_col >= col {
return Span::new(offset, offset);
} else {
cur_col += 1;
}
}
Self {
start: contents.len(),
end: contents.len(),
}
}
/// Returns the minimal [`Span`] that encompasses both of the given spans.
///
/// The two `Spans` can overlap in the middle,