mirror of
https://github.com/nushell/nushell.git
synced 2025-05-06 11:04:24 +02:00
Parse XML documents with DTDs by default, and add --disallow-dtd
flag (#15272)
<!-- if this PR closes one or more issues, you can automatically link the PR with them by using one of the [*linking keywords*](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword), e.g. - this PR should close #xxxx - fixes #xxxx you can also mention related issues, PRs or discussions! --> # Description <!-- Thank you for improving Nushell. Please, check our [contributing guide](../CONTRIBUTING.md) and talk to the core team before making major changes. Description of your pull request goes here. **Provide examples and/or screenshots** if your changes affect the user experience. --> This PR allows `from xml` to parse XML documents with [document type declarations](https://en.wikipedia.org/wiki/Document_type_declaration) by default. This is especially notable since many HTML documents start with `<!DOCTYPE html>`, and `roxmltree` should be able to parse some simple HTML documents. The security concerns with DTDs are [XXE attacks](https://en.wikipedia.org/wiki/XML_external_entity_attack), and [exponential entity expansion attacks](https://en.wikipedia.org/wiki/Billion_laughs_attack). `roxmltree` [doesn't support](https://github.com/RazrFalcon/roxmltree/blob/d2c7801624/src/tokenizer.rs#L535-L547)
external entities (it parses them, but doesn't do anything with them), so it is not vulnerable to XXE attacks. Additionally, `roxmltree` has [some safeguards](https://github.com/RazrFalcon/roxmltree/blob/d2c7801624/src/parse.rs#L424-L452)
in place to prevent exponential entity expansion, so enabling DTDs by default is relatively safe. The worst case is no worse than running `loop {}`, so I think allowing DTDs by default is best, and DTDs can still be disabled with `--disallow-dtd` if needed. # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> * Allows `from xml` to parse XML documents with [document type declarations](https://en.wikipedia.org/wiki/Document_type_declaration) by default, and adds a `--disallow-dtd` flag to disallow parsing documents with DTDs. This PR also improves the errors in `from xml` by pointing at the issue in the XML source. Example: ``` $ open --raw foo.xml | from xml Error: × Failed to parse XML ╭─[2:7] 1 │ <html> 2 │ <p<>hi</p> · ▲ · ╰── Unexpected character <, expected a whitespace 3 │ </html> ╰──── ``` # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> N/A # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> N/A
This commit is contained in:
parent
1e566adcfc
commit
430b2746b8
@ -72,7 +72,7 @@ fn convert_string_to_value(
|
||||
Err(x) => match x {
|
||||
nu_json::Error::Syntax(_, row, col) => {
|
||||
let label = x.to_string();
|
||||
let label_span = convert_row_column_to_span(row, col, string_input);
|
||||
let label_span = Span::from_row_column(row, col, string_input);
|
||||
Err(ShellError::GenericError {
|
||||
error: "Error while parsing JSON text".into(),
|
||||
msg: "error parsing JSON text".into(),
|
||||
@ -173,23 +173,3 @@ fn expand_closure(
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
// Converts row+column to a Span, assuming bytes (1-based rows)
|
||||
fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span {
|
||||
let mut cur_row = 1;
|
||||
let mut cur_col = 1;
|
||||
|
||||
for (offset, curr_byte) in contents.bytes().enumerate() {
|
||||
if curr_byte == b'\n' {
|
||||
cur_row += 1;
|
||||
cur_col = 1;
|
||||
}
|
||||
if cur_row >= row && cur_col >= col {
|
||||
return Span::new(offset, offset);
|
||||
} else {
|
||||
cur_col += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Span::new(contents.len(), contents.len())
|
||||
}
|
||||
|
@ -184,26 +184,6 @@ fn convert_nujson_to_value(value: nu_json::Value, span: Span) -> Value {
|
||||
}
|
||||
}
|
||||
|
||||
// Converts row+column to a Span, assuming bytes (1-based rows)
|
||||
fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span {
|
||||
let mut cur_row = 1;
|
||||
let mut cur_col = 1;
|
||||
|
||||
for (offset, curr_byte) in contents.bytes().enumerate() {
|
||||
if curr_byte == b'\n' {
|
||||
cur_row += 1;
|
||||
cur_col = 1;
|
||||
}
|
||||
if cur_row >= row && cur_col >= col {
|
||||
return Span::new(offset, offset);
|
||||
} else {
|
||||
cur_col += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Span::new(contents.len(), contents.len())
|
||||
}
|
||||
|
||||
fn convert_string_to_value(string_input: &str, span: Span) -> Result<Value, ShellError> {
|
||||
match nu_json::from_str(string_input) {
|
||||
Ok(value) => Ok(convert_nujson_to_value(value, span)),
|
||||
@ -211,7 +191,7 @@ fn convert_string_to_value(string_input: &str, span: Span) -> Result<Value, Shel
|
||||
Err(x) => match x {
|
||||
nu_json::Error::Syntax(_, row, col) => {
|
||||
let label = x.to_string();
|
||||
let label_span = convert_row_column_to_span(row, col, string_input);
|
||||
let label_span = Span::from_row_column(row, col, string_input);
|
||||
Err(ShellError::GenericError {
|
||||
error: "Error while parsing JSON text".into(),
|
||||
msg: "error parsing JSON text".into(),
|
||||
@ -240,7 +220,7 @@ fn convert_string_to_value_strict(string_input: &str, span: Span) -> Result<Valu
|
||||
Ok(value) => Ok(convert_nujson_to_value(value, span)),
|
||||
Err(err) => Err(if err.is_syntax() {
|
||||
let label = err.to_string();
|
||||
let label_span = convert_row_column_to_span(err.line(), err.column(), string_input);
|
||||
let label_span = Span::from_row_column(err.line(), err.column(), string_input);
|
||||
ShellError::GenericError {
|
||||
error: "Error while parsing JSON text".into(),
|
||||
msg: "error parsing JSON text".into(),
|
||||
|
@ -2,7 +2,7 @@ use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLU
|
||||
use indexmap::IndexMap;
|
||||
use nu_engine::command_prelude::*;
|
||||
|
||||
use roxmltree::NodeType;
|
||||
use roxmltree::{NodeType, ParsingOptions, TextPos};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FromXml;
|
||||
@ -16,6 +16,11 @@ impl Command for FromXml {
|
||||
Signature::build("from xml")
|
||||
.input_output_types(vec![(Type::String, Type::record())])
|
||||
.switch("keep-comments", "add comment nodes to result", None)
|
||||
.switch(
|
||||
"disallow-dtd",
|
||||
"disallow parsing documents with DTDs (prevents exponential entity expansion attacks)",
|
||||
None,
|
||||
)
|
||||
.switch(
|
||||
"keep-pi",
|
||||
"add processing instruction nodes to result",
|
||||
@ -50,10 +55,12 @@ string. This way content of every tag is always a table and is easier to parse"#
|
||||
let head = call.head;
|
||||
let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
|
||||
let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
|
||||
let allow_dtd = !call.has_flag(engine_state, stack, "disallow-dtd")?;
|
||||
let info = ParsingInfo {
|
||||
span: head,
|
||||
keep_comments,
|
||||
keep_processing_instructions,
|
||||
allow_dtd,
|
||||
};
|
||||
from_xml(input, &info)
|
||||
}
|
||||
@ -90,6 +97,7 @@ struct ParsingInfo {
|
||||
span: Span,
|
||||
keep_comments: bool,
|
||||
keep_processing_instructions: bool,
|
||||
allow_dtd: bool,
|
||||
}
|
||||
|
||||
fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
|
||||
@ -198,7 +206,12 @@ fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value
|
||||
}
|
||||
|
||||
fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
|
||||
let parsed = roxmltree::Document::parse(s)?;
|
||||
let options = ParsingOptions {
|
||||
allow_dtd: info.allow_dtd,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let parsed = roxmltree::Document::parse_with_options(s, options)?;
|
||||
Ok(from_document_to_value(&parsed, info))
|
||||
}
|
||||
|
||||
@ -209,116 +222,135 @@ fn from_xml(input: PipelineData, info: &ParsingInfo) -> Result<PipelineData, She
|
||||
Ok(x) => {
|
||||
Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
|
||||
}
|
||||
Err(err) => Err(process_xml_parse_error(err, span)),
|
||||
Err(err) => Err(process_xml_parse_error(concat_string, err, span)),
|
||||
}
|
||||
}
|
||||
|
||||
fn process_xml_parse_error(err: roxmltree::Error, span: Span) -> ShellError {
|
||||
fn process_xml_parse_error(source: String, err: roxmltree::Error, span: Span) -> ShellError {
|
||||
match err {
|
||||
roxmltree::Error::InvalidXmlPrefixUri(_) => make_cant_convert_error(
|
||||
roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_error_spanned(
|
||||
"The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
|
||||
span,
|
||||
source, pos,
|
||||
),
|
||||
roxmltree::Error::UnexpectedXmlUri(_) => make_cant_convert_error(
|
||||
roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_error_spanned(
|
||||
"Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace URI.",
|
||||
span,
|
||||
source, pos,
|
||||
),
|
||||
roxmltree::Error::UnexpectedXmlnsUri(_) => make_cant_convert_error(
|
||||
roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_error_spanned(
|
||||
"The http://www.w3.org/2000/xmlns/ URI must not be declared.",
|
||||
span,
|
||||
source, pos,
|
||||
),
|
||||
roxmltree::Error::InvalidElementNamePrefix(_) => {
|
||||
make_cant_convert_error("xmlns can't be used as an element prefix.", span)
|
||||
roxmltree::Error::InvalidElementNamePrefix(pos) => {
|
||||
make_xml_error_spanned("xmlns can't be used as an element prefix.", source, pos)
|
||||
}
|
||||
roxmltree::Error::DuplicatedNamespace(_, _) => {
|
||||
make_cant_convert_error("A namespace was already defined on this element.", span)
|
||||
roxmltree::Error::DuplicatedNamespace(namespace, pos) => {
|
||||
make_xml_error_spanned(format!("Namespace {namespace} was already defined on this element."), source, pos)
|
||||
}
|
||||
roxmltree::Error::UnknownNamespace(prefix, _) => {
|
||||
make_cant_convert_error(format!("Unknown prefix {}", prefix), span)
|
||||
roxmltree::Error::UnknownNamespace(prefix, pos) => {
|
||||
make_xml_error_spanned(format!("Unknown prefix {}", prefix), source, pos)
|
||||
}
|
||||
roxmltree::Error::UnexpectedCloseTag { .. } => {
|
||||
make_cant_convert_error("Unexpected close tag", span)
|
||||
roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => {
|
||||
make_xml_error_spanned(format!("Unexpected close tag {actual}, expected {expected}"), source, pos)
|
||||
}
|
||||
roxmltree::Error::UnexpectedEntityCloseTag(_) => {
|
||||
make_cant_convert_error("Entity value starts with a close tag.", span)
|
||||
roxmltree::Error::UnexpectedEntityCloseTag(pos) => {
|
||||
make_xml_error_spanned("Entity value starts with a close tag.", source, pos)
|
||||
}
|
||||
roxmltree::Error::UnknownEntityReference(_, _) => make_cant_convert_error(
|
||||
"A reference to an entity that was not defined in the DTD.",
|
||||
span,
|
||||
roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_error_spanned(
|
||||
format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
|
||||
source, pos,
|
||||
),
|
||||
roxmltree::Error::MalformedEntityReference(_) => {
|
||||
make_cant_convert_error("A malformed entity reference.", span)
|
||||
roxmltree::Error::MalformedEntityReference(pos) => {
|
||||
make_xml_error_spanned("Malformed entity reference.", source, pos)
|
||||
}
|
||||
roxmltree::Error::EntityReferenceLoop(_) => {
|
||||
make_cant_convert_error("A possible entity reference loop.", span)
|
||||
roxmltree::Error::EntityReferenceLoop(pos) => {
|
||||
make_xml_error_spanned("Possible entity reference loop.", source, pos)
|
||||
}
|
||||
roxmltree::Error::InvalidAttributeValue(_) => {
|
||||
make_cant_convert_error("Attribute value cannot have a < character.", span)
|
||||
roxmltree::Error::InvalidAttributeValue(pos) => {
|
||||
make_xml_error_spanned("Attribute value cannot have a < character.", source, pos)
|
||||
}
|
||||
roxmltree::Error::DuplicatedAttribute(_, _) => {
|
||||
make_cant_convert_error("An element has a duplicated attributes.", span)
|
||||
roxmltree::Error::DuplicatedAttribute(attribute, pos) => {
|
||||
make_xml_error_spanned(format!("Element has a duplicated attribute: {attribute}"), source, pos)
|
||||
}
|
||||
roxmltree::Error::NoRootNode => {
|
||||
make_cant_convert_error("The XML document must have at least one element.", span)
|
||||
make_xml_error("The XML document must have at least one element.", span)
|
||||
}
|
||||
roxmltree::Error::UnclosedRootNode => {
|
||||
make_cant_convert_error("The root node was opened but never closed.", span)
|
||||
make_xml_error("The root node was opened but never closed.", span)
|
||||
}
|
||||
roxmltree::Error::DtdDetected => make_cant_convert_error(
|
||||
"An XML with DTD detected. DTDs are currently disabled due to security reasons.",
|
||||
span,
|
||||
roxmltree::Error::DtdDetected => make_xml_error(
|
||||
"XML document with DTD detected.",
|
||||
span
|
||||
),
|
||||
roxmltree::Error::NodesLimitReached => {
|
||||
make_cant_convert_error("Node limit was reached.", span)
|
||||
make_xml_error("Node limit was reached.", span)
|
||||
}
|
||||
roxmltree::Error::AttributesLimitReached => {
|
||||
make_cant_convert_error("Attribute limit reached", span)
|
||||
make_xml_error("Attribute limit reached", span)
|
||||
}
|
||||
roxmltree::Error::NamespacesLimitReached => {
|
||||
make_cant_convert_error("Namespace limit reached", span)
|
||||
make_xml_error("Namespace limit reached", span)
|
||||
}
|
||||
roxmltree::Error::UnexpectedDeclaration(_) => {
|
||||
make_cant_convert_error("An XML document can have only one XML declaration and it must be at the start of the document.", span)
|
||||
roxmltree::Error::UnexpectedDeclaration(pos) => {
|
||||
make_xml_error_spanned("An XML document can have only one XML declaration and it must be at the start of the document.", source, pos)
|
||||
}
|
||||
roxmltree::Error::InvalidName(_) => {
|
||||
make_cant_convert_error("Invalid name found.", span)
|
||||
roxmltree::Error::InvalidName(pos) => {
|
||||
make_xml_error_spanned("Invalid name.", source, pos)
|
||||
}
|
||||
roxmltree::Error::NonXmlChar(_, _) => {
|
||||
make_cant_convert_error("A non-XML character has occurred. Valid characters are: <https://www.w3.org/TR/xml/#char32>", span)
|
||||
roxmltree::Error::NonXmlChar(_, pos) => {
|
||||
make_xml_error_spanned("Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>", source, pos)
|
||||
}
|
||||
roxmltree::Error::InvalidChar(_, _, _) => {
|
||||
make_cant_convert_error("An invalid/unexpected character in XML.", span)
|
||||
roxmltree::Error::InvalidChar(expected, actual, pos) => {
|
||||
make_xml_error_spanned(
|
||||
format!("Unexpected character {}, expected {}", actual as char, expected as char),
|
||||
source,
|
||||
pos
|
||||
)
|
||||
}
|
||||
roxmltree::Error::InvalidChar2(_, _, _) => {
|
||||
make_cant_convert_error("An invalid/unexpected character in XML.", span)
|
||||
roxmltree::Error::InvalidChar2(expected, actual, pos) => {
|
||||
make_xml_error_spanned(
|
||||
format!("Unexpected character {}, expected {}", actual as char, expected),
|
||||
source,
|
||||
pos
|
||||
)
|
||||
}
|
||||
roxmltree::Error::InvalidString(_, _) => {
|
||||
make_cant_convert_error("An invalid/unexpected string in XML.", span)
|
||||
roxmltree::Error::InvalidString(_, pos) => {
|
||||
make_xml_error_spanned("Invalid/unexpected string in XML.", source, pos)
|
||||
}
|
||||
roxmltree::Error::InvalidExternalID(_) => {
|
||||
make_cant_convert_error("An invalid ExternalID in the DTD.", span)
|
||||
roxmltree::Error::InvalidExternalID(pos) => {
|
||||
make_xml_error_spanned("Invalid ExternalID in the DTD.", source, pos)
|
||||
}
|
||||
roxmltree::Error::InvalidComment(_) => {
|
||||
make_cant_convert_error("A comment cannot contain `--` or end with `-`.", span)
|
||||
roxmltree::Error::InvalidComment(pos) => {
|
||||
make_xml_error_spanned("A comment cannot contain `--` or end with `-`.", source, pos)
|
||||
}
|
||||
roxmltree::Error::InvalidCharacterData(_) => {
|
||||
make_cant_convert_error("A Character Data node contains an invalid data. Currently, only `]]>` is not allowed.", span)
|
||||
roxmltree::Error::InvalidCharacterData(pos) => {
|
||||
make_xml_error_spanned("Character Data node contains an invalid data. Currently, only `]]>` is not allowed.", source, pos)
|
||||
}
|
||||
roxmltree::Error::UnknownToken(_) => {
|
||||
make_cant_convert_error("Unknown token in XML.", span)
|
||||
roxmltree::Error::UnknownToken(pos) => {
|
||||
make_xml_error_spanned("Unknown token in XML.", source, pos)
|
||||
}
|
||||
roxmltree::Error::UnexpectedEndOfStream => {
|
||||
make_cant_convert_error("Unexpected end of stream while parsing XML.", span)
|
||||
make_xml_error("Unexpected end of stream while parsing XML.", span)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn make_cant_convert_error(help: impl Into<String>, span: Span) -> ShellError {
|
||||
ShellError::CantConvert {
|
||||
from_type: Type::String.to_string(),
|
||||
to_type: "XML".to_string(),
|
||||
fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
|
||||
ShellError::GenericError {
|
||||
error: "Failed to parse XML".into(),
|
||||
msg: msg.into(),
|
||||
help: None,
|
||||
span: Some(span),
|
||||
inner: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
fn make_xml_error_spanned(msg: impl Into<String>, src: String, pos: TextPos) -> ShellError {
|
||||
let span = Span::from_row_column(pos.row as usize, pos.col as usize, &src);
|
||||
ShellError::OutsideSpannedLabeledError {
|
||||
src,
|
||||
error: "Failed to parse XML".into(),
|
||||
msg: msg.into(),
|
||||
span,
|
||||
help: Some(help.into()),
|
||||
}
|
||||
}
|
||||
|
||||
@ -375,6 +407,7 @@ mod tests {
|
||||
span: Span::test_data(),
|
||||
keep_comments: false,
|
||||
keep_processing_instructions: false,
|
||||
allow_dtd: false,
|
||||
};
|
||||
from_xml_string_to_value(xml, &info)
|
||||
}
|
||||
|
@ -148,6 +148,28 @@ impl Span {
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts row and column in a String to a Span, assuming bytes (1-based rows)
|
||||
pub fn from_row_column(row: usize, col: usize, contents: &str) -> Span {
|
||||
let mut cur_row = 1;
|
||||
let mut cur_col = 1;
|
||||
|
||||
for (offset, curr_byte) in contents.bytes().enumerate() {
|
||||
if curr_byte == b'\n' {
|
||||
cur_row += 1;
|
||||
cur_col = 1;
|
||||
} else if cur_row >= row && cur_col >= col {
|
||||
return Span::new(offset, offset);
|
||||
} else {
|
||||
cur_col += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
start: contents.len(),
|
||||
end: contents.len(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimal [`Span`] that encompasses both of the given spans.
|
||||
///
|
||||
/// The two `Spans` can overlap in the middle,
|
||||
|
Loading…
Reference in New Issue
Block a user