Strict JSON parsing (#11592)

# Description
Adds the `--strict` flag for `from json` which will try to parse text
while following the exact JSON specification (e.g., no comments or
trailing commas allowed). Fixes issue #11548.
This commit is contained in:
Ian Manske 2024-01-30 14:10:19 +00:00 committed by GitHub
parent 6530403ff8
commit 4e0a65c822
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 190 additions and 48 deletions

1
Cargo.lock generated
View File

@ -5104,6 +5104,7 @@ version = "1.0.112"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d1bd37ce2324cf3bf85e5a25f96eb4baf0d5aa6eba43e7ae8958870c4ec48ed" checksum = "4d1bd37ce2324cf3bf85e5a25f96eb4baf0d5aa6eba43e7ae8958870c4ec48ed"
dependencies = [ dependencies = [
"indexmap",
"itoa", "itoa",
"ryu", "ryu",
"serde", "serde",

View File

@ -75,7 +75,7 @@ roxmltree = "0.18"
rusqlite = { version = "0.29", features = ["bundled", "backup", "chrono"], optional = true } rusqlite = { version = "0.29", features = ["bundled", "backup", "chrono"], optional = true }
same-file = "1.0" same-file = "1.0"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0" serde_json = { version = "1.0", features = ["preserve_order"] }
serde_urlencoded = "0.7" serde_urlencoded = "0.7"
serde_yaml = "0.9" serde_yaml = "0.9"
sha2 = "0.10" sha2 = "0.10"

View File

@ -22,6 +22,7 @@ impl Command for FromJson {
Signature::build("from json") Signature::build("from json")
.input_output_types(vec![(Type::String, Type::Any)]) .input_output_types(vec![(Type::String, Type::Any)])
.switch("objects", "treat each line as a separate value", Some('o')) .switch("objects", "treat each line as a separate value", Some('o'))
.switch("strict", "follow the json specification exactly", Some('s'))
.category(Category::Formats) .category(Category::Formats)
} }
@ -42,6 +43,14 @@ impl Command for FromJson {
"b" => Value::test_list(vec![Value::test_int(1), Value::test_int(2)]), "b" => Value::test_list(vec![Value::test_int(1), Value::test_int(2)]),
})), })),
}, },
Example {
example: r#"'{ "a": 1, "b": 2 }' | from json -s"#,
description: "Parse json strictly which will error on comments and trailing commas",
result: Some(Value::test_record(record! {
"a" => Value::test_int(1),
"b" => Value::test_int(2),
})),
},
] ]
} }
@ -59,52 +68,61 @@ impl Command for FromJson {
return Ok(PipelineData::new_with_metadata(metadata, span)); return Ok(PipelineData::new_with_metadata(metadata, span));
} }
let strict = call.has_flag(engine_state, stack, "strict")?;
// TODO: turn this into a structured underline of the nu_json error // TODO: turn this into a structured underline of the nu_json error
if call.has_flag(engine_state, stack, "objects")? { if call.has_flag(engine_state, stack, "objects")? {
let converted_lines: Vec<Value> = string_input let lines = string_input.lines().filter(|line| !line.trim().is_empty());
.lines()
.filter_map(move |x| { let converted_lines: Vec<_> = if strict {
if x.trim() == "" { lines
None .map(|line| {
} else { convert_string_to_value_strict(line, span)
match convert_string_to_value(x.to_string(), span) { .unwrap_or_else(|err| Value::error(err, span))
Ok(v) => Some(v), })
Err(error) => Some(Value::error(error, span)), .collect()
} } else {
} lines
}) .map(|line| {
.collect(); convert_string_to_value(line, span)
.unwrap_or_else(|err| Value::error(err, span))
})
.collect()
};
Ok(converted_lines Ok(converted_lines
.into_pipeline_data_with_metadata(metadata, engine_state.ctrlc.clone())) .into_pipeline_data_with_metadata(metadata, engine_state.ctrlc.clone()))
} else if strict {
Ok(convert_string_to_value_strict(&string_input, span)?
.into_pipeline_data_with_metadata(metadata))
} else { } else {
Ok(convert_string_to_value(string_input, span)? Ok(convert_string_to_value(&string_input, span)?
.into_pipeline_data_with_metadata(metadata)) .into_pipeline_data_with_metadata(metadata))
} }
} }
} }
fn convert_nujson_to_value(value: &nu_json::Value, span: Span) -> Value { fn convert_nujson_to_value(value: nu_json::Value, span: Span) -> Value {
match value { match value {
nu_json::Value::Array(array) => { nu_json::Value::Array(array) => Value::list(
let v: Vec<Value> = array array
.iter() .into_iter()
.map(|x| convert_nujson_to_value(x, span)) .map(|x| convert_nujson_to_value(x, span))
.collect(); .collect(),
span,
Value::list(v, span) ),
} nu_json::Value::Bool(b) => Value::bool(b, span),
nu_json::Value::Bool(b) => Value::bool(*b, span), nu_json::Value::F64(f) => Value::float(f, span),
nu_json::Value::F64(f) => Value::float(*f, span), nu_json::Value::I64(i) => Value::int(i, span),
nu_json::Value::I64(i) => Value::int(*i, span),
nu_json::Value::Null => Value::nothing(span), nu_json::Value::Null => Value::nothing(span),
nu_json::Value::Object(k) => Value::record( nu_json::Value::Object(k) => Value::record(
k.iter() k.into_iter()
.map(|(k, v)| (k.clone(), convert_nujson_to_value(v, span))) .map(|(k, v)| (k, convert_nujson_to_value(v, span)))
.collect(), .collect(),
span, span,
), ),
nu_json::Value::U64(u) => { nu_json::Value::U64(u) => {
if *u > i64::MAX as u64 { if u > i64::MAX as u64 {
Value::error( Value::error(
ShellError::CantConvert { ShellError::CantConvert {
to_type: "i64 sized integer".into(), to_type: "i64 sized integer".into(),
@ -115,22 +133,22 @@ fn convert_nujson_to_value(value: &nu_json::Value, span: Span) -> Value {
span, span,
) )
} else { } else {
Value::int(*u as i64, span) Value::int(u as i64, span)
} }
} }
nu_json::Value::String(s) => Value::string(s.clone(), span), nu_json::Value::String(s) => Value::string(s, span),
} }
} }
// Converts row+column to a Span, assuming bytes (1-based rows) // Converts row+column to a Span, assuming bytes (1-based rows)
fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span { fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span {
let mut cur_row = 1; let mut cur_row = 1;
let mut cur_col = 0; let mut cur_col = 1;
for (offset, curr_byte) in contents.bytes().enumerate() { for (offset, curr_byte) in contents.bytes().enumerate() {
if curr_byte == b'\n' { if curr_byte == b'\n' {
cur_row += 1; cur_row += 1;
cur_col = 0; cur_col = 1;
} }
if cur_row >= row && cur_col >= col { if cur_row >= row && cur_col >= col {
return Span::new(offset, offset); return Span::new(offset, offset);
@ -142,22 +160,21 @@ fn convert_row_column_to_span(row: usize, col: usize, contents: &str) -> Span {
Span::new(contents.len(), contents.len()) Span::new(contents.len(), contents.len())
} }
fn convert_string_to_value(string_input: String, span: Span) -> Result<Value, ShellError> { fn convert_string_to_value(string_input: &str, span: Span) -> Result<Value, ShellError> {
let result: Result<nu_json::Value, nu_json::Error> = nu_json::from_str(&string_input); match nu_json::from_str(string_input) {
match result { Ok(value) => Ok(convert_nujson_to_value(value, span)),
Ok(value) => Ok(convert_nujson_to_value(&value, span)),
Err(x) => match x { Err(x) => match x {
nu_json::Error::Syntax(_, row, col) => { nu_json::Error::Syntax(_, row, col) => {
let label = x.to_string(); let label = x.to_string();
let label_span = convert_row_column_to_span(row, col, &string_input); let label_span = convert_row_column_to_span(row, col, string_input);
Err(ShellError::GenericError { Err(ShellError::GenericError {
error: "Error while parsing JSON text".into(), error: "Error while parsing JSON text".into(),
msg: "error parsing JSON text".into(), msg: "error parsing JSON text".into(),
span: Some(span), span: Some(span),
help: None, help: None,
inner: vec![ShellError::OutsideSpannedLabeledError { inner: vec![ShellError::OutsideSpannedLabeledError {
src: string_input, src: string_input.into(),
error: "Error while parsing JSON text".into(), error: "Error while parsing JSON text".into(),
msg: label, msg: label,
span: label_span, span: label_span,
@ -174,6 +191,35 @@ fn convert_string_to_value(string_input: String, span: Span) -> Result<Value, Sh
} }
} }
fn convert_string_to_value_strict(string_input: &str, span: Span) -> Result<Value, ShellError> {
match serde_json::from_str(string_input) {
Ok(value) => Ok(convert_nujson_to_value(value, span)),
Err(err) => Err(if err.is_syntax() {
let label = err.to_string();
let label_span = convert_row_column_to_span(err.line(), err.column(), string_input);
ShellError::GenericError {
error: "Error while parsing JSON text".into(),
msg: "error parsing JSON text".into(),
span: Some(span),
help: None,
inner: vec![ShellError::OutsideSpannedLabeledError {
src: string_input.into(),
error: "Error while parsing JSON text".into(),
msg: label,
span: label_span,
}],
}
} else {
ShellError::CantConvert {
to_type: format!("structured json data ({err})"),
from_type: "string".into(),
span,
help: None,
}
}),
}
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;

View File

@ -43,6 +43,32 @@ fn from_json_text_to_table() {
}) })
} }
#[test]
fn from_json_text_to_table_strict() {
Playground::setup("filter_from_json_test_1_strict", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"katz.txt",
r#"
{
"katz": [
{"name": "Yehuda", "rusty_luck": 1},
{"name": "JT", "rusty_luck": 1},
{"name": "Andres", "rusty_luck": 1},
{"name":"GorbyPuff", "rusty_luck": 1}
]
}
"#,
)]);
let actual = nu!(
cwd: dirs.test(),
"open katz.txt | from json -s | get katz | get rusty_luck | length "
);
assert_eq!(actual.out, "4");
})
}
#[test] #[test]
fn from_json_text_recognizing_objects_independently_to_table() { fn from_json_text_recognizing_objects_independently_to_table() {
Playground::setup("filter_from_json_test_2", |dirs, sandbox| { Playground::setup("filter_from_json_test_2", |dirs, sandbox| {
@ -70,6 +96,33 @@ fn from_json_text_recognizing_objects_independently_to_table() {
}) })
} }
#[test]
fn from_json_text_recognizing_objects_independently_to_table_strict() {
Playground::setup("filter_from_json_test_2_strict", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"katz.txt",
r#"
{"name": "Yehuda", "rusty_luck": 1}
{"name": "JT", "rusty_luck": 1}
{"name": "Andres", "rusty_luck": 1}
{"name":"GorbyPuff", "rusty_luck": 3}
"#,
)]);
let actual = nu!(
cwd: dirs.test(), pipeline(
r#"
open katz.txt
| from json -o -s
| where name == "GorbyPuff"
| get rusty_luck.0
"#
));
assert_eq!(actual.out, "3");
})
}
#[test] #[test]
fn table_to_json_text() { fn table_to_json_text() {
Playground::setup("filter_to_json_test", |dirs, sandbox| { Playground::setup("filter_to_json_test", |dirs, sandbox| {
@ -99,6 +152,35 @@ fn table_to_json_text() {
}) })
} }
#[test]
fn table_to_json_text_strict() {
Playground::setup("filter_to_json_test_strict", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"sample.txt",
r#"
JonAndrehudaTZ,3
GorbyPuff,100
"#,
)]);
let actual = nu!(
cwd: dirs.test(), pipeline(
r#"
open sample.txt
| lines
| split column "," name luck
| select name
| to json
| from json -s
| get 0
| get name
"#
));
assert_eq!(actual.out, "JonAndrehudaTZ");
})
}
#[test] #[test]
fn top_level_values_from_json() { fn top_level_values_from_json() {
for (value, type_name) in [("null", "nothing"), ("true", "bool"), ("false", "bool")] { for (value, type_name) in [("null", "nothing"), ("true", "bool"), ("false", "bool")] {
@ -109,6 +191,28 @@ fn top_level_values_from_json() {
} }
} }
#[test]
fn top_level_values_from_json_strict() {
for (value, type_name) in [("null", "nothing"), ("true", "bool"), ("false", "bool")] {
let actual = nu!(r#""{}" | from json -s | to json"#, value);
assert_eq!(actual.out, value);
let actual = nu!(r#""{}" | from json -s | describe"#, value);
assert_eq!(actual.out, type_name);
}
}
#[test]
fn strict_parsing_fails_on_comment() {
let actual = nu!(r#"'{ "a": 1, /* comment */ "b": 2 }' | from json -s"#);
assert!(actual.err.contains("error parsing JSON text"));
}
#[test]
fn strict_parsing_fails_on_trailing_comma() {
let actual = nu!(r#"'{ "a": 1, "b": 2, }' | from json -s"#);
assert!(actual.err.contains("error parsing JSON text"));
}
#[test] #[test]
fn ranges_to_json_as_array() { fn ranges_to_json_as_array() {
let value = r#"[ 1, 2, 3]"#; let value = r#"[ 1, 2, 3]"#;

View File

@ -26,15 +26,6 @@ pub struct Deserializer<Iter: Iterator<Item = u8>> {
state: State, state: State,
} }
// macro_rules! try_or_invalid {
// ($self_:expr, $e:expr) => {
// match $e {
// Some(v) => v,
// None => { return Err($self_.error(ErrorCode::InvalidNumber)); }
// }
// }
// }
impl<Iter> Deserializer<Iter> impl<Iter> Deserializer<Iter>
where where
Iter: Iterator<Item = u8>, Iter: Iterator<Item = u8>,