From 66751534605863a9e4588c470e98198ac1bfdc11 Mon Sep 17 00:00:00 2001 From: Keith Hall Date: Thu, 7 Aug 2025 23:31:15 +0300 Subject: [PATCH 1/4] Fix the read_line method for utf16le input to determine the end of the line, instead of reading until \n (0x0A) and then reading until 0x00 and calling it done, read until we find 0x00 preceded by 0x0A. --- CHANGELOG.md | 1 + src/input.rs | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d3481f3..a1c2b775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - Add missing mappings for various bash/zsh files, see PR #3262 (@AdamGaskins) - Send all bat errors to stderr by default, see #3336 (@JerryImMouse) - Make --map-syntax target case insensitive to match --language, see #3206 (@keith-hall) +- Correctly determine the end of the line in UTF16LE input #3369 (@keith-hall) ## Other diff --git a/src/input.rs b/src/input.rs index b36204df..e5f7e4d6 100644 --- a/src/input.rs +++ b/src/input.rs @@ -267,7 +267,7 @@ impl<'a> InputReader<'a> { }; if content_type == Some(ContentType::UTF_16LE) { - reader.read_until(0x00, &mut first_line).ok(); + read_utf16le_line(&mut reader, &mut first_line).ok(); } InputReader { @@ -286,13 +286,31 @@ impl<'a> InputReader<'a> { let res = self.inner.read_until(b'\n', buf).map(|size| size > 0)?; if self.content_type == Some(ContentType::UTF_16LE) { - let _ = self.inner.read_until(0x00, buf); + return read_utf16le_line(&mut self.inner, buf); } Ok(res) } } +fn read_utf16le_line(reader: &mut R, buf: &mut Vec) -> io::Result { + loop { + let mut temp = Vec::new(); + let n = reader.read_until(0x00, &mut temp)?; + if n == 0 { + // EOF reached + break; + } + buf.extend_from_slice(&temp); + if buf.len() >= 2 && buf[buf.len() - 2] == 0x0A && buf[buf.len() - 1] == 0x00 { + // end of line found + break; + } + // end of line not found, keep going + } + return Ok(!buf.is_empty()); +} + #[test] fn basic() { let content = b"#!/bin/bash\necho hello"; @@ -350,3 +368,28 @@ fn utf16le() { assert!(!res.unwrap()); assert!(buffer.is_empty()); } + +#[test] +fn utf16le_issue3367() { + let content = b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52"; + let mut reader = InputReader::new(&content[..]); + + assert_eq!( + b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52", + &reader.first_line[..] + ); + + let mut buffer = vec![]; + + let res = reader.read_line(&mut buffer); + assert!(res.is_ok()); + assert!(res.unwrap()); + assert_eq!(b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52", &buffer[..]); + + buffer.clear(); + + let res = reader.read_line(&mut buffer); + assert!(res.is_ok()); + assert!(!res.unwrap()); + assert!(buffer.is_empty()); +} From 40c4c8e5420e86d9baac9cbd9438d7584ad9da86 Mon Sep 17 00:00:00 2001 From: Keith Hall Date: Sat, 16 Aug 2025 15:33:30 +0300 Subject: [PATCH 2/4] More thorough tests for UTF16LE --- src/input.rs | 22 ++++++++++++++++--- tests/examples/test_UTF-16BE.txt | Bin 0 -> 56 bytes tests/examples/test_UTF-16LE-complicated.txt | Bin 0 -> 50 bytes tests/integration_tests.rs | 20 +++++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 tests/examples/test_UTF-16BE.txt create mode 100644 tests/examples/test_UTF-16LE-complicated.txt diff --git a/src/input.rs b/src/input.rs index e5f7e4d6..69b10906 100644 --- a/src/input.rs +++ b/src/input.rs @@ -371,11 +371,13 @@ fn utf16le() { #[test] fn utf16le_issue3367() { - let content = b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52"; + let content = b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00\ + \x6F\x00\x20\x00\x62\x00\x61\x00\x72\x00\x0A\x00\ + \x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00"; let mut reader = InputReader::new(&content[..]); assert_eq!( - b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52", + b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00", &reader.first_line[..] ); @@ -384,10 +386,24 @@ fn utf16le_issue3367() { let res = reader.read_line(&mut buffer); assert!(res.is_ok()); assert!(res.unwrap()); - assert_eq!(b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52", &buffer[..]); + assert_eq!(b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00", &buffer[..]); buffer.clear(); + let res = reader.read_line(&mut buffer); + assert!(res.is_ok()); + assert!(res.unwrap()); + assert_eq!(b"\x6F\x00\x20\x00\x62\x00\x61\x00\x72\x00\x0A\x00", &buffer[..]); + + buffer.clear(); + + let res = reader.read_line(&mut buffer); + assert!(res.is_ok()); + assert!(res.unwrap()); + assert_eq!(b"\x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00", &buffer[..]); + + buffer.clear(); + let res = reader.read_line(&mut buffer); assert!(res.is_ok()); assert!(!res.unwrap()); diff --git a/tests/examples/test_UTF-16BE.txt b/tests/examples/test_UTF-16BE.txt new file mode 100644 index 0000000000000000000000000000000000000000..77a2bf2c42cc13f081dac3623529c7dcb74746c0 GIT binary patch literal 56 xcmezOpCN-Gl_3WR^BEKv%7J7Nke$N7#ZUs2$z&)7ibBXlAgctZwirlq0RVhs3xfav literal 0 HcmV?d00001 diff --git a/tests/examples/test_UTF-16LE-complicated.txt b/tests/examples/test_UTF-16LE-complicated.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2eba9c94e337d573a698625e8ee56abc6d150f9 GIT binary patch literal 50 xcmezWkIRq2kISDSh>IbOAs+}87?Kzg8HyOV7%~`A8FGLSBvuY2i-7DD1^^~r3RnOD literal 0 HcmV?d00001 diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 0f570f89..35a85623 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -1321,6 +1321,26 @@ fn utf16() { .assert() .success() .stdout("hello world\n"); + + bat() + .arg("--plain") + .arg("--decorations=always") + .arg("test_UTF-16BE.txt") + .assert() + .success() + .stdout("hello world\nthis is a test\n"); +} + +#[test] +fn utf16le() { + bat() + .arg("--decorations=always") + .arg("--style=numbers") + .arg("--color=never") + .arg("test_UTF-16LE-complicated.txt") + .assert() + .success() + .stdout(" 1 上一伊刀\n 2 foo bar\n 3 hello world\n"); } // Regression test for https://github.com/sharkdp/bat/issues/1922 From 96ce80d0e27eee0ca8d7fe25f0526465d38a2a54 Mon Sep 17 00:00:00 2001 From: Keith Hall Date: Sat, 16 Aug 2025 15:33:53 +0300 Subject: [PATCH 3/4] Apply same fix and tests for UTF16BE --- CHANGELOG.md | 2 +- src/input.rs | 20 +++++++++++-------- tests/examples/test_UTF-16BE-complicated.txt | Bin 0 -> 50 bytes tests/integration_tests.rs | 12 +++++++++++ 4 files changed, 25 insertions(+), 9 deletions(-) create mode 100644 tests/examples/test_UTF-16BE-complicated.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index a1c2b775..69338d36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ - Add missing mappings for various bash/zsh files, see PR #3262 (@AdamGaskins) - Send all bat errors to stderr by default, see #3336 (@JerryImMouse) - Make --map-syntax target case insensitive to match --language, see #3206 (@keith-hall) -- Correctly determine the end of the line in UTF16LE input #3369 (@keith-hall) +- Correctly determine the end of the line in UTF16LE/BE input #3369 (@keith-hall) ## Other diff --git a/src/input.rs b/src/input.rs index 69b10906..3abfdd82 100644 --- a/src/input.rs +++ b/src/input.rs @@ -267,7 +267,9 @@ impl<'a> InputReader<'a> { }; if content_type == Some(ContentType::UTF_16LE) { - read_utf16le_line(&mut reader, &mut first_line).ok(); + read_utf16_line(&mut reader, &mut first_line, 0x00, 0x0A).ok(); + } else if content_type == Some(ContentType::UTF_16BE) { + read_utf16_line(&mut reader, &mut first_line, 0x0A, 0x00).ok(); } InputReader { @@ -283,26 +285,28 @@ impl<'a> InputReader<'a> { return Ok(true); } - let res = self.inner.read_until(b'\n', buf).map(|size| size > 0)?; - if self.content_type == Some(ContentType::UTF_16LE) { - return read_utf16le_line(&mut self.inner, buf); + return read_utf16_line(&mut self.inner, buf, 0x00, 0x0A); + } + if self.content_type == Some(ContentType::UTF_16BE) { + return read_utf16_line(&mut self.inner, buf, 0x0A, 0x00); } + let res = self.inner.read_until(b'\n', buf).map(|size| size > 0)?; Ok(res) } } -fn read_utf16le_line(reader: &mut R, buf: &mut Vec) -> io::Result { +fn read_utf16_line(reader: &mut R, buf: &mut Vec, read_until_char: u8, preceded_by_char: u8) -> io::Result { loop { let mut temp = Vec::new(); - let n = reader.read_until(0x00, &mut temp)?; + let n = reader.read_until(read_until_char, &mut temp)?; if n == 0 { // EOF reached break; } buf.extend_from_slice(&temp); - if buf.len() >= 2 && buf[buf.len() - 2] == 0x0A && buf[buf.len() - 1] == 0x00 { + if buf.len() >= 2 && buf[buf.len() - 2] == preceded_by_char && buf[buf.len() - 1] == read_until_char { // end of line found break; } @@ -403,7 +407,7 @@ fn utf16le_issue3367() { assert_eq!(b"\x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00", &buffer[..]); buffer.clear(); - + let res = reader.read_line(&mut buffer); assert!(res.is_ok()); assert!(!res.unwrap()); diff --git a/tests/examples/test_UTF-16BE-complicated.txt b/tests/examples/test_UTF-16BE-complicated.txt new file mode 100644 index 0000000000000000000000000000000000000000..f57ddeaacc3dfcdbd72b525b127c01c5068bb3fd GIT binary patch literal 50 wcmezO-;c|W!JjLLfq{!5jUgWh6&R8j5*dns{0xRvh8!RSiIoG%A|N{j03dw|SO5S3 literal 0 HcmV?d00001 diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 35a85623..7579794d 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -1343,6 +1343,18 @@ fn utf16le() { .stdout(" 1 上一伊刀\n 2 foo bar\n 3 hello world\n"); } +#[test] +fn utf16be() { + bat() + .arg("--decorations=always") + .arg("--style=numbers") + .arg("--color=never") + .arg("test_UTF-16BE-complicated.txt") + .assert() + .success() + .stdout(" 1 上一伊刀\n 2 foo bar\n 3 hello world\n"); +} + // Regression test for https://github.com/sharkdp/bat/issues/1922 #[test] fn bom_not_stripped_in_loop_through_mode() { From bdaf25793d258f3c5609412a834912e7ec91ecb5 Mon Sep 17 00:00:00 2001 From: Keith Hall Date: Sat, 16 Aug 2025 15:35:13 +0300 Subject: [PATCH 4/4] cargo fmt --- src/input.rs | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/input.rs b/src/input.rs index 3abfdd82..f9cfac2a 100644 --- a/src/input.rs +++ b/src/input.rs @@ -297,7 +297,12 @@ impl<'a> InputReader<'a> { } } -fn read_utf16_line(reader: &mut R, buf: &mut Vec, read_until_char: u8, preceded_by_char: u8) -> io::Result { +fn read_utf16_line( + reader: &mut R, + buf: &mut Vec, + read_until_char: u8, + preceded_by_char: u8, +) -> io::Result { loop { let mut temp = Vec::new(); let n = reader.read_until(read_until_char, &mut temp)?; @@ -306,7 +311,10 @@ fn read_utf16_line(reader: &mut R, buf: &mut Vec, read_until_cha break; } buf.extend_from_slice(&temp); - if buf.len() >= 2 && buf[buf.len() - 2] == preceded_by_char && buf[buf.len() - 1] == read_until_char { + if buf.len() >= 2 + && buf[buf.len() - 2] == preceded_by_char + && buf[buf.len() - 1] == read_until_char + { // end of line found break; } @@ -390,21 +398,30 @@ fn utf16le_issue3367() { let res = reader.read_line(&mut buffer); assert!(res.is_ok()); assert!(res.unwrap()); - assert_eq!(b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00", &buffer[..]); + assert_eq!( + b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00", + &buffer[..] + ); buffer.clear(); let res = reader.read_line(&mut buffer); assert!(res.is_ok()); assert!(res.unwrap()); - assert_eq!(b"\x6F\x00\x20\x00\x62\x00\x61\x00\x72\x00\x0A\x00", &buffer[..]); + assert_eq!( + b"\x6F\x00\x20\x00\x62\x00\x61\x00\x72\x00\x0A\x00", + &buffer[..] + ); buffer.clear(); let res = reader.read_line(&mut buffer); assert!(res.is_ok()); assert!(res.unwrap()); - assert_eq!(b"\x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00", &buffer[..]); + assert_eq!( + b"\x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00", + &buffer[..] + ); buffer.clear();