diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7d3481f3..69338d36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@
 - Add missing mappings for various bash/zsh files, see PR #3262 (@AdamGaskins)
 - Send all bat errors to stderr by default, see #3336 (@JerryImMouse)
 - Make --map-syntax target case insensitive to match --language, see #3206 (@keith-hall)
+- Correctly determine the end of the line in UTF16LE/BE input, see #3369 (@keith-hall)
 
 ## Other
 
diff --git a/src/input.rs b/src/input.rs
index b36204df..f9cfac2a 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -267,7 +267,9 @@ impl<'a> InputReader<'a> {
         };
 
         if content_type == Some(ContentType::UTF_16LE) {
-            reader.read_until(0x00, &mut first_line).ok();
+            read_utf16_line(&mut reader, &mut first_line, 0x00, 0x0A).ok();
+        } else if content_type == Some(ContentType::UTF_16BE) {
+            read_utf16_line(&mut reader, &mut first_line, 0x0A, 0x00).ok();
         }
 
         InputReader {
@@ -283,16 +285,49 @@ impl<'a> InputReader<'a> {
             return Ok(true);
         }
 
-        let res = self.inner.read_until(b'\n', buf).map(|size| size > 0)?;
-
         if self.content_type == Some(ContentType::UTF_16LE) {
-            let _ = self.inner.read_until(0x00, buf);
+            return read_utf16_line(&mut self.inner, buf, 0x00, 0x0A);
+        }
+        if self.content_type == Some(ContentType::UTF_16BE) {
+            return read_utf16_line(&mut self.inner, buf, 0x0A, 0x00);
         }
 
+        let res = self.inner.read_until(b'\n', buf).map(|size| size > 0)?;
         Ok(res)
     }
 }
 
+/// Reads one UTF-16 line from `reader`, appending its raw bytes to `buf`.
+///
+/// A line ends with the two-byte line feed `preceded_by_char`, `read_until_char`:
+/// `0x0A 0x00` for UTF-16LE and `0x00 0x0A` for UTF-16BE. The pair only counts when it
+/// ends on a code unit boundary (even length of `buf`), so the last byte of one
+/// character followed by the first byte of the next is not mistaken for a line feed.
+/// `buf` is therefore expected to start on a code unit boundary.
+///
+/// Returns `Ok(true)` if `buf` holds any data afterwards and `Ok(false)` otherwise.
+fn read_utf16_line<R: BufRead>(
+    reader: &mut R,
+    buf: &mut Vec<u8>,
+    read_until_char: u8,
+    preceded_by_char: u8,
+) -> io::Result<bool> {
+    let line_feed = [preceded_by_char, read_until_char];
+    loop {
+        // `read_until` appends to `buf`, so no temporary buffer is needed
+        if reader.read_until(read_until_char, buf)? == 0 {
+            // EOF reached
+            break;
+        }
+        if buf.len() % 2 == 0 && buf.ends_with(&line_feed) {
+            // end of line found
+            break;
+        }
+        // end of line not found, keep going
+    }
+    Ok(!buf.is_empty())
+}
+
 #[test]
 fn basic() {
     let content = b"#!/bin/bash\necho hello";
@@ -350,3 +385,83 @@ fn utf16le() {
     assert!(!res.unwrap());
     assert!(buffer.is_empty());
 }
+
+#[test]
+fn utf16le_issue3367() {
+    let content = b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00\
+                    \x6F\x00\x20\x00\x62\x00\x61\x00\x72\x00\x0A\x00\
+                    \x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00";
+    let mut reader = InputReader::new(&content[..]);
+
+    assert_eq!(
+        b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00",
+        &reader.first_line[..]
+    );
+
+    let mut buffer = vec![];
+
+    let res = reader.read_line(&mut buffer);
+    assert!(res.is_ok());
+    assert!(res.unwrap());
+    assert_eq!(
+        b"\xFF\xFE\x0A\x4E\x00\x4E\x0A\x4F\x00\x52\x0A\x00",
+        &buffer[..]
+    );
+
+    buffer.clear();
+
+    let res = reader.read_line(&mut buffer);
+    assert!(res.is_ok());
+    assert!(res.unwrap());
+    assert_eq!(
+        b"\x6F\x00\x20\x00\x62\x00\x61\x00\x72\x00\x0A\x00",
+        &buffer[..]
+    );
+
+    buffer.clear();
+
+    let res = reader.read_line(&mut buffer);
+    assert!(res.is_ok());
+    assert!(res.unwrap());
+    assert_eq!(
+        b"\x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00",
+        &buffer[..]
+    );
+
+    buffer.clear();
+
+    let res = reader.read_line(&mut buffer);
+    assert!(res.is_ok());
+    assert!(!res.unwrap());
+    assert!(buffer.is_empty());
+}
+
+#[test]
+fn utf16le_misaligned_line_feed_bytes() {
+    // U+0A05 followed by U+4E00 puts the bytes 0x0A 0x00 across two code units
+    let content = b"\xFF\xFE\x05\x0A\x00\x4E\x0A\x00\x61\x00";
+    let mut reader = InputReader::new(&content[..]);
+
+    assert_eq!(b"\xFF\xFE\x05\x0A\x00\x4E\x0A\x00", &reader.first_line[..]);
+
+    let mut buffer = vec![];
+
+    let res = reader.read_line(&mut buffer);
+    assert!(res.is_ok());
+    assert!(res.unwrap());
+    assert_eq!(b"\xFF\xFE\x05\x0A\x00\x4E\x0A\x00", &buffer[..]);
+
+    buffer.clear();
+
+    let res = reader.read_line(&mut buffer);
+    assert!(res.is_ok());
+    assert!(res.unwrap());
+    assert_eq!(b"\x61\x00", &buffer[..]);
+
+    buffer.clear();
+
+    let res = reader.read_line(&mut buffer);
+    assert!(res.is_ok());
+    assert!(!res.unwrap());
+    assert!(buffer.is_empty());
+}
diff --git a/tests/examples/test_UTF-16BE-complicated.txt b/tests/examples/test_UTF-16BE-complicated.txt
new file mode 100644
index 00000000..f57ddeaa
Binary files /dev/null and b/tests/examples/test_UTF-16BE-complicated.txt differ
diff --git a/tests/examples/test_UTF-16BE.txt b/tests/examples/test_UTF-16BE.txt
new file mode 100644
index 00000000..77a2bf2c
Binary files /dev/null and b/tests/examples/test_UTF-16BE.txt differ
diff --git a/tests/examples/test_UTF-16LE-complicated.txt b/tests/examples/test_UTF-16LE-complicated.txt
new file mode 100644
index 00000000..b2eba9c9
Binary files /dev/null and b/tests/examples/test_UTF-16LE-complicated.txt differ
diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs
index 0f570f89..7579794d 100644
--- a/tests/integration_tests.rs
+++ b/tests/integration_tests.rs
@@ -1321,6 +1321,38 @@ fn utf16() {
         .assert()
         .success()
         .stdout("hello world\n");
+
+    bat()
+        .arg("--plain")
+        .arg("--decorations=always")
+        .arg("test_UTF-16BE.txt")
+        .assert()
+        .success()
+        .stdout("hello world\nthis is a test\n");
+}
+
+#[test]
+fn utf16le() {
+    bat()
+        .arg("--decorations=always")
+        .arg("--style=numbers")
+        .arg("--color=never")
+        .arg("test_UTF-16LE-complicated.txt")
+        .assert()
+        .success()
+        .stdout(" 1 上一伊刀\n 2 foo bar\n 3 hello world\n");
+}
+
+#[test]
+fn utf16be() {
+    bat()
+        .arg("--decorations=always")
+        .arg("--style=numbers")
+        .arg("--color=never")
+        .arg("test_UTF-16BE-complicated.txt")
+        .assert()
+        .success()
+        .stdout(" 1 上一伊刀\n 2 foo bar\n 3 hello world\n");
 }
 
 // Regression test for https://github.com/sharkdp/bat/issues/1922