From 17e6952ab891853b954c53ae7d46f7a494e1f991 Mon Sep 17 00:00:00 2001 From: krikera Date: Sat, 31 May 2025 02:58:47 +0530 Subject: [PATCH] Fix UTF-8 BOM file type detection for first-line syntax patterns - Fixes #3314 --- CHANGELOG.md | 1 + src/assets.rs | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ecf1c76b..73d19363 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ ## Bugfixes +- Fix UTF-8 BOM not being stripped for syntax detection, see #3314 (@krikera) - Fix `BAT_THEME_DARK` and `BAT_THEME_LIGHT` being ignored, see issue #3171 and PR #3168 (@bash) - Prevent `--list-themes` from outputting default theme info to stdout when it is piped, see #3189 (@einfachIrgendwer0815) - Rename some submodules to fix Dependabot submodule updates, see issue #3198 and PR #3201 (@victor-gp) diff --git a/src/assets.rs b/src/assets.rs index e6c50219..a5608e3b 100644 --- a/src/assets.rs +++ b/src/assets.rs @@ -298,7 +298,11 @@ impl HighlightingAssets { let syntax_set = self.get_syntax_set()?; Ok(String::from_utf8(reader.first_line.clone()) .ok() - .and_then(|l| syntax_set.find_syntax_by_first_line(&l)) + .and_then(|l| { + // Strip UTF-8 BOM if present + let line = l.strip_prefix('\u{feff}').unwrap_or(&l); + syntax_set.find_syntax_by_first_line(line) + }) .map(|syntax| SyntaxReferenceInSet { syntax, syntax_set })) } } @@ -533,6 +537,41 @@ mod tests { ); } + #[test] + fn syntax_detection_first_line_with_utf8_bom() { + let test = SyntaxDetectionTest::new(); + + // Test that XML files are detected correctly even with UTF-8 BOM + // The BOM should be stripped before first-line syntax detection + let xml_with_bom = "\u{feff}"; + assert_eq!( + test.syntax_for_file_with_content("unknown_file", xml_with_bom), + "XML" + ); + + // Test the specific .csproj case mentioned in the issue + // Even if .csproj has extension mapping, this tests first-line fallback + let csproj_content_with_bom = 
"\u{feff}<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<Project Sdk=\"Microsoft.NET.Sdk\">"; + assert_eq!( + test.syntax_for_file_with_content("test.csproj", csproj_content_with_bom), + "XML" + ); + + // Test that shell scripts are detected correctly even with UTF-8 BOM + let script_with_bom = "\u{feff}#!/bin/bash"; + assert_eq!( + test.syntax_for_file_with_content("unknown_script", script_with_bom), + "Bourne Again Shell (bash)" + ); + + // Test that PHP files are detected correctly even with UTF-8 BOM + let php_with_bom = "\u{feff}