From 24835344eb4fb958229868970eaae30a66e9237c Mon Sep 17 00:00:00 2001 From: nielash Date: Sat, 12 Jul 2025 15:05:11 -0400 Subject: [PATCH] encoder: add ForceNFC and ForceNFD options See https://github.com/rclone/rclone/issues/8088#issuecomment-3001795870 --- docs/content/overview.md | 2 + lib/encoder/encoder.go | 22 ++++++++++ lib/encoder/encoder_test.go | 30 +++++++++++++- lib/encoder/internal/gen/main.go | 71 ++++++++++++++++++++++---------- 4 files changed, 101 insertions(+), 24 deletions(-) diff --git a/docs/content/overview.md b/docs/content/overview.md index 3f0d568d6..5dc128b73 100644 --- a/docs/content/overview.md +++ b/docs/content/overview.md @@ -377,6 +377,8 @@ will show you the defaults for the backends. | Exclamation | `!` | `！` | | Hash | `#` | `＃` | | InvalidUtf8 | An invalid UTF-8 character (e.g. latin1) | `�` | +| ForceNFC | All characters not in NFC form | Their NFC equivalents | +| ForceNFD | All characters not in NFD form | Their NFD equivalents | | LeftCrLfHtVt | CR 0x0D, LF 0x0A, HT 0x09, VT 0x0B on the left of a string | `␍`, `␊`, `␉`, `␋` | | LeftPeriod | `.` on the left of a string | `．` | | LeftSpace | SPACE on the left of a string | `␠` | diff --git a/lib/encoder/encoder.go b/lib/encoder/encoder.go index c8f41d76c..b8517715b 100644 --- a/lib/encoder/encoder.go +++ b/lib/encoder/encoder.go @@ -18,6 +18,8 @@ import ( "strconv" "strings" "unicode/utf8" + + "golang.org/x/text/unicode/norm" ) const ( @@ -61,6 +63,8 @@ const ( EncodeRightPeriod // Trailing . EncodeRightCrLfHtVt // Trailing CR LF HT VT EncodeInvalidUtf8 // Invalid UTF-8 bytes + EncodeInvalidNFC // Force NFC encoding + EncodeInvalidNFD // Force NFD encoding EncodeDot // . and .. 
names EncodeSquareBracket // [] EncodeSemicolon // ; @@ -148,6 +152,8 @@ func init() { alias("RightPeriod", EncodeRightPeriod) alias("RightCrLfHtVt", EncodeRightCrLfHtVt) alias("InvalidUtf8", EncodeInvalidUtf8) + alias("ForceNFC", EncodeInvalidNFC) + alias("ForceNFD", EncodeInvalidNFD) alias("Dot", EncodeDot) } @@ -226,6 +232,13 @@ func (mask MultiEncoder) Encode(in string) string { return "" } + if mask.Has(EncodeInvalidNFD) { + in = norm.NFD.String(in) + } + if mask.Has(EncodeInvalidNFC) { + in = norm.NFC.String(in) + } + if mask.Has(EncodeDot) { switch in { case ".": @@ -688,6 +701,15 @@ func (mask MultiEncoder) Decode(in string) string { return in } + /* // Can't losslessly decode NFC/NFD + if mask.Has(EncodeInvalidNFD) { + in = norm.NFC.String(in) + } + if mask.Has(EncodeInvalidNFC) { + in = norm.NFD.String(in) + } + */ + if mask.Has(EncodeDot) { switch in { case ".": diff --git a/lib/encoder/encoder_test.go b/lib/encoder/encoder_test.go index b36339824..276bf9924 100644 --- a/lib/encoder/encoder_test.go +++ b/lib/encoder/encoder_test.go @@ -34,7 +34,6 @@ func TestEncodeString(t *testing.T) { got := test.mask.String() assert.Equal(t, test.want, got) } - } func TestEncodeSet(t *testing.T) { @@ -60,7 +59,6 @@ func TestEncodeSet(t *testing.T) { assert.Equal(t, test.wantErr, err != nil, err) assert.Equal(t, test.want, got, test.in) } - } type testCase struct { @@ -175,6 +173,34 @@ func TestEncodeInvalidUnicode(t *testing.T) { } } +func TestEncodeNFCNFD(t *testing.T) { + for i, tc := range []testCase{ + { + mask: EncodeInvalidNFC, + in: "Über", + out: "Über", + }, + { + mask: EncodeInvalidNFD, + in: "Über", + out: "Über", + }, + } { + e := tc.mask + t.Run(strconv.FormatInt(int64(i), 10), func(t *testing.T) { + got := e.Encode(tc.in) + if got != tc.out { + t.Errorf("Encode(%q) want %q got %q", tc.in, tc.out, got) + } + // we can't losslessly decode NFC/NFD + /* got2 := e.Decode(got) + if got2 != tc.in { + t.Errorf("Decode(%q) want %q got %q", got, tc.in, got2) + } 
*/ + }) + } +} + func TestEncodeDot(t *testing.T) { for i, tc := range []testCase{ { diff --git a/lib/encoder/internal/gen/main.go b/lib/encoder/internal/gen/main.go index 1812091e9..0bf689e93 100644 --- a/lib/encoder/internal/gen/main.go +++ b/lib/encoder/internal/gen/main.go @@ -67,6 +67,8 @@ var maskBits = []struct { {encoder.EncodeRightPeriod, "EncodeRightPeriod"}, {encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt"}, {encoder.EncodeInvalidUtf8, "EncodeInvalidUtf8"}, + {encoder.EncodeInvalidNFC, "ForceNFC"}, + {encoder.EncodeInvalidNFD, "ForceNFD"}, {encoder.EncodeDot, "EncodeDot"}, } @@ -82,13 +84,15 @@ var allEdges = []edge{ {encoder.EncodeLeftSpace, "EncodeLeftSpace", edgeLeft, []rune{' '}, []rune{'␠'}}, {encoder.EncodeLeftPeriod, "EncodeLeftPeriod", edgeLeft, []rune{'.'}, []rune{'.'}}, {encoder.EncodeLeftTilde, "EncodeLeftTilde", edgeLeft, []rune{'~'}, []rune{'~'}}, - {encoder.EncodeLeftCrLfHtVt, "EncodeLeftCrLfHtVt", edgeLeft, + { + encoder.EncodeLeftCrLfHtVt, "EncodeLeftCrLfHtVt", edgeLeft, []rune{'\t', '\n', '\v', '\r'}, []rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'}, }, {encoder.EncodeRightSpace, "EncodeRightSpace", edgeRight, []rune{' '}, []rune{'␠'}}, {encoder.EncodeRightPeriod, "EncodeRightPeriod", edgeRight, []rune{'.'}, []rune{'.'}}, - {encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt", edgeRight, + { + encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt", edgeRight, []rune{'\t', '\n', '\v', '\r'}, []rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'}, }, @@ -99,102 +103,122 @@ var allMappings = []mapping{{ 0, }, []rune{ '␀', - }}, { + }, +}, { encoder.EncodeSlash, []rune{ '/', }, []rune{ '/', - }}, { + }, +}, { encoder.EncodeLtGt, []rune{ '<', '>', }, []rune{ '<', '>', - }}, { + }, +}, { encoder.EncodeSquareBracket, []rune{ '[', ']', }, []rune{ '[', ']', - }}, { + }, +}, { encoder.EncodeSemicolon, []rune{ ';', }, []rune{ ';', - }}, { + }, +}, { encoder.EncodeExclamation, []rune{ '!', }, []rune{ '!', - }}, { + }, +}, { 
encoder.EncodeDoubleQuote, []rune{ '"', }, []rune{ '"', - }}, { + }, +}, { encoder.EncodeSingleQuote, []rune{ '\'', }, []rune{ ''', - }}, { + }, +}, { encoder.EncodeBackQuote, []rune{ '`', }, []rune{ '`', - }}, { + }, +}, { encoder.EncodeDollar, []rune{ '$', }, []rune{ '$', - }}, { + }, +}, { encoder.EncodeColon, []rune{ ':', }, []rune{ ':', - }}, { + }, +}, { encoder.EncodeQuestion, []rune{ '?', }, []rune{ '?', - }}, { + }, +}, { encoder.EncodeAsterisk, []rune{ '*', }, []rune{ '*', - }}, { + }, +}, { encoder.EncodePipe, []rune{ '|', }, []rune{ '|', - }}, { + }, +}, { encoder.EncodeHash, []rune{ '#', }, []rune{ '#', - }}, { + }, +}, { encoder.EncodePercent, []rune{ '%', }, []rune{ '%', - }}, { + }, +}, { encoder.EncodeSlash, []rune{ '/', }, []rune{ '/', - }}, { + }, +}, { encoder.EncodeBackSlash, []rune{ '\\', }, []rune{ '\', - }}, { + }, +}, { encoder.EncodeCrLf, []rune{ rune(0x0D), rune(0x0A), }, []rune{ '␍', '␊', - }}, { + }, +}, { encoder.EncodeDel, []rune{ 0x7F, }, []rune{ '␡', - }}, { + }, +}, { encoder.EncodeCtl, runeRange(0x01, 0x1F), runeRange('␁', '␟'), @@ -438,6 +462,7 @@ func fatal(err error, s ...any) { fs.Fatal(nil, fmt.Sprint(append(s, err))) } } + func fatalW(_ int, err error) func(...any) { if err != nil { return func(s ...any) { @@ -471,12 +496,14 @@ func getMapping(mask encoder.MultiEncoder) mapping { } return mapping{} } + func collectEncodables(m []mapping) (out []rune) { for _, s := range m { out = append(out, s.src...) } return } + func collectEncoded(m []mapping) (out []rune) { for _, s := range m { out = append(out, s.dst...)