encoder: add ForceNFC and ForceNFD options

See https://github.com/rclone/rclone/issues/8088#issuecomment-3001795870
This commit is contained in:
nielash
2025-07-12 15:05:11 -04:00
parent bfdd5e2c22
commit 24835344eb
4 changed files with 101 additions and 24 deletions

View File

@ -377,6 +377,8 @@ will show you the defaults for the backends.
| Exclamation | `!` | `` | | Exclamation | `!` | `` |
| Hash | `#` | `` | | Hash | `#` | `` |
| InvalidUtf8 | An invalid UTF-8 character (e.g. latin1) | `<60>` | | InvalidUtf8 | An invalid UTF-8 character (e.g. latin1) | `<60>` |
| ForceNFC | Any character sequence that is not already in Unicode NFC form | The canonically equivalent NFC sequence (one-way: not restored on decode) |
| ForceNFD | Any character sequence that is not already in Unicode NFD form | The canonically equivalent NFD sequence (one-way: not restored on decode) |
| LeftCrLfHtVt | CR 0x0D, LF 0x0A, HT 0x09, VT 0x0B on the left of a string | `␍`, `␊`, `␉`, `␋` | | LeftCrLfHtVt | CR 0x0D, LF 0x0A, HT 0x09, VT 0x0B on the left of a string | `␍`, `␊`, `␉`, `␋` |
| LeftPeriod | `.` on the left of a string | `.` | | LeftPeriod | `.` on the left of a string | `.` |
| LeftSpace | SPACE on the left of a string | `␠` | | LeftSpace | SPACE on the left of a string | `␠` |

View File

@ -18,6 +18,8 @@ import (
"strconv" "strconv"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
"golang.org/x/text/unicode/norm"
) )
const ( const (
@ -61,6 +63,8 @@ const (
EncodeRightPeriod // Trailing . EncodeRightPeriod // Trailing .
EncodeRightCrLfHtVt // Trailing CR LF HT VT EncodeRightCrLfHtVt // Trailing CR LF HT VT
EncodeInvalidUtf8 // Invalid UTF-8 bytes EncodeInvalidUtf8 // Invalid UTF-8 bytes
EncodeInvalidNFC // Force NFC encoding
EncodeInvalidNFD // Force NFD encoding
EncodeDot // . and .. names EncodeDot // . and .. names
EncodeSquareBracket // [] EncodeSquareBracket // []
EncodeSemicolon // ; EncodeSemicolon // ;
@ -148,6 +152,8 @@ func init() {
alias("RightPeriod", EncodeRightPeriod) alias("RightPeriod", EncodeRightPeriod)
alias("RightCrLfHtVt", EncodeRightCrLfHtVt) alias("RightCrLfHtVt", EncodeRightCrLfHtVt)
alias("InvalidUtf8", EncodeInvalidUtf8) alias("InvalidUtf8", EncodeInvalidUtf8)
alias("ForceNFC", EncodeInvalidNFC)
alias("ForceNFD", EncodeInvalidNFD)
alias("Dot", EncodeDot) alias("Dot", EncodeDot)
} }
@ -226,6 +232,13 @@ func (mask MultiEncoder) Encode(in string) string {
return "" return ""
} }
if mask.Has(EncodeInvalidNFD) {
in = norm.NFD.String(in)
}
if mask.Has(EncodeInvalidNFC) {
in = norm.NFC.String(in)
}
if mask.Has(EncodeDot) { if mask.Has(EncodeDot) {
switch in { switch in {
case ".": case ".":
@ -688,6 +701,15 @@ func (mask MultiEncoder) Decode(in string) string {
return in return in
} }
/* // Can't losslessly decode NFC/NFD
if mask.Has(EncodeInvalidNFD) {
in = norm.NFC.String(in)
}
if mask.Has(EncodeInvalidNFC) {
in = norm.NFD.String(in)
}
*/
if mask.Has(EncodeDot) { if mask.Has(EncodeDot) {
switch in { switch in {
case "": case "":

View File

@ -34,7 +34,6 @@ func TestEncodeString(t *testing.T) {
got := test.mask.String() got := test.mask.String()
assert.Equal(t, test.want, got) assert.Equal(t, test.want, got)
} }
} }
func TestEncodeSet(t *testing.T) { func TestEncodeSet(t *testing.T) {
@ -60,7 +59,6 @@ func TestEncodeSet(t *testing.T) {
assert.Equal(t, test.wantErr, err != nil, err) assert.Equal(t, test.wantErr, err != nil, err)
assert.Equal(t, test.want, got, test.in) assert.Equal(t, test.want, got, test.in)
} }
} }
type testCase struct { type testCase struct {
@ -175,6 +173,34 @@ func TestEncodeInvalidUnicode(t *testing.T) {
} }
} }
// TestEncodeNFCNFD checks that the ForceNFC and ForceNFD encoder options
// rewrite a name into the requested Unicode normalization form.
//
// The composed and decomposed spellings of "Über" render identically, so they
// are written with explicit escapes. Raw literals could be silently normalized
// by an editor, a copy/paste or a web view, which would leave the test
// comparing a string with itself.
func TestEncodeNFCNFD(t *testing.T) {
	const (
		composed   = "\u00dcber"  // NFC: U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
		decomposed = "U\u0308ber" // NFD: 'U' followed by U+0308 COMBINING DIAERESIS
	)
	for i, tc := range []testCase{
		{
			mask: EncodeInvalidNFC,
			in:   decomposed,
			out:  composed,
		},
		{
			mask: EncodeInvalidNFD,
			in:   composed,
			out:  decomposed,
		},
	} {
		e := tc.mask
		t.Run(strconv.FormatInt(int64(i), 10), func(t *testing.T) {
			got := e.Encode(tc.in)
			if got != tc.out {
				// %+q escapes non-ASCII runes so the two normalization
				// forms can be told apart in the failure output.
				t.Errorf("Encode(%+q) want %+q got %+q", tc.in, tc.out, got)
			}
			// No Decode round-trip check: normalization discards the
			// original form, so NFC/NFD cannot be decoded losslessly.
		})
	}
}
func TestEncodeDot(t *testing.T) { func TestEncodeDot(t *testing.T) {
for i, tc := range []testCase{ for i, tc := range []testCase{
{ {

View File

@ -67,6 +67,8 @@ var maskBits = []struct {
{encoder.EncodeRightPeriod, "EncodeRightPeriod"}, {encoder.EncodeRightPeriod, "EncodeRightPeriod"},
{encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt"}, {encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt"},
{encoder.EncodeInvalidUtf8, "EncodeInvalidUtf8"}, {encoder.EncodeInvalidUtf8, "EncodeInvalidUtf8"},
{encoder.EncodeInvalidNFC, "ForceNFC"},
{encoder.EncodeInvalidNFD, "ForceNFD"},
{encoder.EncodeDot, "EncodeDot"}, {encoder.EncodeDot, "EncodeDot"},
} }
@ -82,13 +84,15 @@ var allEdges = []edge{
{encoder.EncodeLeftSpace, "EncodeLeftSpace", edgeLeft, []rune{' '}, []rune{'␠'}}, {encoder.EncodeLeftSpace, "EncodeLeftSpace", edgeLeft, []rune{' '}, []rune{'␠'}},
{encoder.EncodeLeftPeriod, "EncodeLeftPeriod", edgeLeft, []rune{'.'}, []rune{''}}, {encoder.EncodeLeftPeriod, "EncodeLeftPeriod", edgeLeft, []rune{'.'}, []rune{''}},
{encoder.EncodeLeftTilde, "EncodeLeftTilde", edgeLeft, []rune{'~'}, []rune{''}}, {encoder.EncodeLeftTilde, "EncodeLeftTilde", edgeLeft, []rune{'~'}, []rune{''}},
{encoder.EncodeLeftCrLfHtVt, "EncodeLeftCrLfHtVt", edgeLeft, {
encoder.EncodeLeftCrLfHtVt, "EncodeLeftCrLfHtVt", edgeLeft,
[]rune{'\t', '\n', '\v', '\r'}, []rune{'\t', '\n', '\v', '\r'},
[]rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'}, []rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'},
}, },
{encoder.EncodeRightSpace, "EncodeRightSpace", edgeRight, []rune{' '}, []rune{'␠'}}, {encoder.EncodeRightSpace, "EncodeRightSpace", edgeRight, []rune{' '}, []rune{'␠'}},
{encoder.EncodeRightPeriod, "EncodeRightPeriod", edgeRight, []rune{'.'}, []rune{''}}, {encoder.EncodeRightPeriod, "EncodeRightPeriod", edgeRight, []rune{'.'}, []rune{''}},
{encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt", edgeRight, {
encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt", edgeRight,
[]rune{'\t', '\n', '\v', '\r'}, []rune{'\t', '\n', '\v', '\r'},
[]rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'}, []rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'},
}, },
@ -99,102 +103,122 @@ var allMappings = []mapping{{
0, 0,
}, []rune{ }, []rune{
'␀', '␀',
}}, { },
}, {
encoder.EncodeSlash, []rune{ encoder.EncodeSlash, []rune{
'/', '/',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeLtGt, []rune{ encoder.EncodeLtGt, []rune{
'<', '>', '<', '>',
}, []rune{ }, []rune{
'', '', '', '',
}}, { },
}, {
encoder.EncodeSquareBracket, []rune{ encoder.EncodeSquareBracket, []rune{
'[', ']', '[', ']',
}, []rune{ }, []rune{
'', '', '', '',
}}, { },
}, {
encoder.EncodeSemicolon, []rune{ encoder.EncodeSemicolon, []rune{
';', ';',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeExclamation, []rune{ encoder.EncodeExclamation, []rune{
'!', '!',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeDoubleQuote, []rune{ encoder.EncodeDoubleQuote, []rune{
'"', '"',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeSingleQuote, []rune{ encoder.EncodeSingleQuote, []rune{
'\'', '\'',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeBackQuote, []rune{ encoder.EncodeBackQuote, []rune{
'`', '`',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeDollar, []rune{ encoder.EncodeDollar, []rune{
'$', '$',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeColon, []rune{ encoder.EncodeColon, []rune{
':', ':',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeQuestion, []rune{ encoder.EncodeQuestion, []rune{
'?', '?',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeAsterisk, []rune{ encoder.EncodeAsterisk, []rune{
'*', '*',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodePipe, []rune{ encoder.EncodePipe, []rune{
'|', '|',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeHash, []rune{ encoder.EncodeHash, []rune{
'#', '#',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodePercent, []rune{ encoder.EncodePercent, []rune{
'%', '%',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeSlash, []rune{ encoder.EncodeSlash, []rune{
'/', '/',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeBackSlash, []rune{ encoder.EncodeBackSlash, []rune{
'\\', '\\',
}, []rune{ }, []rune{
'', '',
}}, { },
}, {
encoder.EncodeCrLf, []rune{ encoder.EncodeCrLf, []rune{
rune(0x0D), rune(0x0A), rune(0x0D), rune(0x0A),
}, []rune{ }, []rune{
'␍', '␊', '␍', '␊',
}}, { },
}, {
encoder.EncodeDel, []rune{ encoder.EncodeDel, []rune{
0x7F, 0x7F,
}, []rune{ }, []rune{
'␡', '␡',
}}, { },
}, {
encoder.EncodeCtl, encoder.EncodeCtl,
runeRange(0x01, 0x1F), runeRange(0x01, 0x1F),
runeRange('␁', '␟'), runeRange('␁', '␟'),
@ -438,6 +462,7 @@ func fatal(err error, s ...any) {
fs.Fatal(nil, fmt.Sprint(append(s, err))) fs.Fatal(nil, fmt.Sprint(append(s, err)))
} }
} }
func fatalW(_ int, err error) func(...any) { func fatalW(_ int, err error) func(...any) {
if err != nil { if err != nil {
return func(s ...any) { return func(s ...any) {
@ -471,12 +496,14 @@ func getMapping(mask encoder.MultiEncoder) mapping {
} }
return mapping{} return mapping{}
} }
func collectEncodables(m []mapping) (out []rune) { func collectEncodables(m []mapping) (out []rune) {
for _, s := range m { for _, s := range m {
out = append(out, s.src...) out = append(out, s.src...)
} }
return return
} }
func collectEncoded(m []mapping) (out []rune) { func collectEncoded(m []mapping) (out []rune) {
for _, s := range m { for _, s := range m {
out = append(out, s.dst...) out = append(out, s.dst...)