encoder: add ForceNFC and ForceNFD options

See https://github.com/rclone/rclone/issues/8088#issuecomment-3001795870
This commit is contained in:
nielash
2025-07-12 15:05:11 -04:00
parent bfdd5e2c22
commit 24835344eb
4 changed files with 101 additions and 24 deletions

View File

@@ -377,6 +377,8 @@ will show you the defaults for the backends.
| Exclamation | `!` | `` |
| Hash | `#` | `` |
| InvalidUtf8 | An invalid UTF-8 character (e.g. latin1) | `<60>` |
| ForceNFC | All characters not in Unicode NFC form | Their NFC-normalized equivalents |
| ForceNFD | All characters not in Unicode NFD form | Their NFD-normalized equivalents |
| LeftCrLfHtVt | CR 0x0D, LF 0x0A, HT 0x09, VT 0x0B on the left of a string | `␍`, `␊`, `␉`, `␋` |
| LeftPeriod | `.` on the left of a string | `.` |
| LeftSpace | SPACE on the left of a string | `␠` |

View File

@@ -18,6 +18,8 @@ import (
"strconv"
"strings"
"unicode/utf8"
"golang.org/x/text/unicode/norm"
)
const (
@@ -61,6 +63,8 @@ const (
EncodeRightPeriod // Trailing .
EncodeRightCrLfHtVt // Trailing CR LF HT VT
EncodeInvalidUtf8 // Invalid UTF-8 bytes
EncodeInvalidNFC // Force NFC encoding
EncodeInvalidNFD // Force NFD encoding
EncodeDot // . and .. names
EncodeSquareBracket // []
EncodeSemicolon // ;
@@ -148,6 +152,8 @@ func init() {
alias("RightPeriod", EncodeRightPeriod)
alias("RightCrLfHtVt", EncodeRightCrLfHtVt)
alias("InvalidUtf8", EncodeInvalidUtf8)
alias("ForceNFC", EncodeInvalidNFC)
alias("ForceNFD", EncodeInvalidNFD)
alias("Dot", EncodeDot)
}
@@ -226,6 +232,13 @@ func (mask MultiEncoder) Encode(in string) string {
return ""
}
if mask.Has(EncodeInvalidNFD) {
in = norm.NFD.String(in)
}
if mask.Has(EncodeInvalidNFC) {
in = norm.NFC.String(in)
}
if mask.Has(EncodeDot) {
switch in {
case ".":
@@ -688,6 +701,15 @@ func (mask MultiEncoder) Decode(in string) string {
return in
}
/* // Can't losslessly decode NFC/NFD
if mask.Has(EncodeInvalidNFD) {
in = norm.NFC.String(in)
}
if mask.Has(EncodeInvalidNFC) {
in = norm.NFD.String(in)
}
*/
if mask.Has(EncodeDot) {
switch in {
case "":

View File

@@ -34,7 +34,6 @@ func TestEncodeString(t *testing.T) {
got := test.mask.String()
assert.Equal(t, test.want, got)
}
}
func TestEncodeSet(t *testing.T) {
@@ -60,7 +59,6 @@ func TestEncodeSet(t *testing.T) {
assert.Equal(t, test.wantErr, err != nil, err)
assert.Equal(t, test.want, got, test.in)
}
}
type testCase struct {
@@ -175,6 +173,34 @@ func TestEncodeInvalidUnicode(t *testing.T) {
}
}
// TestEncodeNFCNFD checks that Encode normalizes its input to NFC when the
// EncodeInvalidNFC bit is set and to NFD when the EncodeInvalidNFD bit is set.
//
// The fixtures are written with explicit escapes rather than raw literals:
// the composed and decomposed spellings of "Über" render identically, so a
// raw literal can be silently re-normalized by an editor or other tooling,
// which would turn the test into a no-op.
func TestEncodeNFCNFD(t *testing.T) {
	const (
		composed   = "\u00dcber"  // NFC: U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS
		decomposed = "U\u0308ber" // NFD: 'U' followed by U+0308 COMBINING DIAERESIS
	)
	for i, tc := range []testCase{
		{
			mask: EncodeInvalidNFC,
			in:   decomposed,
			out:  composed,
		},
		{
			mask: EncodeInvalidNFD,
			in:   composed,
			out:  decomposed,
		},
	} {
		e := tc.mask
		t.Run(strconv.FormatInt(int64(i), 10), func(t *testing.T) {
			got := e.Encode(tc.in)
			if got != tc.out {
				// %+q escapes non-ASCII runes so the two normalization
				// forms are distinguishable in the failure output.
				t.Errorf("Encode(%+q) want %+q got %+q", tc.in, tc.out, got)
			}
			// Decode is deliberately not checked: normalization is lossy,
			// so the original form cannot be recovered from the encoded name.
		})
	}
}
func TestEncodeDot(t *testing.T) {
for i, tc := range []testCase{
{

View File

@@ -67,6 +67,8 @@ var maskBits = []struct {
{encoder.EncodeRightPeriod, "EncodeRightPeriod"},
{encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt"},
{encoder.EncodeInvalidUtf8, "EncodeInvalidUtf8"},
{encoder.EncodeInvalidNFC, "ForceNFC"},
{encoder.EncodeInvalidNFD, "ForceNFD"},
{encoder.EncodeDot, "EncodeDot"},
}
@@ -82,13 +84,15 @@ var allEdges = []edge{
{encoder.EncodeLeftSpace, "EncodeLeftSpace", edgeLeft, []rune{' '}, []rune{'␠'}},
{encoder.EncodeLeftPeriod, "EncodeLeftPeriod", edgeLeft, []rune{'.'}, []rune{''}},
{encoder.EncodeLeftTilde, "EncodeLeftTilde", edgeLeft, []rune{'~'}, []rune{''}},
{encoder.EncodeLeftCrLfHtVt, "EncodeLeftCrLfHtVt", edgeLeft,
{
encoder.EncodeLeftCrLfHtVt, "EncodeLeftCrLfHtVt", edgeLeft,
[]rune{'\t', '\n', '\v', '\r'},
[]rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'},
},
{encoder.EncodeRightSpace, "EncodeRightSpace", edgeRight, []rune{' '}, []rune{'␠'}},
{encoder.EncodeRightPeriod, "EncodeRightPeriod", edgeRight, []rune{'.'}, []rune{''}},
{encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt", edgeRight,
{
encoder.EncodeRightCrLfHtVt, "EncodeRightCrLfHtVt", edgeRight,
[]rune{'\t', '\n', '\v', '\r'},
[]rune{'␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r'},
},
@@ -99,102 +103,122 @@ var allMappings = []mapping{{
0,
}, []rune{
'␀',
}}, {
},
}, {
encoder.EncodeSlash, []rune{
'/',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeLtGt, []rune{
'<', '>',
}, []rune{
'', '',
}}, {
},
}, {
encoder.EncodeSquareBracket, []rune{
'[', ']',
}, []rune{
'', '',
}}, {
},
}, {
encoder.EncodeSemicolon, []rune{
';',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeExclamation, []rune{
'!',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeDoubleQuote, []rune{
'"',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeSingleQuote, []rune{
'\'',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeBackQuote, []rune{
'`',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeDollar, []rune{
'$',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeColon, []rune{
':',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeQuestion, []rune{
'?',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeAsterisk, []rune{
'*',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodePipe, []rune{
'|',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeHash, []rune{
'#',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodePercent, []rune{
'%',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeSlash, []rune{
'/',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeBackSlash, []rune{
'\\',
}, []rune{
'',
}}, {
},
}, {
encoder.EncodeCrLf, []rune{
rune(0x0D), rune(0x0A),
}, []rune{
'␍', '␊',
}}, {
},
}, {
encoder.EncodeDel, []rune{
0x7F,
}, []rune{
'␡',
}}, {
},
}, {
encoder.EncodeCtl,
runeRange(0x01, 0x1F),
runeRange('␁', '␟'),
@@ -438,6 +462,7 @@ func fatal(err error, s ...any) {
fs.Fatal(nil, fmt.Sprint(append(s, err)))
}
}
func fatalW(_ int, err error) func(...any) {
if err != nil {
return func(s ...any) {
@@ -471,12 +496,14 @@ func getMapping(mask encoder.MultiEncoder) mapping {
}
return mapping{}
}
// collectEncodables returns the src runes of every mapping in m, concatenated
// in the order the mappings appear. The result is nil when there is nothing
// to collect.
func collectEncodables(m []mapping) (out []rune) {
	for i := range m {
		out = append(out, m[i].src...)
	}
	return out
}
func collectEncoded(m []mapping) (out []rune) {
for _, s := range m {
out = append(out, s.dst...)