rclone/lib/encoder/encoder.go

/*
Translate file names for usage on restrictive storage systems

The restricted set of characters are mapped to a unicode equivalent version
(most to their FULLWIDTH variant) to increase compatability with other
storage systems.
See: http://unicode-search.net/unicode-namesearch.pl?term=FULLWIDTH

Encoders will also quote reserved characters to differentiate between
the raw and encoded forms.
*/

package encoder

import (
	"bytes"
	"fmt"
	"io"
	"strconv"
	"strings"
	"unicode/utf8"
)

const (
	// adding this to any printable ASCII character turns it into the
	// FULLWIDTH variant
	fullOffset = 0xFEE0
	// the first rune of the SYMBOL FOR block for control characters
	symbolOffset = '␀' // SYMBOL FOR NULL
	// QuoteRune is the rune used for quoting reserved characters
	QuoteRune = '‛' // SINGLE HIGH-REVERSED-9 QUOTATION MARK
)

// NB keep the tests in fstests/fstests/fstests.go FsEncoding up to date with this

// Possible flags for the MultiEncoder
const (
	EncodeZero          uint = 0         // NUL(0x00)
	EncodeSlash         uint = 1 << iota // /
	EncodeWin                            // :?"*<>|
	EncodeBackSlash                      // \
	EncodeCrLf                           // CR(0x0D), LF(0x0A)
	EncodeHashPercent                    // #%
	EncodeDel                            // DEL(0x7F)
	EncodeCtl                            // CTRL(0x01-0x1F)
	EncodeLeftSpace                      // Leading SPACE
	EncodeLeftTilde                      // Leading ~
	EncodeLeftCrLfHtVt                   // Leading CR LF HT VT
	EncodeRightSpace                     // Trailing SPACE
	EncodeRightPeriod                    // Trailing .
	EncodeRightCrLfHtVt                  // Trailing CR LF HT VT
	EncodeInvalidUtf8                    // Invalid UTF-8 bytes
	EncodeDot                            // . and .. names
)

// Encoder can transform names to and from the original and translated version.
type Encoder interface {
	// Encode takes a raw name and substitutes any reserved characters and
	// patterns in it
	Encode(string) string
	// Decode takes a name and undoes any substitutions made by Encode
	Decode(string) string

	// FromStandardPath takes a / separated path in Standard encoding
	// and converts it to a / separated path in this encoding.
	FromStandardPath(string) string
	// FromStandardName takes name in Standard encoding and converts
	// it in this encoding.
	FromStandardName(string) string
	// ToStandardPath takes a / separated path in this encoding
	// and converts it to a / separated path in Standard encoding.
	ToStandardPath(string) string
	// ToStandardName takes name in this encoding and converts
	// it in Standard encoding.
	ToStandardName(string) string
}

// MultiEncoder is a configurable Encoder. The Encode* constants in this
// package can be combined using bitwise or (|) to enable handling of multiple
// character classes
type MultiEncoder uint

// Encode takes a raw name and substitutes any reserved characters and
// patterns in it
func (mask MultiEncoder) Encode(in string) string {
	var (
		encodeWin            = uint(mask)&EncodeWin != 0
		encodeSlash          = uint(mask)&EncodeSlash != 0
		encodeBackSlash      = uint(mask)&EncodeBackSlash != 0
		encodeCrLf           = uint(mask)&EncodeCrLf != 0
		encodeHashPercent    = uint(mask)&EncodeHashPercent != 0
		encodeDel            = uint(mask)&EncodeDel != 0
		encodeCtl            = uint(mask)&EncodeCtl != 0
		encodeLeftSpace      = uint(mask)&EncodeLeftSpace != 0
		encodeLeftTilde      = uint(mask)&EncodeLeftTilde != 0
		encodeLeftCrLfHtVt   = uint(mask)&EncodeLeftCrLfHtVt != 0
		encodeRightSpace     = uint(mask)&EncodeRightSpace != 0
		encodeRightPeriod    = uint(mask)&EncodeRightPeriod != 0
		encodeRightCrLfHtVt  = uint(mask)&EncodeRightCrLfHtVt != 0
		encodeInvalidUnicode = uint(mask)&EncodeInvalidUtf8 != 0
		encodeDot            = uint(mask)&EncodeDot != 0
	)

	if in == "" {
		return ""
	}

	if encodeDot {
		switch in {
		case ".":
			return "．"
		case "..":
			return "．．"
		case "．":
			return string(QuoteRune) + "．"
		case "．．":
			return string(QuoteRune) + "．" + string(QuoteRune) + "．"
		}
	}

	// handle prefix only replacements
	prefix := ""
	if encodeLeftSpace { // Leading SPACE
		if in[0] == ' ' {
			prefix, in = "␠", in[1:] // SYMBOL FOR SPACE
		} else if r, l := utf8.DecodeRuneInString(in); r == '␠' { // SYMBOL FOR SPACE
			prefix, in = string(QuoteRune)+"␠", in[l:] // SYMBOL FOR SPACE
		}
	}
	if encodeLeftTilde && prefix == "" { // Leading ~
		if in[0] == '~' {
			prefix, in = string('~'+fullOffset), in[1:] // FULLWIDTH TILDE
		} else if r, l := utf8.DecodeRuneInString(in); r == '~'+fullOffset {
			prefix, in = string(QuoteRune)+string('~'+fullOffset), in[l:] // FULLWIDTH TILDE
		}
	}
	if encodeLeftCrLfHtVt && prefix == "" { // Leading CR LF HT VT
		switch c := in[0]; c {
		case '\t', '\n', '\v', '\r':
			prefix, in = string('␀'+rune(c)), in[1:] // SYMBOL FOR NULL
		default:
			switch r, l := utf8.DecodeRuneInString(in); r {
			case '␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r':
				prefix, in = string(QuoteRune)+string(r), in[l:]
			}
		}
	}
	// handle suffix only replacements
	suffix := ""
	if in != "" {
		if encodeRightSpace { // Trailing SPACE
			if in[len(in)-1] == ' ' {
				suffix, in = "␠", in[:len(in)-1] // SYMBOL FOR SPACE
			} else if r, l := utf8.DecodeLastRuneInString(in); r == '␠' {
				suffix, in = string(QuoteRune)+"␠", in[:len(in)-l] // SYMBOL FOR SPACE
			}
		}
		if encodeRightPeriod && suffix == "" { // Trailing .
			if in[len(in)-1] == '.' {
				suffix, in = "．", in[:len(in)-1] // FULLWIDTH FULL STOP
			} else if r, l := utf8.DecodeLastRuneInString(in); r == '．' {
				suffix, in = string(QuoteRune)+"．", in[:len(in)-l] // FULLWIDTH FULL STOP
			}
		}
		if encodeRightCrLfHtVt && suffix == "" { // Trailing .
			switch c := in[len(in)-1]; c {
			case '\t', '\n', '\v', '\r':
				suffix, in = string('␀'+rune(c)), in[:len(in)-1] // FULLWIDTH FULL STOP
			default:
				switch r, l := utf8.DecodeLastRuneInString(in); r {
				case '␀' + '\t', '␀' + '\n', '␀' + '\v', '␀' + '\r':
					suffix, in = string(QuoteRune)+string(r), in[:len(in)-l]
				}
			}
		}
	}

	index := 0
	if prefix == "" && suffix == "" {
		// find the first rune which (most likely) needs to be replaced
		index = strings.IndexFunc(in, func(r rune) bool {
			switch r {
			case 0, '␀', QuoteRune, utf8.RuneError:
				return true
			}
			if encodeWin { // :?"*<>|
				switch r {
				case '*', '<', '>', '?', ':', '|', '"',
					'＊', '＜', '＞', '？', '：', '｜', '＂':
					return true
				}
			}
			if encodeSlash { // /
				switch r {
				case '/',
					'／':
					return true
				}
			}
			if encodeBackSlash { // \
				switch r {
				case '\\',
					'＼':
					return true
				}
			}
			if encodeCrLf { // CR LF
				switch r {
				case rune(0x0D), rune(0x0A),
					'␍', '␊':
					return true
				}
			}
			if encodeHashPercent { // #%
				switch r {
				case '#', '%',
					'＃', '％':
					return true
				}
			}
			if encodeDel { // DEL(0x7F)
				switch r {
				case rune(0x7F), '␡':
					return true
				}
			}
			if encodeCtl { // CTRL(0x01-0x1F)
				if r >= 1 && r <= 0x1F {
					return true
				} else if r > symbolOffset && r <= symbolOffset+0x1F {
					return true
				}
			}
			return false
		})
	}
	// nothing to replace, return input
	if index == -1 {
		return in
	}

	var out bytes.Buffer
	out.Grow(len(in) + len(prefix) + len(suffix))
	out.WriteString(prefix)
	// copy the clean part of the input and skip it
	out.WriteString(in[:index])
	in = in[index:]

	for i, r := range in {
		switch r {
		case 0:
			out.WriteRune(symbolOffset)
			continue
		case '␀', QuoteRune:
			out.WriteRune(QuoteRune)
			out.WriteRune(r)
			continue
		case utf8.RuneError:
			if encodeInvalidUnicode {
				// only encode invalid sequences and not utf8.RuneError
				if i+3 > len(in) || in[i:i+3] != string(utf8.RuneError) {
					_, l := utf8.DecodeRuneInString(in[i:])
					appendQuotedBytes(&out, in[i:i+l])
					continue
				}
			} else {
				// append the real bytes instead of utf8.RuneError
				_, l := utf8.DecodeRuneInString(in[i:])
				out.WriteString(in[i : i+l])
				continue
			}
		}
		if encodeWin { // :?"*<>|
			switch r {
			case '*', '<', '>', '?', ':', '|', '"':
				out.WriteRune(r + fullOffset)
				continue
			case '＊', '＜', '＞', '？', '：', '｜', '＂':
				out.WriteRune(QuoteRune)
				out.WriteRune(r)
				continue
			}
		}
		if encodeSlash { // /
			switch r {
			case '/':
				out.WriteRune(r + fullOffset)
				continue
			case '／':
				out.WriteRune(QuoteRune)
				out.WriteRune(r)
				continue
			}
		}
		if encodeBackSlash { // \
			switch r {
			case '\\':
				out.WriteRune(r + fullOffset)
				continue
			case '＼':
				out.WriteRune(QuoteRune)
				out.WriteRune(r)
				continue
			}
		}
		if encodeCrLf { // CR LF
			switch r {
			case rune(0x0D), rune(0x0A):
				out.WriteRune(r + symbolOffset)
				continue
			case '␍', '␊':
				out.WriteRune(QuoteRune)
				out.WriteRune(r)
				continue
			}
		}
		if encodeHashPercent { // #%
			switch r {
			case '#', '%':
				out.WriteRune(r + fullOffset)
				continue
			case '＃', '％':
				out.WriteRune(QuoteRune)
				out.WriteRune(r)
				continue
			}
		}
		if encodeDel { // DEL(0x7F)
			switch r {
			case rune(0x7F):
				out.WriteRune('␡') // SYMBOL FOR DELETE
				continue
			case '␡':
				out.WriteRune(QuoteRune)
				out.WriteRune(r)
				continue
			}
		}
		if encodeCtl { // CTRL(0x01-0x1F)
			if r >= 1 && r <= 0x1F {
				out.WriteRune('␀' + r) // SYMBOL FOR NULL
				continue
			} else if r > symbolOffset && r <= symbolOffset+0x1F {
				out.WriteRune(QuoteRune)
				out.WriteRune(r)
				continue
			}
		}
		out.WriteRune(r)
	}
	out.WriteString(suffix)
	return out.String()
}

// Decode takes a name and undoes any substitutions made by Encode
func (mask MultiEncoder) Decode(in string) string {
	var (
		encodeWin            = uint(mask)&EncodeWin != 0
		encodeSlash          = uint(mask)&EncodeSlash != 0
		encodeBackSlash      = uint(mask)&EncodeBackSlash != 0
		encodeCrLf           = uint(mask)&EncodeCrLf != 0
		encodeHashPercent    = uint(mask)&EncodeHashPercent != 0
		encodeDel            = uint(mask)&EncodeDel != 0
		encodeCtl            = uint(mask)&EncodeCtl != 0
		encodeLeftSpace      = uint(mask)&EncodeLeftSpace != 0
		encodeLeftTilde      = uint(mask)&EncodeLeftTilde != 0
		encodeLeftCrLfHtVt   = uint(mask)&EncodeLeftCrLfHtVt != 0
		encodeRightSpace     = uint(mask)&EncodeRightSpace != 0
		encodeRightPeriod    = uint(mask)&EncodeRightPeriod != 0
		encodeRightCrLfHtVt  = uint(mask)&EncodeRightCrLfHtVt != 0
		encodeInvalidUnicode = uint(mask)&EncodeInvalidUtf8 != 0
		encodeDot            = uint(mask)&EncodeDot != 0
	)

	if encodeDot {
		switch in {
		case "．":
			return "."
		case "．．":
			return ".."
		case string(QuoteRune) + "．":
			return "．"
		case string(QuoteRune) + "．" + string(QuoteRune) + "．":
			return "．．"
		}
	}

	// handle prefix only replacements
	prefix := ""
	if r, l1 := utf8.DecodeRuneInString(in); encodeLeftSpace && r == '␠' { // SYMBOL FOR SPACE
		prefix, in = " ", in[l1:]
	} else if encodeLeftTilde && r == '～' { // FULLWIDTH TILDE
		prefix, in = "~", in[l1:]
	} else if encodeLeftCrLfHtVt && (r == '␀'+'\t' || r == '␀'+'\n' || r == '␀'+'\v' || r == '␀'+'\r') {
		prefix, in = string(r-'␀'), in[l1:]
	} else if r == QuoteRune {
		if r, l2 := utf8.DecodeRuneInString(in[l1:]); encodeLeftSpace && r == '␠' { // SYMBOL FOR SPACE
			prefix, in = "␠", in[l1+l2:]
		} else if encodeLeftTilde && r == '～' { // FULLWIDTH TILDE
			prefix, in = "～", in[l1+l2:]
		} else if encodeLeftCrLfHtVt && (r == '␀'+'\t' || r == '␀'+'\n' || r == '␀'+'\v' || r == '␀'+'\r') {
			prefix, in = string(r), in[l1+l2:]
		}
	}

	// handle suffix only replacements
	suffix := ""
	if r, l := utf8.DecodeLastRuneInString(in); encodeRightSpace && r == '␠' { // SYMBOL FOR SPACE
		in = in[:len(in)-l]
		if q, l2 := utf8.DecodeLastRuneInString(in); q == QuoteRune {
			suffix, in = "␠", in[:len(in)-l2]
		} else {
			suffix = " "
		}
	} else if encodeRightPeriod && r == '．' { // FULLWIDTH FULL STOP
		in = in[:len(in)-l]
		if q, l2 := utf8.DecodeLastRuneInString(in); q == QuoteRune {
			suffix, in = "．", in[:len(in)-l2]
		} else {
			suffix = "."
		}
	} else if encodeRightCrLfHtVt && (r == '␀'+'\t' || r == '␀'+'\n' || r == '␀'+'\v' || r == '␀'+'\r') {
		in = in[:len(in)-l]
		if q, l2 := utf8.DecodeLastRuneInString(in); q == QuoteRune {
			suffix, in = string(r), in[:len(in)-l2]
		} else {
			suffix = string(r - '␀')
		}
	}
	index := 0
	if prefix == "" && suffix == "" {
		// find the first rune which (most likely) needs to be replaced
		index = strings.IndexFunc(in, func(r rune) bool {
			switch r {
			case '␀', QuoteRune:
				return true
			}
			if encodeWin { // :?"*<>|
				switch r {
				case '＊', '＜', '＞', '？', '：', '｜', '＂':
					return true
				}
			}
			if encodeSlash { // /
				switch r {
				case '／':
					return true
				}
			}
			if encodeBackSlash { // \
				switch r {
				case '＼':
					return true
				}
			}
			if encodeCrLf { // CR LF
				switch r {
				case '␍', '␊':
					return true
				}
			}
			if encodeHashPercent { // #%
				switch r {
				case '＃', '％':
					return true
				}
			}
			if encodeDel { // DEL(0x7F)
				switch r {
				case '␡':
					return true
				}
			}
			if encodeCtl { // CTRL(0x01-0x1F)
				if r > symbolOffset && r <= symbolOffset+0x1F {
					return true
				}
			}

			return false
		})
	}
	// nothing to replace, return input
	if index == -1 {
		return in
	}

	var out bytes.Buffer
	out.Grow(len(in))
	out.WriteString(prefix)
	// copy the clean part of the input and skip it
	out.WriteString(in[:index])
	in = in[index:]
	var unquote, unquoteNext, skipNext bool

	for i, r := range in {
		if skipNext {
			skipNext = false
			continue
		}
		unquote, unquoteNext = unquoteNext, false
		switch r {
		case '␀': // SYMBOL FOR NULL
			if unquote {
				out.WriteRune(r)
			} else {
				out.WriteRune(0)
			}
			continue
		case QuoteRune:
			if unquote {
				out.WriteRune(r)
			} else {
				unquoteNext = true
			}
			continue
		}
		if encodeWin { // :?"*<>|
			switch r {
			case '＊', '＜', '＞', '？', '：', '｜', '＂':
				if unquote {
					out.WriteRune(r)
				} else {
					out.WriteRune(r - fullOffset)
				}
				continue
			}
		}
		if encodeSlash { // /
			switch r {
			case '／': // FULLWIDTH SOLIDUS
				if unquote {
					out.WriteRune(r)
				} else {
					out.WriteRune(r - fullOffset)
				}
				continue
			}
		}
		if encodeBackSlash { // \
			switch r {
			case '＼': // FULLWIDTH REVERSE SOLIDUS
				if unquote {
					out.WriteRune(r)
				} else {
					out.WriteRune(r - fullOffset)
				}
				continue
			}
		}
		if encodeCrLf { // CR LF
			switch r {
			case '␍', '␊':
				if unquote {
					out.WriteRune(r)
				} else {
					out.WriteRune(r - symbolOffset)
				}
				continue
			}
		}
		if encodeHashPercent { // #%
			switch r {
			case '＃', '％':
				if unquote {
					out.WriteRune(r)
				} else {
					out.WriteRune(r - fullOffset)
				}
				continue
			}
		}
		if encodeDel { // DEL(0x7F)
			switch r {
			case '␡': // SYMBOL FOR DELETE
				if unquote {
					out.WriteRune(r)
				} else {
					out.WriteRune(0x7F)
				}
				continue
			}
		}
		if encodeCtl { // CTRL(0x01-0x1F)
			if r > symbolOffset && r <= symbolOffset+0x1F {
				if unquote {
					out.WriteRune(r)
				} else {
					out.WriteRune(r - symbolOffset)
				}
				continue
			}
		}
		if unquote {
			if encodeInvalidUnicode {
				skipNext = appendUnquotedByte(&out, in[i:])
				if skipNext {
					continue
				}
			}
			out.WriteRune(QuoteRune)
		}
		switch r {
		case utf8.RuneError:
			// append the real bytes instead of utf8.RuneError
			_, l := utf8.DecodeRuneInString(in[i:])
			out.WriteString(in[i : i+l])
			continue
		}

		out.WriteRune(r)
	}
	if unquoteNext {
		out.WriteRune(QuoteRune)
	}
	out.WriteString(suffix)
	return out.String()
}

// FromStandardPath takes a / separated path in Standard encoding
// and converts it to a / separated path in this encoding.
func (mask MultiEncoder) FromStandardPath(s string) string {
	return FromStandardPath(mask, s)
}

// FromStandardName takes name in Standard encoding and converts
// it in this encoding.
func (mask MultiEncoder) FromStandardName(s string) string {
	return FromStandardName(mask, s)
}

// ToStandardPath takes a / separated path in this encoding
// and converts it to a / separated path in Standard encoding.
func (mask MultiEncoder) ToStandardPath(s string) string {
	return ToStandardPath(mask, s)
}

// ToStandardName takes name in this encoding and converts
// it in Standard encoding.
func (mask MultiEncoder) ToStandardName(s string) string {
	return ToStandardName(mask, s)
}

func appendQuotedBytes(w io.Writer, s string) {
	for _, b := range []byte(s) {
		_, _ = fmt.Fprintf(w, string(QuoteRune)+"%02X", b)
	}
}
func appendUnquotedByte(w io.Writer, s string) bool {
	if len(s) < 2 {
		return false
	}
	u, err := strconv.ParseUint(s[:2], 16, 8)
	if err != nil {
		return false
	}
	n, _ := w.Write([]byte{byte(u)})
	return n == 1
}

type identity struct{}

func (identity) Encode(in string) string { return in }
func (identity) Decode(in string) string { return in }

func (i identity) FromStandardPath(s string) string {
	return FromStandardPath(i, s)
}
func (i identity) FromStandardName(s string) string {
	return FromStandardName(i, s)
}
func (i identity) ToStandardPath(s string) string {
	return ToStandardPath(i, s)
}
func (i identity) ToStandardName(s string) string {
	return ToStandardName(i, s)
}

// Identity returns a Encoder that always returns the input value
func Identity() Encoder {
	return identity{}
}

// FromStandardPath takes a / separated path in Standard encoding
// and converts it to a / separated path in the given encoding.
func FromStandardPath(e Encoder, s string) string {
	if e == Standard {
		return s
	}
	parts := strings.Split(s, "/")
	encoded := make([]string, len(parts))
	changed := false
	for i, p := range parts {
		enc := FromStandardName(e, p)
		changed = changed || enc != p
		encoded[i] = enc
	}
	if !changed {
		return s
	}
	return strings.Join(encoded, "/")
}

// FromStandardName takes name in Standard encoding and converts
// it in the given encoding.
func FromStandardName(e Encoder, s string) string {
	if e == Standard {
		return s
	}
	return e.Encode(Standard.Decode(s))
}

// ToStandardPath takes a / separated path in the given encoding
// and converts it to a / separated path in Standard encoding.
func ToStandardPath(e Encoder, s string) string {
	if e == Standard {
		return s
	}
	parts := strings.Split(s, "/")
	encoded := make([]string, len(parts))
	changed := false
	for i, p := range parts {
		dec := ToStandardName(e, p)
		changed = changed || dec != p
		encoded[i] = dec
	}
	if !changed {
		return s
	}
	return strings.Join(encoded, "/")
}

// ToStandardName takes name in the given encoding and converts
// it in Standard encoding.
func ToStandardName(e Encoder, s string) string {
	if e == Standard {
		return s
	}
	return Standard.Encode(e.Decode(s))
}