compose: Fix misbehavior caused by encoding errors

Encoding errors occurred in the compose data compiler because:
- 'UTF-16' adds a BOM; use 'UTF-16-LE' instead.
- 'str.encode' returns a byte array; use 'array' to get an array of
  16-bit chars.
This commit is contained in:
Jules Aguillon 2024-06-09 10:35:38 +02:00
parent 1197ce36b4
commit a886f6eede
3 changed files with 32 additions and 17 deletions

View File

@ -1,4 +1,5 @@
import textwrap, sys, re, string, json, os import textwrap, sys, re, string, json, os
from array import array
# Parse symbol names from keysymdef.h. Many compose sequences in # Parse symbol names from keysymdef.h. Many compose sequences in
# en_US_UTF_8_Compose.pre reference these. For example, all the sequences on # en_US_UTF_8_Compose.pre reference these. For example, all the sequences on
@ -41,16 +42,21 @@ def parse_sequences_file_xkb(fname):
return def_, result return def_, result
char_names = { **xkb_char_extra_names } char_names = { **xkb_char_extra_names }
# Interpret character names of the form "U0000" or using [char_names]. # Interpret character names of the form "U0000" or using [char_names].
def parse_seq_char(sc):
    """Return the single character denoted by one parsed sequence element.

    sc is a (uchar, named_char) pair. When uchar is not empty it is read as
    a hexadecimal code point. Otherwise named_char is either a literal
    one-character string or a name looked up in [char_names].

    Raises Exception when the name is unknown or when the result is not
    exactly one character that fits in a 16-bit char.
    """
    uchar, named_char = sc
    if uchar != "":
        c = chr(int(uchar, 16))
    elif len(named_char) == 1:
        c = named_char
    else:
        if named_char not in char_names:
            raise Exception("Unknown char: " + named_char)
        c = char_names[named_char]
    # The state machine can't represent sequence characters that do not fit
    # in a 16-bit char. Checking the length first also keeps an empty
    # replacement from reaching ord().
    if len(c) != 1 or ord(c) > 0xFFFF:
        raise Exception("Char out of range: " + c)
    return c
# Interpret the left hand side of a sequence. # Interpret the left hand side of a sequence.
def parse_seq_chars(def_): def parse_seq_chars(def_):
return list(map(parse_seq_char, re.findall(char_re, def_))) return list(map(parse_seq_char, re.findall(char_re, def_)))
@ -138,9 +144,9 @@ def make_automata(tree_root):
# There are two encodings for leaves: character final state for 15-bit # There are two encodings for leaves: character final state for 15-bit
# characters and string final state for the rest. # characters and string final state for the rest.
if len(c) > 1 or ord(c[0]) > 32767: # String final state if len(c) > 1 or ord(c[0]) > 32767: # String final state
cb = c.encode("UTF-16") javachars = array('H', c.encode("UTF-16-LE"))
states.append((-1, len(cb) + 1)) states.append((-1, len(javachars) + 1))
for c in cb: for c in javachars:
states.append((c, 0)) states.append((c, 0))
else: # Character final state else: # Character final state
states.append((c, 1)) states.append((c, 1))
@ -152,6 +158,14 @@ def make_automata(tree_root):
add_tree(tree_root) add_tree(tree_root)
return states return states
# Debug
def print_automata(automata):
    """Dump the automata to stderr for debugging, one entry per line.

    automata is iterated as a sequence of two-element pairs. Each line shows
    the index of the pair, then its first element (zero-padded hexadecimal
    when it is an int, double-quoted text otherwise), then its second
    element formatted as a decimal integer.
    """
    for index, (state, edge) in enumerate(automata):
        if isinstance(state, int):
            shown = "%#06x" % state
        else:
            shown = '"%s"' % str(state)
        print("%3d %8s %d" % (index, shown, edge), file=sys.stderr)
def batched(ar, n): def batched(ar, n):
i = 0 i = 0
while i + n < len(ar): while i + n < len(ar):
@ -213,3 +227,4 @@ for fname in sys.argv[1:]:
automata = make_automata(trie) automata = make_automata(trie)
gen_java(automata) gen_java(automata)
print("Compiled %d sequences into %d states. Dropped %d sequences." % (total_sequences, len(automata), dropped_sequences), file=sys.stderr) print("Compiled %d sequences into %d states. Dropped %d sequences." % (total_sequences, len(automata), dropped_sequences), file=sys.stderr)
# print_automata(automata)

View File

@ -36,17 +36,17 @@ public final class ComposeKey
if (next < 0) if (next < 0)
return null; return null;
next = edges[next]; next = edges[next];
char next_header = states[next]; int next_header = states[next];
if (next_header == 0) // Enter a new intermediate state. if (next_header == 0) // Enter a new intermediate state.
return KeyValue.makeComposePending(String.valueOf(c), next, 0); return KeyValue.makeComposePending(String.valueOf(c), next, 0);
else if (next_header > 0) // Character final state. else if (next_header == 0xFFFF) // String final state
return KeyValue.makeCharKey(next_header);
else // next_header is < 0, string final state.
{ {
int next_length = edges[next]; int next_length = edges[next];
return KeyValue.makeStringKey( return KeyValue.makeStringKey(
new String(states, next + 1, next + next_length)); new String(states, next + 1, next_length - 1));
} }
else // Character final state.
return KeyValue.makeCharKey((char)next_header);
} }
/** The state machine is comprised of two arrays. /** The state machine is comprised of two arrays.