forked from extern/Unexpected-Keyboard
compose: Fix misbehavior due to encoding errors
The compose data compiler produced encoding errors for two reasons: - The 'UTF-16' codec adds a BOM; use 'UTF-16-LE' instead. - 'str.encode' returns a byte array; use 'array' to get a 16-bit char array.
This commit is contained in:
parent
1197ce36b4
commit
a886f6eede
@ -1,4 +1,5 @@
|
|||||||
import textwrap, sys, re, string, json, os
|
import textwrap, sys, re, string, json, os
|
||||||
|
from array import array
|
||||||
|
|
||||||
# Parse symbol names from keysymdef.h. Many compose sequences in
|
# Parse symbol names from keysymdef.h. Many compose sequences in
|
||||||
# en_US_UTF_8_Compose.pre reference these. For example, all the sequences on
|
# en_US_UTF_8_Compose.pre reference these. For example, all the sequences on
|
||||||
@ -41,16 +42,21 @@ def parse_sequences_file_xkb(fname):
|
|||||||
return def_, result
|
return def_, result
|
||||||
char_names = { **xkb_char_extra_names }
|
char_names = { **xkb_char_extra_names }
|
||||||
# Interpret character names of the form "U0000" or using [char_names].
|
# Interpret character names of the form "U0000" or using [char_names].
|
||||||
def parse_seq_char(c):
|
def parse_seq_char(sc):
|
||||||
uchar, named_char = c
|
uchar, named_char = sc
|
||||||
if uchar != "":
|
if uchar != "":
|
||||||
return chr(int(uchar, 16))
|
c = chr(int(uchar, 16))
|
||||||
# else is a named char
|
elif len(named_char) == 1:
|
||||||
if len(named_char) == 1:
|
c = named_char
|
||||||
return named_char
|
else:
|
||||||
if not named_char in char_names:
|
if not named_char in char_names:
|
||||||
raise Exception("Unknown char: " + named_char)
|
raise Exception("Unknown char: " + named_char)
|
||||||
return char_names[named_char]
|
c = char_names[named_char]
|
||||||
|
# The state machine can't represent sequence characters that do not fit
|
||||||
|
# in a 16-bit char.
|
||||||
|
if len(c) > 1 or ord(c[0]) > 65535:
|
||||||
|
raise Exception("Char out of range: " + r)
|
||||||
|
return c
|
||||||
# Interpret the left hand side of a sequence.
|
# Interpret the left hand side of a sequence.
|
||||||
def parse_seq_chars(def_):
|
def parse_seq_chars(def_):
|
||||||
return list(map(parse_seq_char, re.findall(char_re, def_)))
|
return list(map(parse_seq_char, re.findall(char_re, def_)))
|
||||||
@ -138,9 +144,9 @@ def make_automata(tree_root):
|
|||||||
# There are two encodings for leaves: character final state for 15-bit
|
# There are two encodings for leaves: character final state for 15-bit
|
||||||
# characters and string final state for the rest.
|
# characters and string final state for the rest.
|
||||||
if len(c) > 1 or ord(c[0]) > 32767: # String final state
|
if len(c) > 1 or ord(c[0]) > 32767: # String final state
|
||||||
cb = c.encode("UTF-16")
|
javachars = array('H', c.encode("UTF-16-LE"))
|
||||||
states.append((-1, len(cb) + 1))
|
states.append((-1, len(javachars) + 1))
|
||||||
for c in cb:
|
for c in javachars:
|
||||||
states.append((c, 0))
|
states.append((c, 0))
|
||||||
else: # Character final state
|
else: # Character final state
|
||||||
states.append((c, 1))
|
states.append((c, 1))
|
||||||
@ -152,6 +158,14 @@ def make_automata(tree_root):
|
|||||||
add_tree(tree_root)
|
add_tree(tree_root)
|
||||||
return states
|
return states
|
||||||
|
|
||||||
|
# Debug
|
||||||
|
def print_automata(automata):
|
||||||
|
i = 0
|
||||||
|
for (s, e) in automata:
|
||||||
|
s = "%#06x" % s if isinstance(s, int) else '"%s"' % str(s)
|
||||||
|
print("%3d %8s %d" % (i, s, e), file=sys.stderr)
|
||||||
|
i += 1
|
||||||
|
|
||||||
def batched(ar, n):
|
def batched(ar, n):
|
||||||
i = 0
|
i = 0
|
||||||
while i + n < len(ar):
|
while i + n < len(ar):
|
||||||
@ -213,3 +227,4 @@ for fname in sys.argv[1:]:
|
|||||||
automata = make_automata(trie)
|
automata = make_automata(trie)
|
||||||
gen_java(automata)
|
gen_java(automata)
|
||||||
print("Compiled %d sequences into %d states. Dropped %d sequences." % (total_sequences, len(automata), dropped_sequences), file=sys.stderr)
|
print("Compiled %d sequences into %d states. Dropped %d sequences." % (total_sequences, len(automata), dropped_sequences), file=sys.stderr)
|
||||||
|
# print_automata(automata)
|
||||||
|
@ -36,17 +36,17 @@ public final class ComposeKey
|
|||||||
if (next < 0)
|
if (next < 0)
|
||||||
return null;
|
return null;
|
||||||
next = edges[next];
|
next = edges[next];
|
||||||
char next_header = states[next];
|
int next_header = states[next];
|
||||||
if (next_header == 0) // Enter a new intermediate state.
|
if (next_header == 0) // Enter a new intermediate state.
|
||||||
return KeyValue.makeComposePending(String.valueOf(c), next, 0);
|
return KeyValue.makeComposePending(String.valueOf(c), next, 0);
|
||||||
else if (next_header > 0) // Character final state.
|
else if (next_header == 0xFFFF) // String final state
|
||||||
return KeyValue.makeCharKey(next_header);
|
|
||||||
else // next_header is < 0, string final state.
|
|
||||||
{
|
{
|
||||||
int next_length = edges[next];
|
int next_length = edges[next];
|
||||||
return KeyValue.makeStringKey(
|
return KeyValue.makeStringKey(
|
||||||
new String(states, next + 1, next + next_length));
|
new String(states, next + 1, next_length - 1));
|
||||||
}
|
}
|
||||||
|
else // Character final state.
|
||||||
|
return KeyValue.makeCharKey((char)next_header);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** The state machine is comprised of two arrays.
|
/** The state machine is comprised of two arrays.
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user