Add advance_next32() function to 'unicode' module to encapsulate the escape handling logic #1

This commit is contained in:
Thomas Jensen 2021-02-07 20:38:38 +01:00
parent 1c4914bb01
commit 4c656727ec
No known key found for this signature in database
GPG Key ID: A4ACEE270D0FB7DB
6 changed files with 115 additions and 74 deletions

View File

@ -343,7 +343,7 @@ void btrim(char *text, size_t *len)
void btrim32(uint32_t *text, size_t *len) void btrim32(uint32_t *text, size_t *len)
/* /*
* Remove trailing whitespace from line (unicode version). * Remove trailing whitespace from line (unicode and escape sequence enabled version).
* *
* text string to trim * text string to trim
* len pointer to the length of the string in characters * len pointer to the length of the string in characters
@ -353,18 +353,34 @@ void btrim32(uint32_t *text, size_t *len)
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/ */
{ {
int idx = (int) (*len - 1); if (text == NULL || len == 0) {
return;
}
for (; idx >= 0; --idx) { const uint32_t *rest = text;
ucs4_t c = text[idx]; int last_char_pos = -1;
if (uc_is_c_whitespace(c) || uc_is_property_white_space(c) || uc_is_property_bidi_whitespace(c)) { size_t step_invis;
set_char_at(text, idx, char_nul);
} else { for (ucs4_t c = text[0]; c != char_nul; c = rest[0]) {
break; if (c != char_esc) {
if (!uc_is_c_whitespace(c) && !uc_is_property_white_space(c) && !uc_is_property_bidi_whitespace(c)) {
last_char_pos = (int) (rest - text);
}
}
rest = advance_next32(rest, &step_invis);
}
/* If the last character is followed by an escape sequence, keep it (but only one). */
if (last_char_pos >= 0) {
rest = text + last_char_pos + 1;
if (rest[0] == char_esc) {
advance_next32(rest, &step_invis);
last_char_pos += step_invis;
} }
} }
*len = idx + 1; set_char_at(text, (size_t) (last_char_pos + 1), char_nul);
*len = (size_t) (last_char_pos + 1);
} }
@ -546,7 +562,6 @@ void print_input_lines(const char *heading)
static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii, size_t **posmap) static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii, size_t **posmap)
{ {
size_t invis = 0; /* counts invisible column positions */ size_t invis = 0; /* counts invisible column positions */
int ansipos = 0; /* progression of ansi sequence */
*num_esc = 0; /* counts the number of escape sequences found */ *num_esc = 0; /* counts the number of escape sequences found */
if (is_empty(s)) { if (is_empty(s)) {
@ -562,42 +577,25 @@ static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **a
(*ascii) = (char *) calloc(buflen, sizeof(char)); /* maybe a little too much, but certainly enough */ (*ascii) = (char *) calloc(buflen, sizeof(char)); /* maybe a little too much, but certainly enough */
char *p = *ascii; char *p = *ascii;
ucs4_t c;
size_t mb_idx = 0; size_t mb_idx = 0;
size_t step_invis;
const uint32_t *rest = s; const uint32_t *rest = s;
while ((rest = u32_next(&c, rest))) {
for (ucs4_t c = s[0]; c != char_nul; c = rest[0]) {
if (map_idx >= map_size - 4) { if (map_idx >= map_size - 4) {
map_size = map_size * 2 + 1; map_size = map_size * 2 + 1;
map = (size_t *) realloc(map, map_size * sizeof(size_t)); map = (size_t *) realloc(map, map_size * sizeof(size_t));
} }
if (ansipos == 0 && c == char_esc) { if (c == char_esc) {
/* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */
ansipos++;
invis++;
(*num_esc)++; (*num_esc)++;
} else if (ansipos == 1 && c == '[') {
/* Found '[' char after ESC. A CSI sequence has started. */
ansipos++;
invis++;
} else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
/* Found a byte designating the end of a two-byte escape sequence */
invis++;
ansipos = 0;
} else if (ansipos == 2) {
/* Inside CSI sequence - Keep counting bytes as invisible */
invis++;
/* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */
if (c >= 0x40 && c <= 0x7e) {
ansipos = 0;
} }
else if (is_ascii_printable(c)) {
} else if (is_ascii_printable(c)) {
*p = c & 0xff; *p = c & 0xff;
map[map_idx++] = mb_idx; map[map_idx++] = mb_idx;
++p; ++p;
} else { }
else {
int cols = uc_width(c, encoding); int cols = uc_width(c, encoding);
if (cols > 0) { if (cols > 0) {
memset(p, (int) 'x', cols); memset(p, (int) 'x', cols);
@ -607,8 +605,13 @@ static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **a
p += cols; p += cols;
} }
} }
++mb_idx;
rest = advance_next32(rest, &step_invis);
mb_idx += BMAX((size_t) 1, step_invis);
invis += step_invis;
} }
*p = '\0'; *p = '\0';
(*posmap) = map; (*posmap) = map;
return invis; return invis;

View File

@ -100,6 +100,48 @@ uint32_t *new_empty_string32()
/*
 * Step over one logical unit at the start of s and report how much of it is invisible.
 * A unit is either one ordinary character, or one escape sequence: ESC followed by a single
 * char in 0x40..0x5f, or a CSI sequence (ESC '[' ...) up to and including its final char in
 * 0x40..0x7e. An escape sequence that directly follows is NOT consumed; call again for that.
 *
 *      s       start position; expected to point at an ESC or outside of any escape sequence
 *      invis   out: number of characters counted as invisible while stepping (0 for an
 *              ordinary character). Also set to 0 when s is empty.
 *
 * RETURNS: the position following the consumed unit, or s itself if is_empty(s).
 *          If u32_next() reports the end of the string while an escape sequence is still
 *          unfinished, its result (a null pointer) is passed on, so callers must check the
 *          result before dereferencing it.
 */
uint32_t *advance_next32(const uint32_t *s, size_t *invis)
{
    if (is_empty(s)) {
        /* Give the out parameter a defined value on this path, too. Callers add it to their counters. */
        if (invis != NULL) {
            (*invis) = 0;
        }
        return (uint32_t *) s;
    }

    int ansipos = 0;      /* 0 = no sequence yet, 1 = ESC seen, 2 = inside CSI sequence */
    (*invis) = 0;
    ucs4_t c;
    const uint32_t *rest = s;
    while ((rest = u32_next(&c, rest))) {
        if (ansipos == 0 && c == char_esc) {
            /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */
            (*invis)++;
            ansipos++;
        } else if (ansipos == 1 && c == '[') {
            /* Found '[' char after ESC. A CSI sequence has started. */
            (*invis)++;
            ansipos++;
        } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
            /* Found a char designating the end of a two-char escape sequence */
            (*invis)++;
            break;
        } else if (ansipos == 2) {
            /* Inside CSI sequence - Keep counting chars as invisible */
            (*invis)++;
            /* A char between 0x40 and 0x7e signals the end of a CSI sequence */
            if (c >= 0x40 && c <= 0x7e) {
                break;
            }
        } else {
            /*
             * An ordinary character. On the first iteration, this is the one character to step over.
             * After a lone ESC (ansipos == 1), this character has been consumed as well, but it is
             * not counted as invisible.
             */
            break;
        }
    }
    return (uint32_t *) rest;
}
uint32_t *advance32(uint32_t *s, const size_t offset) uint32_t *advance32(uint32_t *s, const size_t offset)
{ {
if (is_empty(s)) { if (is_empty(s)) {
@ -109,50 +151,28 @@ uint32_t *advance32(uint32_t *s, const size_t offset)
return s; return s;
} }
ucs4_t c; /* the current character we're looking at */ size_t count = 0; /* the count of visible characters */
const uint32_t *cStr = s; /* pointer to c in s */
size_t idx = 0; /* the count of visible characters */
const uint32_t *last_esc; /* pointer to the start of the last escape sequence encountered */
const uint32_t *rest = s; /* pointer to the next character coming up, needed only for u32_next() api */
int visible = 1; /* flag indicating whether the previous char was a visible char */ int visible = 1; /* flag indicating whether the previous char was a visible char */
int ansipos = 0; /* progression of ansi sequence */ const uint32_t *last_esc = NULL; /* pointer to the start of the last escape sequence encountered */
const uint32_t *rest = s; /* pointer to the next character coming up */
size_t step_invis = 0; /* unused, but required for advance_next32() call */
while ((rest = u32_next(&c, rest))) { for (ucs4_t c = s[0]; c != char_nul; c = rest[0]) {
if (ansipos == 0 && c == char_esc) { if (c == char_esc) {
/* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */ last_esc = rest;
last_esc = cStr;
visible = 0; visible = 0;
ansipos++;
} else if (ansipos == 1 && c == '[') {
/* Found '[' char after ESC. A CSI sequence has started. */
ansipos++;
visible = 0;
} else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
/* Found a char designating the end of a two-byte escape sequence */
visible = 0;
ansipos = 0;
} else if (ansipos == 2) {
/* Inside CSI sequence - Keep counting chars as invisible */
visible = 0;
/* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */
if (c >= 0x40 && c <= 0x7e) {
ansipos = 0;
}
} else { } else {
/* a visible char */ if (count++ == offset) {
if (idx == offset) { if (!visible && last_esc != NULL) {
if (!visible) {
return (uint32_t *) last_esc; return (uint32_t *) last_esc;
} }
return (uint32_t *) cStr; break;
} }
++idx;
visible = 1; visible = 1;
} }
cStr = rest; rest = advance_next32(rest, &step_invis);
} }
return new_empty_string32(); /* offset too large, not enough characters in string */ return (uint32_t *) rest; /* may point to zero terminator when offset too large */
} }

View File

@ -50,10 +50,28 @@ int is_ascii_printable(const ucs4_t c);
/** Return a freshly allocated empty UTF-32 string. */ /** Return a freshly allocated empty UTF-32 string. */
uint32_t *new_empty_string32(); uint32_t *new_empty_string32();
/**
* Return the next position in <s> in accordance with escape sequences. The result can be the next normal character,
* or again an escape sequence, if it directly follows the first.
*
* @param <s> The pointer to the start position. Is assumed to point either at the ESC at the start of an escape
* sequence, or to be positioned outside an escape sequence.
* @param <invis> Will contain the number of invisible characters skipped in order to get to the new position.
* This will be 0 unless <s> pointed to an ESC char, in which case it contains the length in characters of that
* escape sequence.
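* @return The next position; for an empty <s>, <s> itself is returned. The result can be 0 if the underlying
* u32_next() call reports the end of the string, e.g. when the string ends inside an unfinished escape
* sequence, so check the result before dereferencing it.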
*/
uint32_t *advance_next32(const uint32_t *s, size_t *invis);
/** /**
* Determine a new position in the given string s with the given offset of visible characters. * Determine a new position in the given string s with the given offset of visible characters.
* If the character right in front of the target character is invisible, then the pointer is moved to the start of * If the character right in front of the target character is invisible, then the pointer is moved to the start of
* that invisible sequence. The purpose is to catch any escape sequences which would for example color the character. * that invisible sequence. The purpose is to catch any escape sequences which would for example color the character.
*
* @param <s> The pointer to the start position. Is assumed to point either at the ESC at the start of an escape
* sequence, or to be positioned outside an escape sequence.
* @param <offset> the number of visible character positions to advance the pointer
* @return a pointer to the new position in s, or 0 if the end of the string was reached
*/ */
uint32_t *advance32(uint32_t *s, const size_t offset); uint32_t *advance32(uint32_t *s, const size_t offset);

View File

@ -10,7 +10,7 @@ Original Designer: (public domain)
Creation Date: March 18, 1999 (Thursday, 15:25h) Creation Date: March 18, 1999 (Thursday, 15:25h)
Current Revision: 1.0 as of March 18, 1999 (Thursday, 15:25h) Current Revision: 1.0 as of March 18, 1999 (Thursday, 15:25h)
Indentation Mode: box (indent box) Indentation Mode: box (indent box)
Replacement Rules: 1. (glob) "\*/" WITH "*\/" Replacement Rules: 1. (glob) "\*/" WITH "*\\/"
Reversion Rules: 1. (glob) "\*\\/" TO "*/" Reversion Rules: 1. (glob) "\*\\/" TO "*/"
Minimum Box Dimensions: 5 x 3 (width x height) Minimum Box Dimensions: 5 x 3 (width x height)
Default Padding: left 1, right 1 Default Padding: left 1, right 1

View File

@ -6,7 +6,7 @@
:OUTPUT-FILTER :OUTPUT-FILTER
:EXPECTED :EXPECTED
/********************************************/ /********************************************/
/* There is no complete theory of anything. */ /* There is no complete theory of anything. */
/*  Robert Anton Wilson */ /*  Robert Anton Wilson */
/********************************************/ /********************************************/
:EOF :EOF

View File

@ -5,7 +5,7 @@
:OUTPUT-FILTER :OUTPUT-FILTER
:EXPECTED :EXPECTED
/********************************************/ /********************************************/
/* There is no complete theory of anything. */ /* There is no complete theory of anything. */
/*  Robert Anton Wilson */ /*  Robert Anton Wilson */
/********************************************/ /********************************************/
:EOF :EOF