diff --git a/src/tools.c b/src/tools.c index 5689987..2398a12 100644 --- a/src/tools.c +++ b/src/tools.c @@ -343,7 +343,7 @@ void btrim(char *text, size_t *len) void btrim32(uint32_t *text, size_t *len) /* - * Remove trailing whitespace from line (unicode version). + * Remove trailing whitespace from line (unicode and escape sequence enabled version). * * text string to trim * len pointer to the length of the string in characters @@ -353,18 +353,34 @@ void btrim32(uint32_t *text, size_t *len) * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ { - int idx = (int) (*len - 1); + if (text == NULL || len == 0) { + return; + } - for (; idx >= 0; --idx) { - ucs4_t c = text[idx]; - if (uc_is_c_whitespace(c) || uc_is_property_white_space(c) || uc_is_property_bidi_whitespace(c)) { - set_char_at(text, idx, char_nul); - } else { - break; + const uint32_t *rest = text; + int last_char_pos = -1; + size_t step_invis; + + for (ucs4_t c = text[0]; c != char_nul; c = rest[0]) { + if (c != char_esc) { + if (!uc_is_c_whitespace(c) && !uc_is_property_white_space(c) && !uc_is_property_bidi_whitespace(c)) { + last_char_pos = (int) (rest - text); + } + } + rest = advance_next32(rest, &step_invis); + } + + /* If the last character is followed by an escape sequence, keep it (but only one). 
*/ + if (last_char_pos >= 0) { + rest = text + last_char_pos + 1; + if (rest[0] == char_esc) { + advance_next32(rest, &step_invis); + last_char_pos += step_invis; } } - *len = idx + 1; + set_char_at(text, (size_t) (last_char_pos + 1), char_nul); + *len = (size_t) (last_char_pos + 1); } @@ -546,7 +562,6 @@ void print_input_lines(const char *heading) static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii, size_t **posmap) { size_t invis = 0; /* counts invisible column positions */ - int ansipos = 0; /* progression of ansi sequence */ *num_esc = 0; /* counts the number of escape sequences found */ if (is_empty(s)) { @@ -562,42 +577,25 @@ static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **a (*ascii) = (char *) calloc(buflen, sizeof(char)); /* maybe a little too much, but certainly enough */ char *p = *ascii; - ucs4_t c; size_t mb_idx = 0; + size_t step_invis; const uint32_t *rest = s; - while ((rest = u32_next(&c, rest))) { + + for (ucs4_t c = s[0]; c != char_nul; c = rest[0]) { if (map_idx >= map_size - 4) { map_size = map_size * 2 + 1; map = (size_t *) realloc(map, map_size * sizeof(size_t)); } - if (ansipos == 0 && c == char_esc) { - /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */ - ansipos++; - invis++; + if (c == char_esc) { (*num_esc)++; - } else if (ansipos == 1 && c == '[') { - /* Found '[' char after ESC. A CSI sequence has started. 
*/ - ansipos++; - invis++; - } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) { - /* Found a byte designating the end of a two-byte escape sequence */ - invis++; - ansipos = 0; - } else if (ansipos == 2) { - /* Inside CSI sequence - Keep counting bytes as invisible */ - invis++; - - /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */ - if (c >= 0x40 && c <= 0x7e) { - ansipos = 0; - } - - } else if (is_ascii_printable(c)) { + } + else if (is_ascii_printable(c)) { *p = c & 0xff; map[map_idx++] = mb_idx; ++p; - } else { + } + else { int cols = uc_width(c, encoding); if (cols > 0) { memset(p, (int) 'x', cols); @@ -607,8 +605,13 @@ static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **a p += cols; } } - ++mb_idx; + + rest = advance_next32(rest, &step_invis); + + mb_idx += BMAX((size_t) 1, step_invis); + invis += step_invis; } + *p = '\0'; (*posmap) = map; return invis; diff --git a/src/unicode.c b/src/unicode.c index dafdfbd..57ef2c7 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -100,6 +100,48 @@ uint32_t *new_empty_string32() +uint32_t *advance_next32(const uint32_t *s, size_t *invis) +{ + if (is_empty(s)) { + return (uint32_t *) s; + } + + int ansipos = 0; + (*invis) = 0; + ucs4_t c; + const uint32_t *rest = s; + while ((rest = u32_next(&c, rest))) { + if (ansipos == 0 && c == char_esc) { + /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */ + (*invis)++; + ansipos++; + } else if (ansipos == 1 && c == '[') { + /* Found '[' char after ESC. A CSI sequence has started. 
*/ + (*invis)++; + ansipos++; + } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) { + /* Found a byte designating the end of a two-byte escape sequence */ + (*invis)++; + ansipos = 0; + break; + } else if (ansipos == 2) { + /* Inside CSI sequence - Keep counting chars as invisible */ + (*invis)++; + + /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */ + if (c >= 0x40 && c <= 0x7e) { + ansipos = 0; + break; + } + } else { + break; + } + } + return (uint32_t *) rest; +} + + + uint32_t *advance32(uint32_t *s, const size_t offset) { if (is_empty(s)) { @@ -109,50 +151,28 @@ uint32_t *advance32(uint32_t *s, const size_t offset) return s; } - ucs4_t c; /* the current character we're looking at */ - const uint32_t *cStr = s; /* pointer to c in s */ - size_t idx = 0; /* the count of visible characters */ - const uint32_t *last_esc; /* pointer to the start of the last escape sequence encountered */ - const uint32_t *rest = s; /* pointer to the next character coming up, needed only for u32_next() api */ - int visible = 1; /* flag indicating whether the previous char was a visible char */ - int ansipos = 0; /* progression of ansi sequence */ + size_t count = 0; /* the count of visible characters */ + int visible = 1; /* flag indicating whether the previous char was a visible char */ + const uint32_t *last_esc = NULL; /* pointer to the start of the last escape sequence encountered */ + const uint32_t *rest = s; /* pointer to the next character coming up */ + size_t step_invis = 0; /* unused, but required for advance_next32() call */ - while ((rest = u32_next(&c, rest))) { - if (ansipos == 0 && c == char_esc) { - /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */ - last_esc = cStr; + for (ucs4_t c = s[0]; c != char_nul; c = rest[0]) { + if (c == char_esc) { + last_esc = rest; visible = 0; - ansipos++; - } else if (ansipos == 1 && c == '[') { - /* Found '[' char after ESC. A CSI sequence has started. 
*/ - ansipos++; - visible = 0; - } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) { - /* Found a char designating the end of a two-byte escape sequence */ - visible = 0; - ansipos = 0; - } else if (ansipos == 2) { - /* Inside CSI sequence - Keep counting chars as invisible */ - visible = 0; - - /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */ - if (c >= 0x40 && c <= 0x7e) { - ansipos = 0; - } } else { - /* a visible char */ - if (idx == offset) { - if (!visible) { + if (count++ == offset) { + if (!visible && last_esc != NULL) { return (uint32_t *) last_esc; } - return (uint32_t *) cStr; + break; } - ++idx; visible = 1; } - cStr = rest; + rest = advance_next32(rest, &step_invis); } - return new_empty_string32(); /* offset too large, not enough characters in string */ + return (uint32_t *) rest; /* may point to zero terminator when offset too large */ } diff --git a/src/unicode.h b/src/unicode.h index 02d090a..cbfc14f 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -50,10 +50,28 @@ int is_ascii_printable(const ucs4_t c); /** Return a freshly allocated empty UTF-32 string. */ uint32_t *new_empty_string32(); +/** + * Return the next position in s in accordance with escape sequences. The result can be the next normal character, + * or again an escape sequence, if it directly follows the first. + * + * @param s The pointer to the start position. Is assumed to point either at the ESC at the start of an escape + * sequence, or to be positioned outside an escape sequence. + * @param invis Will contain the number of invisible characters skipped in order to get to the new position. + * This will be 0 unless s points to an ESC char, in which case it contains the length in characters of that + * escape sequence. + * @return The next position, or 0 if the end of the string was reached + */ +uint32_t *advance_next32(const uint32_t *s, size_t *invis); + /** * Determine a new position in the given string s with the given offset of visible characters. 
* If the character right in front of the target character is invisible, then the pointer is moved to the start of * that invisible sequence. The purpose is to catch any escape sequences which would for example color the character. + * + * @param The pointer to the start position. Is assumed to point either at the ESC at the start of an escape + * sequence, or to be positioned outside an escape sequence. + * @param the number of visible character positions to advance the pointer + * @return a pointer to the new position in s, or 0 if the end of the string was reached */ uint32_t *advance32(uint32_t *s, const size_t offset); diff --git a/test/083_list_design_info.txt b/test/083_list_design_info.txt index 33e5a9f..6effe77 100644 --- a/test/083_list_design_info.txt +++ b/test/083_list_design_info.txt @@ -10,7 +10,7 @@ Original Designer: (public domain) Creation Date: March 18, 1999 (Thursday, 15:25h) Current Revision: 1.0 as of March 18, 1999 (Thursday, 15:25h) Indentation Mode: box (indent box) -Replacement Rules: 1. (glob) "\*/" WITH "*\/" +Replacement Rules: 1. (glob) "\*/" WITH "*\\/" Reversion Rules: 1. (glob) "\*\\/" TO "*/" Minimum Box Dimensions: 5 x 3 (width x height) Default Padding: left 1, right 1 diff --git a/test/104_ansi_lolcat.txt b/test/104_ansi_lolcat.txt index d24817b..f5ae847 100644 --- a/test/104_ansi_lolcat.txt +++ b/test/104_ansi_lolcat.txt @@ -6,7 +6,7 @@ :OUTPUT-FILTER :EXPECTED /********************************************/ -/* There is no complete theory of anything. */ +/* There is no complete theory of anything. 
*/ /*  Robert Anton Wilson */ /********************************************/ :EOF diff --git a/test/105_ansi_lolcat_within_leading_space.txt b/test/105_ansi_lolcat_within_leading_space.txt index 1aff7bc..19bbd37 100644 --- a/test/105_ansi_lolcat_within_leading_space.txt +++ b/test/105_ansi_lolcat_within_leading_space.txt @@ -5,7 +5,7 @@ :OUTPUT-FILTER :EXPECTED /********************************************/ - /* There is no complete theory of anything. */ + /* There is no complete theory of anything. */ /*  Robert Anton Wilson */ /********************************************/ :EOF