From a75902679007bbdee76509ef644d379b6c5911c6 Mon Sep 17 00:00:00 2001 From: Thomas Jensen Date: Wed, 24 May 2023 21:00:49 +0200 Subject: [PATCH] Switch `input` structure entirely to bxstring --- src/boxes.in.h | 18 ++--- src/bxstring.c | 86 +++++++++++++++++++++- src/bxstring.h | 46 +++++++++++- src/generate.c | 19 +++-- src/input.c | 115 +++++++++++++++-------------- src/remove.c | 60 +++++++-------- src/tools.c | 86 +++++++--------------- src/tools.h | 20 +++++ src/unicode.c | 37 ++++++++++ src/unicode.h | 13 +++- utest/Makefile | 2 +- utest/bxstring_test.c | 166 ++++++++++++++++++++++++++++++++++++++++++ utest/bxstring_test.h | 9 +++ utest/main.c | 8 +- utest/unicode_test.c | 25 +++++++ utest/unicode_test.h | 1 + 16 files changed, 539 insertions(+), 172 deletions(-) diff --git a/src/boxes.in.h b/src/boxes.in.h index 95bfddf..ecc7962 100644 --- a/src/boxes.in.h +++ b/src/boxes.in.h @@ -20,7 +20,7 @@ #ifndef BOXES_H #define BOXES_H -#define DEBUG 1 +/* #define DEBUG 1 */ /* #define REGEXP_DEBUG 1 */ /* #define PARSER_DEBUG 1 */ /* #define LEXER_DEBUG 1 */ @@ -146,24 +146,16 @@ extern int color_output_enabled; typedef struct { - size_t len; /* length of visible text in columns (visible character positions in a text terminal), which is the same as the length of the 'text' field */ - char *text; /* ASCII line content, tabs expanded, ansi escapes removed, multi-byte chars replaced with one or more 'x' */ - size_t invis; /* number of invisble columns/characters (part of an ansi sequence) */ - - uint32_t *mbtext; /* multi-byte (original) line content, tabs expanded. We use UTF-32 in order to enable pointer arithmetic. 
*/ - size_t num_chars; /* total number of characters in mbtext, visible + invisible */ - uint32_t *mbtext_org; /* mbtext as originally allocated, so that we can free it again */ - + bxstr_t *text; /* text content of the line as a boxes string */ size_t *tabpos; /* tab positions in expanded work strings, or NULL if not needed */ size_t tabpos_len; /* number of tabs in a line */ - size_t *posmap; /* for each character in `text`, position of corresponding char in `mbtext`. Needed for box removal. */ } line_t; typedef struct { line_t *lines; size_t num_lines; /* number of entries in input */ - size_t maxline; /* length of longest input line */ - size_t indent; /* number of leading spaces found */ + size_t maxline; /* length in columns of longest input line */ + size_t indent; /* common number of leading spaces found in all input lines */ int final_newline; /* true if the last line of input ends with newline */ } input_t; @@ -172,4 +164,4 @@ extern input_t input; #endif /* BOXES_H */ -/*EOF*/ /* vim: set cindent sw=4: */ +/* vim: set cindent sw=4: */ diff --git a/src/bxstring.c b/src/bxstring.c index a05b35b..2ccd6eb 100644 --- a/src/bxstring.c +++ b/src/bxstring.c @@ -181,6 +181,13 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput) +bxstr_t *bxs_new_empty_string() +{ + return bxs_from_ascii(""); +} + + + bxstr_t *bxs_strdup(bxstr_t *pString) { if (pString == NULL) { @@ -329,6 +336,36 @@ uint32_t *bxs_strchr(bxstr_t *pString, ucs4_t c, int *cursor) +bxstr_t *bxs_cut_front(bxstr_t *pString, size_t n) +{ + if (pString == NULL) { + return NULL; + } + if (n >= pString->num_chars_visible) { + return bxs_new_empty_string(); + } + if (n == 0) { + return bxs_strdup(pString); + } + uint32_t *s = pString->memory + pString->first_char[n]; + return bxs_from_unicode(s); +} + + + +uint32_t *bxs_first_char_ptr(bxstr_t *pString, size_t n) +{ + if (pString == NULL) { + return NULL; + } + if (n >= pString->num_chars_visible) { + return pString->memory + 
pString->first_char[pString->num_chars_visible]; /* pointer to NUL terminator */ + } + return pString->memory + pString->first_char[n]; +} + + + bxstr_t *bxs_trim(bxstr_t *pString) { if (pString == NULL) { @@ -338,7 +375,7 @@ bxstr_t *bxs_trim(bxstr_t *pString) return bxs_strdup(pString); } if (pString->indent + pString->trailing == pString->num_chars_visible) { - return bxs_from_ascii(""); + return bxs_new_empty_string(); } uint32_t *e = u32_strdup(pString->memory); @@ -369,6 +406,37 @@ bxstr_t *bxs_rtrim(bxstr_t *pString) +void bxs_append_spaces(bxstr_t *pString, size_t n) +{ + if (pString == NULL || n == 0) { + return; + } + + pString->memory = (uint32_t *) realloc(pString->memory, (pString->num_chars + n + 1) * sizeof(uint32_t)); + u32_set(pString->memory + pString->num_chars, char_space, n); + set_char_at(pString->memory, pString->num_chars + n, char_nul); + + pString->ascii = (char *) realloc(pString->ascii, (pString->num_columns + n + 1) * sizeof(char)); + memset(pString->ascii + pString->num_columns, ' ', n); + pString->ascii[pString->num_columns + n] = '\0'; + + pString->first_char = + (size_t *) realloc(pString->first_char, (pString->num_chars_visible + n + 1) * sizeof(size_t)); + pString->visible_char = + (size_t *) realloc(pString->visible_char, (pString->num_chars_visible + n + 1) * sizeof(size_t)); + for (size_t i = 0; i <= n; i++) { + pString->first_char[pString->num_chars_visible + i] = pString->num_chars + i; + pString->visible_char[pString->num_chars_visible + i] = pString->num_chars + i; + } + + pString->num_chars += n; + pString->num_chars_visible += n; + pString->num_columns += n; + pString->trailing += n; +} + + + char *bxs_to_output(bxstr_t *pString) { if (pString == NULL) { @@ -397,6 +465,22 @@ int bxs_is_empty(bxstr_t *pString) +int bxs_is_blank(bxstr_t *pString) +{ + if (bxs_is_empty(pString)) { + return 1; + } + for (size_t i = 0; i < pString->num_chars_visible; i++) { + ucs4_t c = pString->memory[pString->visible_char[i]]; + if (c != 
char_tab && c != char_cr && !uc_is_blank(c)) { + return 0; + } + } + return 1; +} + + + int bxs_is_visible_char(bxstr_t *pString, size_t idx) { int result = 0; diff --git a/src/bxstring.h b/src/bxstring.h index bb6eb5f..40913d0 100644 --- a/src/bxstring.h +++ b/src/bxstring.h @@ -83,6 +83,13 @@ bxstr_t *bxs_from_ascii(char *pAscii); bxstr_t *bxs_from_unicode(uint32_t *pInput); +/** + * Return a freshly allocated empty string. + * @return a new empty string + */ +bxstr_t *bxs_new_empty_string(); + + /** * Create an exact copy of a string. * @param pString the string to copy @@ -134,6 +141,26 @@ bxstr_t *bxs_concat(size_t count, ...); uint32_t *bxs_strchr(bxstr_t *pString, ucs4_t c, int *cursor); +/** + * Remove the first `n` visible characters from the given string. Invisible characters are additionally removed where + * they are associated with the removed visible characters. + * @param pString the string to shorten + * @param n the number of visible characters to cut from the front of the string. If this is greater than or equal to + * the total number of visible characters in the string, an empty string is produced + * @return a new, shortened string, or NULL only if `pString` was NULL + */ +bxstr_t *bxs_cut_front(bxstr_t *pString, size_t n); + + +/** + * Return a pointer into the string's `memory` to the first character belonging to the visible character with index `n`. + * @param pString the string to use + * @param n the index of the visible character (zero-based) + * @return a pointer into existing memory (the NUL terminator if `n` is out of range), or NULL if `pString` was NULL + */ +uint32_t *bxs_first_char_ptr(bxstr_t *pString, size_t n); + + /** * Create a new string from which all leading and trailing whitespace have been removed. * @param pString the string to trim, which will not be modified @@ -150,6 +177,14 @@ bxstr_t *bxs_trim(bxstr_t *pString); bxstr_t *bxs_rtrim(bxstr_t *pString); +/** + * Append `n` spaces to the end of the given string. The given string is *modified* accordingly. 
+ * @param pString the string to modify + * @param n the number of spaces to add + */ +void bxs_append_spaces(bxstr_t *pString, size_t n); + + /** * Convert the string into boxes' output encoding for proper printing on stdout. * @param pString the string to convert @@ -159,13 +194,22 @@ char *bxs_to_output(bxstr_t *pString); /** - * Determine if the given string is empty. + * Determine if the given string is empty, which means it does not contain any characters at all (neither visible nor + * invisible). * @param pString the string to check (may be NULL, which counts as empty) * @return 1 for empty, 0 for not empty */ int bxs_is_empty(bxstr_t *pString); +/** + * Determine if the line is either empty or its visible characters are all whitespace. + * @param pString the string to check (may be NULL, which counts as blank) + * @return 1 for blank, 0 for not blank + */ +int bxs_is_blank(bxstr_t *pString); + + /** * Determine if the character at position `idx` in the given `pString` is a visible character. If `idx` is out of * bounds, this function will return 0. diff --git a/src/generate.c b/src/generate.c index 000b15c..5fd7250 100644 --- a/src/generate.c +++ b/src/generate.c @@ -789,14 +789,16 @@ static int justify_line(line_t *line, int skew) } #if defined(DEBUG) || 0 - fprintf (stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d", - opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_output(line->mbtext), - (int) line->len, line->text, (int) line->invis, skew); + char *outtext = bxs_to_output(line->text); + fprintf(stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d", + opt.justify ? 
opt.justify : '0', (int) line->text->num_chars, outtext, (int) line->text->num_columns, + line->text->ascii, (int) line->text->num_chars_invisible, skew); + BFREE(outtext); #endif int result = 0; - size_t initial_space_size = strspn(line->text, " \t"); - size_t newlen = line->len - initial_space_size; + size_t initial_space_size = line->text->indent; + size_t newlen = line->text->num_columns - initial_space_size; size_t shift; switch (opt.justify) { @@ -1043,16 +1045,17 @@ int output_box(const sentry_t *thebox) if (ti < (long) input.num_lines) { /* box content (lines) */ int shift = justify_line(input.lines + ti, hpr - hpl); restored_indent = tabbify_indent(ti, indentspc, indentspclen); - uint32_t *mbtext_shifted = advance32(input.lines[ti].mbtext, shift < 0 ? (size_t) (-shift) : 0); + bxstr_t *text_shifted = bxs_cut_front(input.lines[ti].text, shift < 0 ? (size_t) (-shift) : 0); uint32_t *spc1 = empty_string; if (ti >= 0 && shift > 0) { spc1 = u32_nspaces(shift); } - uint32_t *spc2 = u32_nspaces(input.maxline - input.lines[ti].len - shift); + uint32_t *spc2 = u32_nspaces(input.maxline - input.lines[ti].text->num_columns - shift); obuf = bxs_concat(8, restored_indent, skip_left ? empty_string : thebox[BLEF].mbcs[j]->memory, hfill1, spc1, - ti >= 0 ? mbtext_shifted : empty_string, hfill2, spc2, + ti >= 0 ? 
text_shifted->memory : empty_string, hfill2, spc2, thebox[BRIG].mbcs[j]->memory); + bxs_free(text_shifted); if (spc1 != empty_string) { BFREE(spc1); } diff --git a/src/input.c b/src/input.c index c26adff..3554ae3 100644 --- a/src/input.c +++ b/src/input.c @@ -18,16 +18,17 @@ */ #include "config.h" + #include #include #include #include #include "boxes.h" +#include "input.h" #include "regulex.h" #include "tools.h" #include "unicode.h" -#include "input.h" @@ -61,8 +62,8 @@ static int has_linebreak(const uint32_t *s, const int len) */ static int get_indent(const line_t *lines, const size_t lines_size) { - int res = LINE_MAX_BYTES; /* result */ - int nonblank = 0; /* true if one non-blank line found */ + int res = LINE_MAX_BYTES; /* result */ + int nonblank = 0; /* true if one non-blank line found */ if (lines == NULL) { fprintf(stderr, "%s: internal error\n", PROJECT); @@ -73,9 +74,9 @@ static int get_indent(const line_t *lines, const size_t lines_size) } for (size_t j = 0; j < lines_size; ++j) { - if (lines[j].len > 0) { + if (lines[j].text->num_columns > 0) { nonblank = 1; - size_t ispc = strspn(lines[j].text, " "); + size_t ispc = lines[j].text->indent; if ((int) ispc < res) { res = ispc; } @@ -83,9 +84,10 @@ static int get_indent(const line_t *lines, const size_t lines_size) } if (nonblank) { - return res; /* success */ - } else { - return 0; /* success, but only blank lines */ + return res; /* success */ + } + else { + return 0; /* success, but only blank lines */ } } @@ -140,28 +142,34 @@ int apply_substitutions(input_t *result, const int mode) opt.design->current_rule = rules; for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) { #ifdef REGEXP_DEBUG - fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ", - rules[j].prog, rules[j].repstr, u32_strconv_to_output(result->lines[k].mbtext), - (int) result->lines[k].num_chars, rules[j].mode); + char *outtext = bxs_to_output(result->lines[k].text); + char *outrepstr = 
bxs_to_output(rules[j].repstr); + fprintf(stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ", rules[j].prog, outrepstr, + outtext, (int) result->lines[k].text->num_chars, rules[j].mode); + BFREE(outtext); + BFREE(outrepstr); #endif - uint32_t *newtext = u32_regex_replace(rules[j].prog, rules[j].repstr->memory, - result->lines[k].mbtext, result->lines[k].num_chars, rules[j].mode == 'g'); + uint32_t *newtext = u32_regex_replace(rules[j].prog, rules[j].repstr->memory, result->lines[k].text->memory, + result->lines[k].text->num_chars, rules[j].mode == 'g'); #ifdef REGEXP_DEBUG - fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_output(newtext) : "NULL"); + char *outnewtext = newtext ? u32_strconv_to_output(newtext) : strdup("NULL"); + fprintf(stderr, "\"%s\"\n", outnewtext); + BFREE(outnewtext); #endif if (newtext == NULL) { return 1; } - BFREE(result->lines[k].mbtext_org); /* original address allocated for mbtext */ - result->lines[k].mbtext = newtext; - result->lines[k].mbtext_org = newtext; + bxs_free(result->lines[k].text); + result->lines[k].text = bxs_from_unicode(newtext); - analyze_line_ascii(result, result->lines + k); + analyze_line_ascii(result, result->lines + k); /* update maxline value */ #ifdef REGEXP_DEBUG - fprintf (stderr, "result->lines[%d] == {%d, \"%s\"}\n", (int) k, - (int) result->lines[k].num_chars, u32_strconv_to_output(result->lines[k].mbtext)); + char *outtext2 = bxs_to_output(result->lines[k].text); + fprintf(stderr, "result->lines[%d] == {%d, \"%s\"}\n", (int) k, (int) result->lines[k].text->num_chars, + outtext2); + BFREE(outtext2); #endif } opt.design->current_rule = NULL; @@ -176,7 +184,8 @@ int apply_substitutions(input_t *result, const int mode) rc = get_indent(result->lines, result->num_lines); if (rc >= 0) { result->indent = (size_t) rc; - } else { + } + else { return 4; } } @@ -209,20 +218,19 @@ static void trim_trailing_ws_carefully(uint32_t *mbtemp, size_t *len_chars) input_t *read_all_input() { - char 
buf[LINE_MAX_BYTES + 3]; /* static input buffer incl. newline + zero terminator */ - size_t input_size = 0; /* number of elements allocated */ + char buf[LINE_MAX_BYTES + 3]; /* static input buffer incl. newline + zero terminator */ + size_t input_size = 0; /* number of elements allocated */ input_t *result = (input_t *) calloc(1, sizeof(input_t)); result->indent = LINE_MAX_BYTES; - while (fgets(buf, LINE_MAX_BYTES + 2, opt.infile)) - { + while (fgets(buf, LINE_MAX_BYTES + 2, opt.infile)) { if (result->num_lines % 100 == 0) { input_size += 100; line_t *tmp = (line_t *) realloc(result->lines, input_size * sizeof(line_t)); if (tmp == NULL) { perror(PROJECT); - BFREE (result->lines); + BFREE(result->lines); return NULL; } result->lines = tmp; @@ -240,30 +248,27 @@ input_t *read_all_input() */ if (len_chars > 0) { uint32_t *temp = NULL; - len_chars = expand_tabs_into(mbtemp, opt.tabstop, &temp, - &(result->lines[result->num_lines].tabpos), - &(result->lines[result->num_lines].tabpos_len)); + len_chars = expand_tabs_into(mbtemp, opt.tabstop, &temp, &(result->lines[result->num_lines].tabpos), + &(result->lines[result->num_lines].tabpos_len)); if (len_chars == 0) { perror(PROJECT); - BFREE (result->lines); + BFREE(result->lines); return NULL; } - result->lines[result->num_lines].mbtext = temp; - BFREE(mbtemp); - temp = NULL; + result->lines[result->num_lines].text = bxs_from_unicode(temp); + BFREE(temp); } else { - result->lines[result->num_lines].mbtext = mbtemp; + result->lines[result->num_lines].text = bxs_new_empty_string(); } - result->lines[result->num_lines].mbtext_org = result->lines[result->num_lines].mbtext; - result->lines[result->num_lines].num_chars = len_chars; + BFREE(mbtemp); ++result->num_lines; } if (ferror(stdin)) { perror(PROJECT); - BFREE (result->lines); + BFREE(result->lines); return NULL; } return result; @@ -296,7 +301,8 @@ int analyze_input(input_t *result) int rc = get_indent(result->lines, result->num_lines); if (rc >= 0) { result->indent = 
(size_t) rc; - } else { + } + else { return 1; } @@ -306,21 +312,23 @@ int analyze_input(input_t *result) */ if (opt.design->indentmode != 't' && opt.r == 0) { for (size_t i = 0; i < result->num_lines; ++i) { - #ifdef DEBUG - fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i, - u32_strconv_to_output(result->lines[i].mbtext), (int) result->lines[i].num_chars); - #endif - if (result->lines[i].num_chars >= result->indent) { - memmove(result->lines[i].text, result->lines[i].text + result->indent, - result->lines[i].len - result->indent + 1); - result->lines[i].len -= result->indent; - - result->lines[i].mbtext = advance32(result->lines[i].mbtext, result->indent); - result->lines[i].num_chars -= result->indent; + if (result->lines[i].text->num_columns >= result->indent) { + /* + * We should really remove *columns* rather than *characters*, but since the removed characters are + * spaces (indentation), and there are no double-wide spaces in Unicode, both actions are equivalent. + */ + bxstr_t *unindented = bxs_cut_front(result->lines[i].text, result->indent); + bxs_free(result->lines[i].text); + result->lines[i].text = unindented; } #ifdef DEBUG - fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i, - u32_strconv_to_output(result->lines[i].mbtext), (int) result->lines[i].num_chars); + char *outtext = bxs_to_output(result->lines[i].text); + fprintf(stderr, "%2d: text = \"%s\" (%d chars, %d visible, %d invisible, %d columns)\n" + " ascii = \"%s\"\n", (int) i, outtext, + (int) result->lines[i].text->num_chars, (int) result->lines[i].text->num_chars_visible, + (int) result->lines[i].text->num_chars_invisible, (int) result->lines[i].text->num_columns, + result->lines[i].text->ascii); + BFREE(outtext); #endif } result->maxline -= result->indent; @@ -336,10 +344,11 @@ int analyze_input(input_t *result) } #ifdef DEBUG - fprintf (stderr, "Effective encoding: %s\n", encoding); + fprintf(stderr, "Effective encoding: %s\n", encoding); print_input_lines(NULL); 
#endif return 0; } -/*EOF*/ /* vim: set sw=4: */ + +/* vim: set sw=4: */ diff --git a/src/remove.c b/src/remove.c index 1f89bdf..235c5ef 100644 --- a/src/remove.c +++ b/src/remove.c @@ -53,11 +53,10 @@ static int best_match(const line_t *line, size_t k; /* line counter within shape */ int w; /* shape counter */ sentry_t *cs; /* current shape */ - char *s; /* duplicate of current shape part */ - char *p; /* position found by strstr */ + uint32_t *s; /* duplicate of current shape part */ + uint32_t *p; /* position found by u32_strstr() */ size_t cq; /* current quality */ - char *q; /* space check rover */ - line_t chkline; /* for calls to empty_line() */ + uint32_t *q; /* space check rover */ size_t quality; *ws = *we = *es = *ee = NULL; @@ -87,13 +86,11 @@ static int best_match(const line_t *line, cs = opt.design->shape + west_side[--w]; } - chkline.text = cs->chars[k]; - chkline.len = cs->width; - if (empty_line(&chkline) && !(quality == 0 && j == numw - 1)) { + if (bxs_is_blank(cs->mbcs[k]) && !(quality == 0 && j == numw - 1)) { continue; } - s = (char *) strdup(cs->chars[k]); + s = u32_strdup(cs->mbcs[k]->memory); if (s == NULL) { perror(PROJECT); return -1; @@ -101,10 +98,10 @@ static int best_match(const line_t *line, cq = cs->width; do { - p = strstr(line->text, s); + p = u32_strstr(line->text->memory, s); if (p) { q = p - 1; - while (q >= line->text) { + while (q >= line->text->memory) { if (*q-- != ' ') { p = NULL; break; @@ -115,10 +112,10 @@ static int best_match(const line_t *line, } } if (!p && cq) { - if (*s == ' ') { - memmove(s, s + 1, cq--); - } else if (s[cq - 1] == ' ') { - s[--cq] = '\0'; + if (*s == char_space) { + u32_move(s, s + 1, cq--); + } else if (s[cq - 1] == char_space) { + s[--cq] = char_nul; } else { cq = 0; break; @@ -162,13 +159,11 @@ static int best_match(const line_t *line, BFREE(mbcs_temp); #endif - chkline.text = cs->chars[k]; - chkline.len = cs->width; - if (empty_line(&chkline)) { + if (bxs_is_blank(cs->mbcs[k])) { continue; } - 
s = (char *) strdup(cs->chars[k]); + s = u32_strdup(cs->mbcs[k]->memory); if (s == NULL) { perror(PROJECT); return -1; @@ -176,7 +171,7 @@ static int best_match(const line_t *line, cq = cs->width; do { - p = my_strnrstr(line->text, s, cq, 0); + p = u32_strnrstr(line->text->memory, s, cq, 0); if (p) { q = p + cq; while (*q) { @@ -191,7 +186,7 @@ static int best_match(const line_t *line, } if (!p && cq) { if (*s == ' ') { - memmove(s, s + 1, cq--); + u32_move(s, s + 1, cq--); } else if (s[cq - 1] == ' ') { s[--cq] = '\0'; } else { @@ -558,7 +553,7 @@ static design_t *detect_design() break; } for (j = 0; j < d->shape[scnt].height; ++j) { - shpln.text = d->shape[scnt].chars[j]; // TODO HERE + shpln.text = d->shape[scnt].chars[j]; // TODO shpln.len = d->shape[scnt].width; if (empty_line(&shpln)) { continue; @@ -803,11 +798,7 @@ static void add_spaces_to_line(line_t* line, const size_t n) if (n == 0) { return; } - line->mbtext_org = (uint32_t *) realloc(line->mbtext_org, (line->num_chars + n + 1) * sizeof(uint32_t)); - line->mbtext = line->mbtext_org; - u32_set(line->mbtext + line->num_chars, char_space, n); - set_char_at(line->mbtext, line->num_chars + n, char_nul); - line->num_chars += n; + bxs_append_spaces(line->text, n); analyze_line_ascii(&input, line); } @@ -860,7 +851,7 @@ int remove_box() */ const size_t normalized_len = input.maxline + opt.design->shape[NE].width; for (j = 0; j < input.num_lines; ++j) { - add_spaces_to_line(input.lines + j, normalized_len - input.lines[j].len); + add_spaces_to_line(input.lines + j, normalized_len - input.lines[j].text->num_columns); } #ifdef DEBUG fprintf(stderr, "Normalized all lines to %d columns (maxline + east width).\n", (int) input.maxline); @@ -930,7 +921,7 @@ int remove_box() fprintf(stderr, "\033[00;33;01mline %2d: no side match\033[00m\n", (int) j); #endif } - else { + else { // TODO HERE #ifdef DEBUG fprintf(stderr, "\033[00;33;01mline %2d: west: %d (\'%c\') to %d (\'%c\') [len %d]; " "east: %d (\'%c\') to %d 
(\'%c\') [len %d]\033[00m\n", (int) j, @@ -1070,15 +1061,16 @@ void output_input(const int trim_only) if (input.lines[j].text == NULL) { continue; } - btrim(input.lines[j].text, &(input.lines[j].len)); - btrim32(input.lines[j].mbtext, &(input.lines[j].num_chars)); + bxstr_t *temp = bxs_rtrim(input.lines[j].text); + bxs_free(input.lines[j].text); + input.lines[j].text = temp; if (trim_only) { continue; } char *indentspc = NULL; if (opt.tabexp == 'u') { - indent = strspn(input.lines[j].text, " "); + indent = input.lines[j].text->indent; ntabs = indent / opt.tabstop; nspcs = indent % opt.tabstop; indentspc = (char *) malloc(ntabs + nspcs + 1); @@ -1101,9 +1093,11 @@ void output_input(const int trim_only) indent = 0; } - fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_output(advance32(input.lines[j].mbtext, indent)), + char *outtext = u32_strconv_to_output(bxs_first_char_ptr(input.lines[j].text, indent)); + fprintf(opt.outfile, "%s%s%s", indentspc, outtext, (input.final_newline || j < input.num_lines - 1 ? opt.eol : "")); - BFREE (indentspc); + BFREE(outtext); + BFREE(indentspc); } } diff --git a/src/tools.c b/src/tools.c index 58a8de7..fab0a45 100644 --- a/src/tools.c +++ b/src/tools.c @@ -273,54 +273,17 @@ char *repeat(char *s, size_t count) int empty_line(const line_t *line) -/* - * Return true if line is empty. - * - * Empty lines either consist entirely of whitespace or don't exist. - * -* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - */ { - char *p; - size_t j; - if (!line) { return 1; } - if (line->text == NULL || line->len <= 0) { - return 1; - } - - for (p = line->text, j = 0; *p && j < line->len; ++j, ++p) { - if (*p != ' ' && *p != '\t' && *p != '\r') { - return 0; - } - } - return 1; + return bxs_is_blank(line->text); } size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len) -/* - * Expand tab chars in input_buffer and store result in text. 
- * - * input_buffer Line of text with tab chars - * tabstop tab stop distance - * text address of the pointer that will take the result - * tabpos array of ints giving the positions of the first - * space of an expanded tab in the text result buffer - * tabpos_len number of tabs recorded in tabpos - * - * Memory will be allocated for text and tabpos. - * Should only be called for lines of length > 0; - * - * RETURNS: Success: Length of the result line in characters (> 0) - * Error: 0 (e.g. out of memory) - * -* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - */ { static uint32_t temp[LINE_MAX_BYTES + 100]; /* work string */ size_t io; /* character position in work string */ @@ -574,9 +537,11 @@ void print_input_lines(const char *heading) fprintf(stderr, "Input Lines%s:\n", heading != NULL ? heading : ""); fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n"); for (size_t i = 0; i < input.num_lines; ++i) { + char *outtext = bxs_to_output(input.lines[i].text); fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i, - (int) input.lines[i].num_chars, u32_strconv_to_output(input.lines[i].mbtext), - (int) input.lines[i].len, input.lines[i].text); + (int) input.lines[i].text->num_chars, outtext, + (int) input.lines[i].text->num_columns, input.lines[i].text->ascii); + BFREE(outtext); fprintf(stderr, "\tTabs: ["); if (input.lines[i].tabpos != NULL) { for (size_t j = 0; j < input.lines[i].tabpos_len; ++j) { @@ -587,13 +552,27 @@ void print_input_lines(const char *heading) } } fprintf(stderr, "] (%d)", (int) input.lines[i].tabpos_len); - fprintf(stderr, "\tinvisible=%d\n", (int) input.lines[i].invis); + fprintf(stderr, "\tinvisible=%d\n", (int) input.lines[i].text->num_chars_invisible); - fprintf(stderr, " posmap="); - if (input.lines[i].posmap != NULL) { + fprintf(stderr, " visible_char="); + if (input.lines[i].text->visible_char != NULL) { fprintf(stderr, "["); - for (size_t j = 0; j < input.lines[i].len; j++) { - 
fprintf(stderr, "%d%s", (int) input.lines[i].posmap[j], j == (input.lines[i].len - 1) ? "" : ", "); + for (size_t j = 0; j < input.lines[i].text->num_chars_visible; j++) { + fprintf(stderr, "%d%s", (int) input.lines[i].text->visible_char[j], + j == (input.lines[i].text->num_chars_visible - 1) ? "" : ", "); + } + fprintf(stderr, "]\n"); + } + else { + fprintf(stderr, "null\n"); + } + + fprintf(stderr, " first_char="); + if (input.lines[i].text->first_char != NULL) { + fprintf(stderr, "["); + for (size_t j = 0; j < input.lines[i].text->num_chars_visible; j++) { + fprintf(stderr, "%d%s", (int) input.lines[i].text->first_char[j], + j == (input.lines[i].text->num_chars_visible - 1) ? "" : ", "); } fprintf(stderr, "]\n"); } @@ -720,21 +699,8 @@ int is_csi_reset(const uint32_t *csi) void analyze_line_ascii(input_t *input_ptr, line_t *line) { - size_t num_esc = 0; - char *ascii; - size_t *map; - size_t invis = count_invisible_chars(line->mbtext, &num_esc, &ascii, &(map)); - line->invis = invis; - /* u32_strwidth() does not count control characters, i.e. ESC characters, for which we must correct */ - line->len = u32_strwidth(line->mbtext, encoding) - invis + num_esc; - line->num_chars = u32_strlen(line->mbtext); - BFREE(line->text); - line->text = ascii; - BFREE(line->posmap); - line->posmap = map; - - if (line->len > input_ptr->maxline) { - input_ptr->maxline = line->len; + if (line->text->num_columns > input_ptr->maxline) { + input_ptr->maxline = line->text->num_columns; } } diff --git a/src/tools.h b/src/tools.h index 12a1202..a010b9f 100644 --- a/src/tools.h +++ b/src/tools.h @@ -52,9 +52,29 @@ typedef void (*bx_fprintf_t)(FILE *stream, const char *format, ...); */ extern bx_fprintf_t bx_fprintf; + +/* + * Return true (1) if line is empty. + * Empty lines either consist entirely of whitespace or don't exist. 
+ * @param line the line to check + * @return 1 if line is empty, 0 if it isn't + */ int empty_line(const line_t *line); +/** + * Expand tab chars in `input_buffer` and store result in `text`. + * Memory will be allocated for text and tabpos. + * Should only be called for lines of length > 0; + * + * @param input_buffer Line of text with tab chars + * @param tabstop tab stop distance as per command line options + * @param text address of the pointer that will take the result of this function + * @param tabpos array of ints giving the positions of the first space of an expanded tab in the text result buffer + * @param tabpos_len number of tabs recorded in tabpos + * @return Success: Length of the result line in characters (> 0); + * Error: 0 (e.g. out of memory) + */ size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len); diff --git a/src/unicode.c b/src/unicode.c index aef1040..c93bb8a 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -334,4 +334,41 @@ uint32_t *u32_nspaces(const size_t n) } + +uint32_t *u32_strnrstr(const uint32_t *s1, const uint32_t *s2, const size_t s2_len, int skip) +{ + if (is_empty(s2)) { + return (uint32_t *) s1; + } + if (is_empty(s1)) { + return NULL; + } + if (skip < 0) { + skip = 0; + } + + uint32_t *p = u32_strrchr(s1, s2[0]); + if (!p) { + return NULL; + } + + while (p >= s1) { + int comp = u32_strncmp(p, s2, s2_len); + if (comp == 0) { + if (skip--) { + --p; + } + else { + return p; + } + } + else { + --p; + } + } + + return NULL; +} + + /* vim: set cindent sw=4: */ diff --git a/src/unicode.h b/src/unicode.h index 0934eb4..bad777c 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -227,6 +227,17 @@ char *to_utf8(uint32_t *src); uint32_t *u32_nspaces(const size_t n); +/** + * Return pointer to last occurrence of string `s2` in string `s1`. 
+ * @param s1 string to search + * @param s2 string to search for in `s1` + * @param s2_len length in characters of `s2` + * @param skip number of finds to ignore before returning anything + * @return pointer to last occurrence of string `s2` in string `s1`; NULL if not found or error + */ +uint32_t *u32_strnrstr(const uint32_t *s1, const uint32_t *s2, const size_t s2_len, int skip); + + #endif -/*EOF*/ /* vim: set cindent sw=4: */ +/* vim: set cindent sw=4: */ diff --git a/utest/Makefile b/utest/Makefile index 4e74b73..074dd5d 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -72,5 +72,5 @@ cmdline_test.o: cmdline_test.c cmdline_test.h boxes.h cmdline.h global_mock.h t tools_test.o: tools_test.c tools_test.h tools.h unicode.h config.h | check_dir regulex_test.o: regulex_test.c regulex_test.h boxes.h global_mock.h regulex.h config.h | check_dir main.o: main.c bxstring_test.h cmdline_test.h global_mock.h tools_test.h regulex_test.h unicode_test.h config.h | check_dir -unicode_test.o: unicode_test.c unicode_test.h boxes.h unicode.h config.h | check_dir +unicode_test.o: unicode_test.c unicode_test.h boxes.h tools.h unicode.h config.h | check_dir utest_tools.o: utest_tools.c utest_tools.h config.h | check_dir diff --git a/utest/bxstring_test.c b/utest/bxstring_test.c index d481c0e..3843c19 100644 --- a/utest/bxstring_test.c +++ b/utest/bxstring_test.c @@ -353,6 +353,32 @@ void test_ansi_unicode_null(void **state) +void test_bxs_new_empty_string(void **state) +{ + UNUSED(state); + + bxstr_t *actual = bxs_new_empty_string(); + + assert_non_null(actual); + assert_non_null(actual->memory); + assert_int_equal(1, is_char_at(actual->memory, 0, char_nul)); + assert_string_equal("", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(0, (int) actual->num_columns); + assert_int_equal(0, (int) actual->num_chars); + assert_int_equal(0, (int) actual->num_chars_visible); + assert_int_equal(0, (int) actual->num_chars_invisible); + assert_int_equal(0, 
(int) actual->trailing); + int expected_firstchar_idx[] = {0}; + assert_array_equal(expected_firstchar_idx, actual->first_char, 1); + int expected_vischar_idx[] = {0}; + assert_array_equal(expected_vischar_idx, actual->visible_char, 1); + + bxs_free(actual); +} + + + void test_bxs_strdup(void **state) { UNUSED(state); @@ -386,6 +412,81 @@ void test_bxs_strdup(void **state) +void test_bxs_cut_front(void **state) /* covers NULL input, a cut in the middle of a colored string, and a cut longer than the string */ +{ + UNUSED(state); + + bxstr_t *actual = bxs_cut_front(NULL, 1); /* NULL in, NULL out */ + assert_null(actual); + + uint32_t *ustr32 = u32_strconv_from_arg(" x\x1b[38;5;203mx\x1b[0m\x1b[38;5;198mf\x1b[0moo", "ASCII"); + assert_non_null(ustr32); + bxstr_t *input = bxs_from_unicode(ustr32); + actual = bxs_cut_front(input, 3); /* removes the three leading visible characters, i.e. the blank and both x */ + + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal("foo", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(3, (int) actual->num_columns); + assert_int_equal(18, (int) actual->num_chars); + assert_int_equal(3, (int) actual->num_chars_visible); + assert_int_equal(15, (int) actual->num_chars_invisible); /* the two escape sequences around f remain: 11 + 4 characters */ + assert_int_equal(0, (int) actual->trailing); + int expected_firstchar_idx[] = {0, 16, 17, 18}; + assert_array_equal(expected_firstchar_idx, actual->first_char, 4); + int expected_vischar_idx[] = {11, 16, 17, 18}; + assert_array_equal(expected_vischar_idx, actual->visible_char, 4); + bxs_free(actual); + + actual = bxs_cut_front(input, 1000); /* n beyond the end of the string yields an empty string, not NULL */ + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal("", actual->ascii); + assert_int_equal(0, (int) actual->indent); + assert_int_equal(0, (int) actual->num_columns); + assert_int_equal(0, (int) actual->num_chars); + assert_int_equal(0, (int) actual->num_chars_visible); + assert_int_equal(0, (int) actual->num_chars_invisible); + assert_int_equal(0, (int) actual->trailing); + bxs_free(actual); + + BFREE(ustr32); + bxs_free(input); +} + + + +void test_bxs_cut_front_zero(void **state) /* a cut of zero characters must return an unchanged copy */ +{ + UNUSED(state); + + uint32_t *ustr32 =
u32_strconv_from_arg(" x\x1b[38;5;203mx\x1b[0m\x1b[38;5;198mf\x1b[0moo", "ASCII"); + assert_non_null(ustr32); + bxstr_t *input = bxs_from_unicode(ustr32); + + bxstr_t *actual = bxs_cut_front(input, 0); /* n == 0: the result must equal the input in every field */ + assert_non_null(actual); + assert_non_null(actual->memory); + assert_string_equal(" xxfoo", actual->ascii); + assert_int_equal(1, (int) actual->indent); + assert_int_equal(6, (int) actual->num_columns); + assert_int_equal(36, (int) actual->num_chars); + assert_int_equal(6, (int) actual->num_chars_visible); + assert_int_equal(30, (int) actual->num_chars_invisible); + assert_int_equal(0, (int) actual->trailing); + int expected_firstchar_idx[] = {0, 1, 2, 18, 34, 35, 36}; + assert_array_equal(expected_firstchar_idx, actual->first_char, 7); + int expected_vischar_idx[] = {0, 1, 13, 29, 34, 35, 36}; + assert_array_equal(expected_vischar_idx, actual->visible_char, 7); + + bxs_free(actual); + BFREE(ustr32); + bxs_free(input); +} + + + void test_bxs_trimdup_null(void **state) { UNUSED(state); @@ -876,6 +977,37 @@ void test_bxs_rtrim_empty(void **state) +void test_bxs_append_spaces(void **state) +{ + UNUSED(state); + + bxs_append_spaces(NULL, 2); + + uint32_t *ustr32 = u32_strconv_from_arg("X\x1b[38;5;203mY\x1b[0mZ", "UTF-8"); + assert_non_null(ustr32); + bxstr_t *bxstr = bxs_from_unicode(ustr32); + bxs_append_spaces(bxstr, 0); + + bxs_append_spaces(bxstr, 3); + assert_non_null(bxstr->memory); + assert_string_equal("XYZ   ", bxstr->ascii); /* three trailing spaces, matching num_columns == 6 and trailing == 3 below */ + assert_int_equal(0, (int) bxstr->indent); + assert_int_equal(6, (int) bxstr->num_columns); + assert_int_equal(21, (int) bxstr->num_chars); + assert_int_equal(6, (int) bxstr->num_chars_visible); + assert_int_equal(15, (int) bxstr->num_chars_invisible); + assert_int_equal(3, (int) bxstr->trailing); + int expected_firstchar_idx[] = {0, 1, 17, 18, 19, 20, 21}; + assert_array_equal(expected_firstchar_idx, bxstr->first_char, 7); + int expected_vischar_idx[] = {0, 12, 17, 18, 19, 20, 21}; + assert_array_equal(expected_vischar_idx,
bxstr->visible_char, 7); + + BFREE(ustr32); + bxs_free(bxstr); +} + + + void test_bxs_to_output(void **state) { UNUSED(state); @@ -905,6 +1037,40 @@ void test_bxs_is_empty_null(void **state) +void test_bxs_is_blank(void **state) /* bxs_is_blank() must return 1 for NULL, empty, and blank-only strings, and 0 otherwise */ +{ + UNUSED(state); + + assert_int_equal(1, bxs_is_blank(NULL)); /* NULL counts as blank */ + + bxstr_t *bxstr = bxs_new_empty_string(); + assert_int_equal(1, bxs_is_blank(bxstr)); /* so does the empty string */ + bxs_free(bxstr); + + uint32_t *ustr32 = u32_strconv_from_arg(" \x1b[38;5;203m \x1b[0m \x1b[38;5;203m\x1b[0m", "ASCII"); + assert_non_null(ustr32); + bxstr = bxs_from_unicode(ustr32); + assert_int_equal(1, bxs_is_blank(bxstr)); /* only spaces and escape sequences: blank */ + BFREE(ustr32); + bxs_free(bxstr); + + ustr32 = u32_strconv_from_arg("\x1b[38;5;203m\x1b[0m", "ASCII"); + assert_non_null(ustr32); + bxstr = bxs_from_unicode(ustr32); + assert_int_equal(1, bxs_is_blank(bxstr)); /* only escape sequences, no visible characters at all: blank */ + BFREE(ustr32); + bxs_free(bxstr); + + ustr32 = u32_strconv_from_arg("x", "ASCII"); + assert_non_null(ustr32); + bxstr = bxs_from_unicode(ustr32); + assert_int_equal(0, bxs_is_blank(bxstr)); /* one visible non-space character: not blank */ + BFREE(ustr32); + bxs_free(bxstr); +} + + + void test_bxs_is_visible_char(void **state) { UNUSED(state); diff --git a/utest/bxstring_test.h b/utest/bxstring_test.h index 1e4beb4..1a5c9c6 100644 --- a/utest/bxstring_test.h +++ b/utest/bxstring_test.h @@ -36,8 +36,15 @@ void test_ansi_unicode_tabs(void **state); void test_ansi_unicode_broken_escapes(void **state); void test_ansi_unicode_null(void **state); +void test_bxs_new_empty_string(void **state); + +void test_bxs_is_blank(void **state); + void test_bxs_strdup(void **state); +void test_bxs_cut_front(void **state); +void test_bxs_cut_front_zero(void **state); + void test_bxs_trimdup_null(void **state); void test_bxs_trimdup_invalid_startidx(void **state); void test_bxs_trimdup_invalid_endidx(void **state); @@ -63,6 +70,8 @@ void test_bxs_trim_none(void **state); void test_bxs_rtrim(void **state); void test_bxs_rtrim_empty(void **state); +void test_bxs_append_spaces(void **state); + void test_bxs_to_output(void
**state); void test_bxs_is_empty_null(void **state); diff --git a/utest/main.c b/utest/main.c index 132de88..1277731 100644 --- a/utest/main.c +++ b/utest/main.c @@ -115,7 +115,8 @@ int main(void) cmocka_unit_test(test_is_allowed_in_sample), cmocka_unit_test(test_is_allowed_in_shape), cmocka_unit_test(test_is_allowed_in_filename), - cmocka_unit_test(test_is_allowed_in_kv_string) + cmocka_unit_test(test_is_allowed_in_kv_string), + cmocka_unit_test(test_u32_strnrstr) }; const struct CMUnitTest bxstring_tests[] = { @@ -132,7 +133,11 @@ int main(void) cmocka_unit_test_setup(test_ansi_unicode_tabs, beforeTest), cmocka_unit_test_setup(test_ansi_unicode_broken_escapes, beforeTest), cmocka_unit_test_setup(test_ansi_unicode_null, beforeTest), + cmocka_unit_test_setup(test_bxs_new_empty_string, beforeTest), + cmocka_unit_test_setup(test_bxs_is_blank, beforeTest), cmocka_unit_test_setup(test_bxs_strdup, beforeTest), + cmocka_unit_test_setup(test_bxs_cut_front, beforeTest), + cmocka_unit_test_setup(test_bxs_cut_front_zero, beforeTest), cmocka_unit_test_setup(test_bxs_trimdup_null, beforeTest), cmocka_unit_test_setup(test_bxs_trimdup_invalid_startidx, beforeTest), cmocka_unit_test_setup(test_bxs_trimdup_invalid_endidx, beforeTest), @@ -153,6 +158,7 @@ int main(void) cmocka_unit_test_setup(test_bxs_trim_none, beforeTest), cmocka_unit_test_setup(test_bxs_rtrim, beforeTest), cmocka_unit_test_setup(test_bxs_rtrim_empty, beforeTest), + cmocka_unit_test_setup(test_bxs_append_spaces, beforeTest), cmocka_unit_test_setup(test_bxs_to_output, beforeTest), cmocka_unit_test_setup(test_bxs_is_empty_null, beforeTest), cmocka_unit_test_setup(test_bxs_is_visible_char, beforeTest), diff --git a/utest/unicode_test.c b/utest/unicode_test.c index 2213253..f0156f6 100644 --- a/utest/unicode_test.c +++ b/utest/unicode_test.c @@ -28,6 +28,7 @@ #include #include "boxes.h" +#include "tools.h" #include "unicode.h" #include "unicode_test.h" @@ -158,4 +159,28 @@ void test_is_allowed_in_kv_string(void 
**state) } + +void test_u32_strnrstr(void **state) /* checks u32_strnrstr() for NULL arguments, skipping of finds, and a negative skip value */ +{ + UNUSED(state); + + uint32_t *haystack = u32_strconv_from_arg("a foo found found bar fou", "ASCII"); /* needle occurs at offsets 6 and 12; the trailing fou is only a partial match */ + assert_non_null(haystack); + uint32_t *needle = u32_strconv_from_arg("found", "ASCII"); + assert_non_null(needle); + + assert_null(u32_strnrstr(NULL, needle, u32_strlen(needle), 0)); /* nothing to search in */ + assert_ptr_equal(haystack, u32_strnrstr(haystack, NULL, 0, 0)); /* no needle: the haystack itself is returned */ + + uint32_t *actual = u32_strnrstr(haystack, needle, u32_strlen(needle), 1); /* skip the last find, so the one at offset 6 is expected */ + assert_ptr_equal(haystack + 6, actual); + + actual = u32_strnrstr(haystack, needle, u32_strlen(needle), -1); /* -1 will be "fixed" to 0 */ + assert_ptr_equal(haystack + 12, actual); + + BFREE(haystack); + BFREE(needle); +} + + /* vim: set cindent sw=4: */ \ No newline at end of file diff --git a/utest/unicode_test.h b/utest/unicode_test.h index 093dc0c..0ed1785 100644 --- a/utest/unicode_test.h +++ b/utest/unicode_test.h @@ -27,6 +27,7 @@ void test_is_allowed_in_sample(void **state); void test_is_allowed_in_shape(void **state); void test_is_allowed_in_filename(void **state); void test_is_allowed_in_kv_string(void **state); +void test_u32_strnrstr(void **state); #endif