diff --git a/src/Makefile b/src/Makefile index c768193..3d580e8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -26,9 +26,9 @@ GEN_HDR = parser.h boxes.h GEN_SRC = parser.c lex.yy.c GEN_FILES = $(GEN_SRC) $(GEN_HDR) ORIG_HDRCL = boxes.h.in config.h -ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h +ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h unicode.h ORIG_GEN = lexer.l parser.y -ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c +ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c unicode.c ORIG_SRC = $(ORIG_GEN) $(ORIG_NORM) ORIG_FILES = $(ORIG_SRC) $(ORIG_HDR) OTH_FILES = Makefile @@ -47,7 +47,7 @@ debug: flags_$(BOXES_PLATFORM) boxes: $(ALL_OBJ) $(MAKE) -C regexp CC=$(CC) libregexp.a - $(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lregexp + $(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lunistring -lpcre2-32 -lregexp if [ "$(STRIP)" = "true" ] ; then strip $(BOXES_EXECUTABLE_NAME) ; fi boxes.exe: $(ALL_OBJ) @@ -81,8 +81,9 @@ lex.yy.c: lexer.l boxes.h rm lexer.tmp.c -boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h generate.h remove.h config.h +boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h unicode.h generate.h remove.h config.h tools.o: tools.c tools.h boxes.h shape.h config.h +unicode.o: unicode.c unicode.h config.h shape.o: shape.c shape.h boxes.h config.h tools.h generate.o: generate.c generate.h boxes.h shape.h tools.h config.h remove.o: remove.c remove.h boxes.h shape.h tools.h config.h diff --git a/src/boxes.c b/src/boxes.c index a679f3b..4e856ea 100644 --- a/src/boxes.c +++ b/src/boxes.c @@ -21,6 +21,7 @@ #include "config.h" #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -42,6 +44,7 @@ #include "regexp.h" #include "generate.h" #include "remove.h" +#include "unicode.h" #ifdef __MINGW32__ #include @@ -92,14 +95,6 @@ int anz_designs = 0; /* no of designs after parsing */ opt_t opt; /* command line options */ -char *encoding; /* the character encoding that we use */ - -ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */ -ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */ -ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */ -ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */ -ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */ - input_t input = INPUT_INITIALIZER; /* input lines */ @@ -1175,12 +1170,12 @@ static int list_styles() -static int get_indent (const line_t *lines, const size_t lanz) +static int get_indent (const line_t *lines, const size_t lines_size) /* * Determine indentation of given lines in spaces. * - * lines the lines to examine - * lanz number of lines to examine + * lines the lines to examine + * lines_size number of lines to examine * * Lines are assumed to be free of trailing whitespace. * @@ -1190,31 +1185,32 @@ static int get_indent (const line_t *lines, const size_t lanz) * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ { - size_t j; - int res = LINE_MAX_BYTES; /* result */ - int nonblank = 0; /* true if one non-blank line found */ + int res = LINE_MAX_BYTES; /* result */ + int nonblank = 0; /* true if one non-blank line found */ if (lines == NULL) { - fprintf (stderr, "%s: internal error\n", PROJECT); + fprintf(stderr, "%s: internal error\n", PROJECT); return -1; } - if (lanz == 0) + if (lines_size == 0) { return 0; + } - for (j=0; j 0) { - size_t ispc; nonblank = 1; - ispc = strspn (lines[j].text, " "); - if ((int) ispc < res) + size_t ispc = strspn(lines[j].text, " "); + if ((int) ispc < res) { res = ispc; + } } } - if (nonblank) - return res; /* success */ - else - return 0; /* success, but only blank lines */ + if (nonblank) { + return res; /* success */ + } else { + return 0; /* success, but only blank lines */ + } } @@ -1327,12 +1323,12 @@ static int apply_substitutions (const int mode) -static int has_linebreak (const char *s, const int len) +static int has_linebreak (const uint32_t *s, const int len) /* * Determine if the given line of raw text is ended by a line break. * * s: the string to check - * len: length of s + * len: length of s in characters * * RETURNS: != 0 line break found * == 0 line break not found @@ -1342,10 +1338,10 @@ static int has_linebreak (const char *s, const int len) { int result = 0; if (s != NULL && len > 0) { - char the_last = s[len - 1]; - result = the_last == '\r' || the_last == '\n'; + ucs4_t the_last = s[len - 1]; + result = u32_cmp(&char_cr, &the_last, 1) == 0 || u32_cmp(&char_newline, &the_last, 1) == 0; #if defined(DEBUG) - fprintf(stderr, "has_linebreak: (%d) %d\n", the_last, result); + fprintf(stderr, "has_linebreak: (%#010x) %d\n", (int) the_last, result); #endif } return result; @@ -1353,6 +1349,60 @@ static int has_linebreak (const char *s, const int len) +static size_t count_invisible_chars(const uint32_t *s, const size_t buflen, size_t *num_esc, char **ascii) +{ + size_t invis = 0; /* counts invisible characters */ + int ansipos = 0; /* progression of ansi sequence */ + *num_esc = 0; /* counts the number of escape sequences found */ + + if (is_empty(s)) { + (*ascii) = (char *) strdup(""); + return 0; + } + (*ascii) = (char *) calloc(buflen, sizeof(char)); + char *p = *ascii; + + ucs4_t c; + const uint32_t *rest = s; + while ((rest = u32_next(&c, rest))) { + if (ansipos == 0 && c == 0x0000001b) { + /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */ + ansipos++; + invis++; + (*num_esc)++; + } else if (ansipos == 1 && c == '[') { + /* Found '[' char after ESC. A CSI sequence has started. */ + ansipos++; + invis++; + } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) { + /* Found a byte designating the end of a two-byte escape sequence */ + invis++; + ansipos = 0; + } else if (ansipos == 2) { + /* Inside CSI sequence - Keep counting bytes as invisible */ + invis++; + + /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */ + if (c >= 0x40 && c <= 0x7e) { + ansipos = 0; + } + } else if (is_ascii_printable(c)) { + *p = c & 0xff; + ++p; + } else { + int cols = uc_width(c, encoding); + if (cols > 0) { + memset(p, (int) 'x', cols); + p += cols; + } + } + } + *p = '\0'; + return invis; +} + + + static int read_all_input (const int use_stdin) /* * Read entire input (possibly from stdin) and store it in 'input' array. @@ -1369,18 +1419,12 @@ static int read_all_input (const int use_stdin) * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ { - char buf[LINE_MAX_BYTES + 2]; /* input buffer */ - size_t len_bytes; - char c; - size_t invis; /* counts invisible characters */ - int ansipos; /* progression of ansi sequence */ - size_t input_size = 0; /* number of elements allocated */ - line_t *tmp = NULL; - char *temp = NULL; /* string resulting from tab exp. */ - uint8_t *mbtemp = NULL; /* temp string for preparing the multi-byte input */ - size_t newlen; /* line length after tab expansion */ - size_t i; - int rc; + char buf[LINE_MAX_BYTES + 2]; /* input buffer */ + size_t len_chars; + size_t input_size = 0; /* number of elements allocated */ + uint32_t *mbtemp = NULL; /* temp string for preparing the multi-byte input */ + size_t i; + int rc; input.indent = LINE_MAX_BYTES; input.maxline = 0; @@ -1391,96 +1435,72 @@ static int read_all_input (const int use_stdin) /* * Start reading */ - while (fgets (buf, LINE_MAX_BYTES+1, opt.infile)) - { + while (fgets(buf, LINE_MAX_BYTES + 1, opt.infile)) { if (input_size % 100 == 0) { input_size += 100; - tmp = (line_t *) realloc (input.lines, input_size*sizeof(line_t)); + line_t *tmp = (line_t *) realloc(input.lines, input_size * sizeof(line_t)); if (tmp == NULL) { - perror (PROJECT); + perror(PROJECT); BFREE (input.lines); return 1; } input.lines = tmp; } - len_bytes = strlen(buf); - mbtemp = u8_strconv_from_locale(buf); - input.lines[input.anz_lines].len = u8_strwidth(mbtemp, encoding); - input.lines[input.anz_lines].num_leading_blanks = 0; - input.final_newline = has_linebreak(buf, len_bytes); + mbtemp = u32_strconv_from_locale(buf); + len_chars = u32_strlen(mbtemp); + input.final_newline = has_linebreak(mbtemp, len_chars); if (opt.r) { - input.lines[input.anz_lines].len -= 1; /* TODO HERE */ - if (buf[input.lines[input.anz_lines].len] == '\n') - buf[input.lines[input.anz_lines].len] = '\0'; + if (is_char_at(mbtemp, len_chars - 1, char_newline)) { + set_char_at(mbtemp, len_chars - 1, char_nul); + --len_chars; + } } else { - btrim (buf, &(input.lines[input.anz_lines].len)); + btrim32(mbtemp, &len_chars); } - if (input.lines[input.anz_lines].len > 0) { - newlen = expand_tabs_into (buf, - input.lines[input.anz_lines].len, opt.tabstop, &temp, - &(input.lines[input.anz_lines].tabpos), - &(input.lines[input.anz_lines].tabpos_len)); - if (newlen == 0) { - perror (PROJECT); + /* + * Expand tabs + */ + if (len_chars > 0) { + uint32_t *temp = NULL; + len_chars = expand_tabs_into(mbtemp, opt.tabstop, &temp, + &(input.lines[input.anz_lines].tabpos), + &(input.lines[input.anz_lines].tabpos_len)); + if (len_chars == 0) { + perror(PROJECT); BFREE (input.lines); return 1; } - input.lines[input.anz_lines].text = temp; - input.lines[input.anz_lines].len = newlen; + input.lines[input.anz_lines].mbtext = temp; temp = NULL; } else { - input.lines[input.anz_lines].text = (char *) strdup (buf); + input.lines[input.anz_lines].mbtext = mbtemp; } + input.lines[input.anz_lines].num_chars = len_chars; /* * Find ANSI CSI/ESC sequences */ - invis = 0; - ansipos = 0; - for (i=0; i= 0x40 && c <= 0x5f) { - /* Found a byte designating the end of a two-byte - * escape sequence */ - invis++; - ansipos = 0; - } else if (ansipos == 2) { - /* Inside CSI sequence - Keep counting bytes as invisible */ - invis++; - - /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */ - if (c >= 0x40 && c <= 0x7e) - ansipos = 0; - } - } - - /* Save the count of invisible chars and visible chars. - * I'm happy about suggestions for a more elegant handling - * of this and the use of .invis and .vischar (and .len) - * in the other functions. - */ + size_t num_esc = 0; + size_t invis = count_invisible_chars(input.lines[input.anz_lines].mbtext, strlen(buf), &num_esc, + &(input.lines[input.anz_lines].text)); input.lines[input.anz_lines].invis = invis; - input.lines[input.anz_lines].vischar = input.lines[input.anz_lines].len - invis; + input.lines[input.anz_lines].vischar = len_chars - invis; + + /* u32_strwidth() does not count control characters, i.e. ESC characters, for which we must correct */ + input.lines[input.anz_lines].len = + u32_strwidth(input.lines[input.anz_lines].mbtext, encoding) - invis + num_esc; + input.lines[input.anz_lines].num_leading_blanks = 0; /* * Update length of longest line */ - if (input.lines[input.anz_lines].vischar > input.maxline) { - input.maxline = input.lines[input.anz_lines].vischar; + if (input.lines[input.anz_lines].len > input.maxline) { + input.maxline = input.lines[input.anz_lines].len; } /* @@ -1489,8 +1509,8 @@ static int read_all_input (const int use_stdin) ++input.anz_lines; } - if (ferror (stdin)) { - perror (PROJECT); + if (ferror(stdin)) { + perror(PROJECT); BFREE (input.lines); return 1; } @@ -1498,39 +1518,51 @@ static int read_all_input (const int use_stdin) else { /* recalculate input statistics for redrawing the mended box */ - for (i=0; i input.maxline) + for (i = 0; i < input.anz_lines; ++i) { + size_t num_esc = 0; + char *dummy; + size_t invis = count_invisible_chars(input.lines[i].mbtext, strlen(input.lines[i].text), &num_esc, &dummy); + BFREE(dummy); + input.lines[i].len = u32_strwidth(input.lines[i].mbtext, encoding) - invis + num_esc; + input.lines[i].num_chars = u32_strlen(input.lines[i].mbtext); + if (input.lines[i].len > input.maxline) { input.maxline = input.lines[i].len; + } } } /* * Exit if there was no input at all */ - if (input.lines == NULL || input.lines[0].text == NULL) + if (input.lines == NULL || input.lines[0].text == NULL) { return 0; + } /* * Compute indentation */ - rc = get_indent (input.lines, input.anz_lines); - if (rc >= 0) + rc = get_indent(input.lines, input.anz_lines); + if (rc >= 0) { input.indent = (size_t) rc; - else + } else { return 1; + } /* * Remove indentation, unless we want to preserve it (when removing * a box or if the user wants to retain it inside the box) */ if (opt.design->indentmode != 't' && opt.r == 0) { - for (i=0; i= input.indent) { - memmove (input.lines[i].text, input.lines[i].text+input.indent, - input.lines[i].len-input.indent+1); + for (i = 0; i < input.anz_lines; ++i) { + if (input.lines[i].num_chars >= input.indent) { + memmove(input.lines[i].text, input.lines[i].text + input.indent, + input.lines[i].len - input.indent + 1); input.lines[i].len -= input.indent; input.lines[i].vischar -= input.indent; + + u32_move(input.lines[i].mbtext, input.lines[i].mbtext + input.indent, + input.lines[i].num_chars - input.indent + 1); + input.lines[i].num_chars -= input.indent; } } input.maxline -= input.indent; @@ -1540,32 +1572,38 @@ static int read_all_input (const int use_stdin) * Apply regular expression substitutions */ if (opt.r == 0) { - if (apply_substitutions(0) != 0) + if (apply_substitutions(0) != 0) { // TODO HERE return 1; + } } -#if 0 +#if 1 /* * Debugging Code: Display contents of input structure */ + fprintf (stderr, "Encoding: %s\n", encoding); + fprintf (stderr, "Input Lines:\n"); + fprintf (stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n"); for (i=0; i #include +#include #include #include #include "shape.h" #include "boxes.h" +#include "unicode.h" #include "tools.h" @@ -239,13 +241,12 @@ int empty_line(const line_t *line) -size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len, - const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len) +size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text, + size_t **tabpos, size_t *tabpos_len) /* * Expand tab chars in input_buffer and store result in text. * * input_buffer Line of text with tab chars - * in_len length of the string in input_buffer in characters * tabstop tab stop distance * text address of the pointer that will take the result * tabpos array of ints giving the positions of the first @@ -262,18 +263,21 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len, */ { static uint32_t temp[LINE_MAX_BYTES * MAX_TABSTOP + 1]; /* work string */ - size_t io; /* character position in work string */ - size_t tabnum; /* index of the current tab */ + size_t io; /* character position in work string */ + size_t tabnum = 0; /* index of the current tab */ *text = NULL; if (opt.tabexp != 'k') { + /* We need to know the exact tab positions only if expansion type 'k' is requested (keep tabs as much as they + * were as possible). Else we'll just convert spaces and tabs without having to know where exactly the tabs + * were in the first place. */ *tabpos_len = 0; } else { ucs4_t puc; const uint32_t *rest = input_buffer; - while (rest = u32_next(&puc, rest)) { - if (u32_cmp(&char_tab, &puc, 1) == 0) { + while ((rest = u32_next(&puc, rest))) { + if (puc == char_tab) { (*tabpos_len)++; } } @@ -289,8 +293,8 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len, ucs4_t puc; const uint32_t *rest = input_buffer; io = 0; - while (rest = u32_next(&puc, rest)) { - if (u32_cmp(&char_tab, &puc, 1) == 0) { /* Is it a tab char? */ + while ((rest = u32_next(&puc, rest))) { + if (puc == char_tab) { if (*tabpos_len > 0) { (*tabpos)[tabnum++] = io; } @@ -299,7 +303,7 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len, io += num_spc; } else { - u32_set(temp + io, puc, 1); + set_char_at(temp, io, puc); ++io; } } @@ -335,6 +339,34 @@ void btrim(char *text, size_t *len) +void btrim32(uint32_t *text, size_t *len) +/* + * Remove trailing whitespace from line (unicode version). + * + * text string to trim + * len pointer to the length of the string in characters + * + * Both the string and the length will be modified as trailing whitespace is removed. + * +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + */ +{ + int idx = (int) (*len - 1); + + for (; idx >= 0; --idx) { + ucs4_t c = text[idx]; + if (uc_is_c_whitespace(c) || uc_is_property_white_space(c) || uc_is_property_bidi_whitespace(c)) { + set_char_at(text, idx, char_nul); + } else { + break; + } + } + + *len = idx + 1; +} + + + char *my_strnrstr(const char *s1, const char *s2, const size_t s2_len, int skip) /* * Return pointer to last occurrence of string s2 in string s1. diff --git a/src/tools.h b/src/tools.h index b3ab5af..35e0516 100644 --- a/src/tools.h +++ b/src/tools.h @@ -43,11 +43,13 @@ void regerror(char *msg); int empty_line(const line_t *line); -size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len, - const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len); +size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text, + size_t **tabpos, size_t *tabpos_len); void btrim(char *text, size_t *len); +void btrim32(uint32_t *text, size_t *len); + char *my_strnrstr(const char *s1, const char *s2, const size_t s2_len, int skip); diff --git a/src/unicode.c b/src/unicode.c new file mode 100644 index 0000000..38d9ad7 --- /dev/null +++ b/src/unicode.c @@ -0,0 +1,96 @@ +/* + * boxes - Command line filter to draw/remove ASCII boxes around text + * Copyright (C) 1999 Thomas Jensen and the boxes contributors + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License, version 2, as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + */ + +/* + * Functions and constants for handling unicode strings with libunistring. + */ + +#include "config.h" +#include +#include + +#include +#include +#include + +#include "unicode.h" + + +const char *encoding; /* the character encoding that we use */ + +const ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */ +const ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */ +const ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */ +const ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */ +const ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */ + + + +/** + * Check whether the character at the given index has the given value. + * + * @param the string to check + * @param the index position of the character to check + * @param the expected character value + * @return flag indicating whether the character has the expected value + */ +int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char) +{ + return text != NULL && u32_cmp(text + idx, &expected_char, 1) == 0; +} + + + +/** + * Set the character at the given index to the given value. + * + * @param the string to modify + * @param the index position of the character to modify + * @param the new character value + */ +void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set) +{ + u32_set(text + idx, char_to_set, 1); +} + + + +/** + * Determine if a string is NULL/empty or not. + * + * @param the string to check + * @return > 0: the string is empty or NULL + * 0: the string contains at least 1 character + */ +int is_empty(const uint32_t *text) +{ + return text == NULL || is_char_at(text, 0, char_nul); +} + + + +int is_ascii_printable(const ucs4_t c) +{ + return c >= 0x20 && c < 0x7f; +} + + + +/*EOF*/ /* vim: set sw=4: */ diff --git a/src/unicode.h b/src/unicode.h new file mode 100644 index 0000000..8c8d165 --- /dev/null +++ b/src/unicode.h @@ -0,0 +1,49 @@ +/* + * boxes - Command line filter to draw/remove ASCII boxes around text + * Copyright (C) 1999 Thomas Jensen and the boxes contributors + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License, version 2, as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + */ + +/* + * Functions and constants for handling unicode strings with libunistring. + */ + +#ifndef UNICODE_H +#define UNICODE_H + +extern const char *encoding; /* the character encoding that we use */ + +extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */ +extern const ucs4_t char_space; /* ucs4_t character ' ' (space) */ +extern const ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */ +extern const ucs4_t char_newline; /* ucs4_t character '\n' (newline) */ +extern const ucs4_t char_nul; /* ucs4_t character '\0' (zero) */ + + + +int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char); + +void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set); + +int is_empty(const uint32_t *text); + +int is_ascii_printable(const ucs4_t c); + + +#endif + +/*EOF*/ /* vim: set cindent sw=4: */