diff --git a/src/boxes.c b/src/boxes.c index a58c22c..a679f3b 100644 --- a/src/boxes.c +++ b/src/boxes.c @@ -5,12 +5,12 @@ * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License, version 2, as published * by the Free Software Foundation. - * + * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. - * + * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -27,7 +27,15 @@ #include #include #include + +#include +#include +#include +#include +#include + #include + #include "shape.h" #include "boxes.h" #include "tools.h" @@ -84,6 +92,14 @@ int anz_designs = 0; /* no of designs after parsing */ opt_t opt; /* command line options */ +char *encoding; /* the character encoding that we use; assigned from locale_charset() in main() */ + +ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */ +ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */ +ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */ +ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */ +ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */ + input_t input = INPUT_INITIALIZER; /* input lines */ @@ -1353,16 +1369,18 @@ static int read_all_input (const int use_stdin) * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ { - char buf[LINE_MAX_BYTES+2]; /* input buffer */ - char c; - size_t invis; /* counts invisible characters */ - int ansipos; /* progression of ansi sequence */ - size_t input_size = 0; /* number of elements allocated */ - line_t *tmp = NULL; - char *temp = NULL; /* string resulting from tab exp. 
*/ - size_t newlen; /* line length after tab expansion */ - size_t i; - int rc; + char buf[LINE_MAX_BYTES + 2]; /* input buffer */ + size_t len_bytes; /* length of buf in bytes, as returned by strlen() */ + char c; + size_t invis; /* counts invisible characters */ + int ansipos; /* progression of ansi sequence */ + size_t input_size = 0; /* number of elements allocated */ + line_t *tmp = NULL; + char *temp = NULL; /* string resulting from tab exp. */ + uint8_t *mbtemp = NULL; /* temp string for preparing the multi-byte input; NOTE(review): holds the result of u8_strconv_from_locale(), and no NULL check or free() is visible in this hunk - confirm */ + size_t newlen; /* line length after tab expansion */ + size_t i; + int rc; input.indent = LINE_MAX_BYTES; input.maxline = 0; @@ -1386,12 +1404,14 @@ static int read_all_input (const int use_stdin) input.lines = tmp; } - input.lines[input.anz_lines].len = strlen (buf); + len_bytes = strlen(buf); + mbtemp = u8_strconv_from_locale(buf); + input.lines[input.anz_lines].len = u8_strwidth(mbtemp, encoding); input.lines[input.anz_lines].num_leading_blanks = 0; - input.final_newline = has_linebreak(buf, input.lines[input.anz_lines].len); + input.final_newline = has_linebreak(buf, len_bytes); if (opt.r) { - input.lines[input.anz_lines].len -= 1; + input.lines[input.anz_lines].len -= 1; /* TODO: len is now the column width from u8_strwidth(), not a byte count, so using it to index buf below needs revisiting (len_bytes holds the byte length) */ if (buf[input.lines[input.anz_lines].len] == '\n') buf[input.lines[input.anz_lines].len] = '\0'; } @@ -1582,6 +1602,12 @@ int main (int argc, char *argv[]) if (rc) exit (EXIT_FAILURE); + /* + * Store system character encoding + */ + setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */ + encoding = locale_charset(); + /* * Parse config file, then reset design pointer */ diff --git a/src/boxes.h.in b/src/boxes.h.in index 969d02c..522a0ee 100644 --- a/src/boxes.h.in +++ b/src/boxes.h.in @@ -30,7 +30,8 @@ /* #define PARSER_DEBUG */ /* #define LEXER_DEBUG */ -#include "regexp.h" +#include +#include "regexp/regexp.h" @@ -144,23 +145,33 @@ typedef struct { /* Command line options: */ extern opt_t opt; +extern char *encoding; /* the character encoding that we use */ + +extern ucs4_t 
char_tab; /* ucs4_t character '\t' (tab) */ +extern ucs4_t char_space; /* ucs4_t character ' ' (space) */ +extern ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */ +extern ucs4_t char_newline; /* ucs4_t character '\n' (newline) */ +extern ucs4_t char_nul; /* ucs4_t character '\0' (zero) */ + + typedef struct { - size_t len; /* length of text in characters */ - char *text; /* line content, tabs expanded */ - size_t invis; /* number of characters part of an ansi sequence */ - size_t vischar; /* number of normal printable characters */ - size_t *tabpos; /* tab positions in expanded work strings */ - size_t tabpos_len; /* number of tabs in a line */ - size_t num_leading_blanks; /* number of spaces at the start of the line after justification */ + size_t len; /* length of text in columns (character positions in a text terminal) */ + char *text; /* ASCII line content, tabs expanded, multi-byte chars replaced with 'x' */ + uint8_t *mbtext; /* multi-byte (original) line content, tabs expanded. We use UTF-8 so that our old regex code can find ASCII characters in it. 
*/ + size_t invis; /* number of characters part of an ansi sequence */ + size_t vischar; /* number of normal printable characters */ + size_t *tabpos; /* tab positions in expanded work strings */ + size_t tabpos_len; /* number of tabs in a line */ + size_t num_leading_blanks; /* number of spaces at the start of the line after justification */ } line_t; #ifndef FILE_LEXER_L typedef struct { line_t *lines; - size_t anz_lines; /* number of entries in input */ - size_t maxline; /* length of longest input line */ - size_t indent; /* number of leading spaces found */ - int final_newline; /* true if the last line of input ends with newline */ + size_t anz_lines; /* number of entries in input */ + size_t maxline; /* length of longest input line */ + size_t indent; /* number of leading spaces found */ + int final_newline; /* true if the last line of input ends with newline */ } input_t; #define INPUT_INITIALIZER {NULL, 0, 0, LINE_MAX_BYTES, 0} diff --git a/src/tools.c b/src/tools.c index ecc3596..9f28d9f 100644 --- a/src/tools.c +++ b/src/tools.c @@ -29,6 +29,10 @@ #include #include #include + +#include +#include + #include "shape.h" #include "boxes.h" #include "tools.h" @@ -235,13 +239,13 @@ int empty_line(const line_t *line) -size_t expand_tabs_into(const char *input_buffer, const size_t in_len, - const int tabstop, char **text, size_t **tabpos, size_t *tabpos_len) +size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len, + const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len) /* * Expand tab chars in input_buffer and store result in text. 
* * input_buffer Line of text with tab chars - * in_len length of the string in input_buffer + * in_len length of the string in input_buffer in characters (NOTE(review): not referenced by the new loops visible in this patch - confirm it is still needed) * tabstop tab stop distance * text address of the pointer that will take the result * tabpos array of ints giving the positions of the first @@ -257,22 +261,24 @@ size_t expand_tabs_into(const char *input_buffer, const size_t in_len, * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ { - static char temp[LINE_MAX_BYTES * MAX_TABSTOP + 1]; /* work string */ - size_t ii; /* position in input string */ - size_t io; /* position in work string */ - size_t jp; /* tab expansion jump point */ - size_t tabnum; /* number of tabs in input */ + static uint32_t temp[LINE_MAX_BYTES * MAX_TABSTOP + 1]; /* work string; NOTE(review): the removed loop stopped at io == LINE_MAX_BYTES * tabstop - 1, the new copy loop shows no bound on io - confirm the input cannot overflow this buffer */ + size_t io; /* character position in work string */ + size_t tabnum; /* index of the current tab; NOTE(review): the removed for-loop set this to 0, no initialisation is visible in this patch - confirm */ *text = NULL; - for (ii = 0, *tabpos_len = 0; ii < in_len; ++ii) { - if (input_buffer[ii] == '\t') { - (*tabpos_len)++; - } - } if (opt.tabexp != 'k') { *tabpos_len = 0; + } else { /* NOTE(review): the removed loop reset *tabpos_len to 0 before counting; this branch only increments it - confirm callers zero it */ + ucs4_t puc; + const uint32_t *rest = input_buffer; + while (rest = u32_next(&puc, rest)) { + if (u32_cmp(&char_tab, &puc, 1) == 0) { + (*tabpos_len)++; + } + } } + if (*tabpos_len > 0) { *tabpos = (size_t *) calloc((*tabpos_len) + 1, sizeof(size_t)); if (*tabpos == NULL) { @@ -280,23 +286,26 @@ size_t expand_tabs_into(const char *input_buffer, const size_t in_len, } } - for (ii = 0, io = 0, tabnum = 0; ii < in_len && ((int) io) < (LINE_MAX_BYTES * tabstop - 1); ++ii) { - if (input_buffer[ii] == '\t') { + ucs4_t puc; + const uint32_t *rest = input_buffer; + io = 0; + while (rest = u32_next(&puc, rest)) { + if (u32_cmp(&char_tab, &puc, 1) == 0) { /* Is it a tab char? 
*/ if (*tabpos_len > 0) { (*tabpos)[tabnum++] = io; } - for (jp = io + tabstop - (io % tabstop); io < jp; ++io) { - temp[io] = ' '; - } + size_t num_spc = tabstop - (io % tabstop); + u32_set(temp + io, char_space, num_spc); + io += num_spc; } else { - temp[io] = input_buffer[ii]; + u32_set(temp + io, puc, 1); ++io; } } - temp[io] = '\0'; + temp[io] = 0; - *text = (char *) strdup(temp); + *text = u32_strdup(temp); if (*text == NULL) { return 0; } diff --git a/src/tools.h b/src/tools.h index 8e49928..b3ab5af 100644 --- a/src/tools.h +++ b/src/tools.h @@ -43,8 +43,8 @@ void regerror(char *msg); int empty_line(const line_t *line); -size_t expand_tabs_into(const char *input_buffer, const size_t in_len, - const int tabstop, char **text, size_t **tabpos, size_t *tabpos_len); +size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len, + const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len); void btrim(char *text, size_t *len);