diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d3d514e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,13 @@ +* text=auto + +# shell scripts +*.sh text eol=lf + +# Windows batch files +*.bat text eol=crlf + +# the test cases +/test/*.txt text eol=lf + +# special test case for testing ISO encoding +/test/111_manual_encoding_iso.txt text working-tree-encoding=ISO_8859-15 diff --git a/doc/boxes.1.in b/doc/boxes.1.in index 70184e5..d54ec67 100644 --- a/doc/boxes.1.in +++ b/doc/boxes.1.in @@ -10,7 +10,7 @@ boxes \- text mode box and comment drawing filter .SH SYNOPSIS .B boxes [\-hlmrv] [\-a\ format] [\-d\ design] [\-f\ file] [\-i\ indent] [\-k\ bool] -[\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]] +[\-n\ encoding] [\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]] .SH DESCRIPTION .I Boxes is a text filter which can draw any kind of box around its input text. Box @@ -185,6 +185,14 @@ padding, indentation, etc. for the mended box. Implies false. .\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .TP 0.6i +.B \-n \fIencoding\fP +Character encoding. Overrides the character encoding of the input and output +text. Choose from the list shown by \fIiconv -l\fP. If an invalid character +encoding is specified here, the system encoding is used as a fallback. The default +is to use the system encoding, which is normally the best course of action. +So don't specify this option unless you have to. +.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +.TP 0.6i .B \-p \fIstring\fP Padding. Specify padding in spaces around the input text block for all sides of the box. 
The argument string may not contain whitespace and must @@ -357,4 +365,5 @@ configuration file (takes precedence over system-wide configuration file) system\-wide configuration file .\" ======================================================================= .SH "SEE ALSO" -.I figlet(6) +.BR figlet (6), +.BR iconv (1) diff --git a/src/Makefile b/src/Makefile index cbc385b..d9cd36a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -79,7 +79,7 @@ lex.yy.c: lexer.l boxes.h cat lexer.tmp.c >> lex.yy.c rm lexer.tmp.c - +# TODO In the end, check declared dependencies again boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h tools.o: tools.c tools.h boxes.h shape.h config.h unicode.o: unicode.c unicode.h config.h diff --git a/src/boxes.c b/src/boxes.c index 19aa3bd..1482a3e 100644 --- a/src/boxes.c +++ b/src/boxes.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include @@ -107,6 +106,7 @@ static void usage(FILE *st) fprintf(st, " -k bool leading/trailing blank line retention on removal\n"); fprintf(st, " -l list available box designs w/ samples\n"); fprintf(st, " -m mend box, i.e. 
remove it and redraw it afterwards\n"); + fprintf(st, " -n enc Character encoding of input and output\n"); fprintf(st, " -p fmt padding [default: none]\n"); /* fprintf(st, " -q modify command for needs of the web UI (undocumented)\n"); */ fprintf(st, " -r remove box\n"); @@ -370,6 +370,7 @@ static int process_commandline(int argc, char *argv[]) opt.tabstop = DEF_TABSTOP; opt.tabexp = 'e'; opt.killblank = -1; + opt.encoding = NULL; for (idummy = 0; idummy < ANZ_SIDES; ++idummy) { opt.padding[idummy] = -1; } @@ -388,7 +389,7 @@ static int process_commandline(int argc, char *argv[]) * Parse Command Line */ do { - oc = getopt(argc, argv, "a:c:d:f:hi:k:lmp:qrs:t:v"); + oc = getopt(argc, argv, "a:c:d:f:hi:k:lmn:p:qrs:t:v"); switch (oc) { @@ -580,6 +581,17 @@ static int process_commandline(int argc, char *argv[]) opt.killblank = 0; break; + case 'n': + /* + * Character encoding + */ + opt.encoding = (char *) strdup(optarg); + if (opt.encoding == NULL) { + perror(PROJECT); + return 1; + } + break; + case 'p': /* * Padding. format is ([ahvtrbl]n)+ @@ -1257,6 +1269,9 @@ static int apply_substitutions(const int mode) /* * Compile regular expressions */ + #ifdef REGEXP_DEBUG + fprintf(stderr, "Compiling %d %s rule patterns\n", (int) anz_rules, mode ? 
"reversion" : "replacement"); + #endif errno = 0; opt.design->current_rule = rules; for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) { @@ -1278,13 +1293,13 @@ static int apply_substitutions(const int mode) for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) { #ifdef REGEXP_DEBUG fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ", - rules[j].prog, rules[j].repstr, u32_strconv_to_locale(input.lines[k].mbtext), + rules[j].prog, rules[j].repstr, u32_strconv_to_output(input.lines[k].mbtext), (int) input.lines[k].num_chars, rules[j].mode); #endif uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr, input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g'); #ifdef REGEXP_DEBUG - fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_locale(newtext) : "NULL"); + fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_output(newtext) : "NULL"); #endif if (newtext == NULL) { return 1; @@ -1298,7 +1313,7 @@ static int apply_substitutions(const int mode) #ifdef REGEXP_DEBUG fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k, - (int) input.lines[k].num_chars, u32_strconv_to_locale(input.lines[k].mbtext)); + (int) input.lines[k].num_chars, u32_strconv_to_output(input.lines[k].mbtext)); #endif } opt.design->current_rule = NULL; @@ -1393,7 +1408,7 @@ static int read_all_input(const int use_stdin) input.lines = tmp; } - mbtemp = u32_strconv_from_locale(buf); + mbtemp = u32_strconv_from_input(buf); len_chars = u32_strlen(mbtemp); input.final_newline = has_linebreak(mbtemp, len_chars); input.lines[input.anz_lines].posmap = NULL; @@ -1484,7 +1499,7 @@ static int read_all_input(const int use_stdin) for (i = 0; i < input.anz_lines; ++i) { #ifdef DEBUG fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i, - u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars); + u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars); #endif if (input.lines[i].num_chars >= 
input.indent) { memmove(input.lines[i].text, input.lines[i].text + input.indent, @@ -1496,7 +1511,7 @@ static int read_all_input(const int use_stdin) } #ifdef DEBUG fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i, - u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars); + u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars); #endif } input.maxline -= input.indent; @@ -1557,7 +1572,7 @@ int main(int argc, char *argv[]) * Store system character encoding */ setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */ - encoding = locale_charset(); + encoding = check_encoding(opt.encoding, locale_charset()); #ifdef DEBUG fprintf (stderr, "Character Encoding = %s\n", encoding); #endif diff --git a/src/boxes.h.in b/src/boxes.h.in index bf2b683..f0af288 100644 --- a/src/boxes.h.in +++ b/src/boxes.h.in @@ -139,6 +139,7 @@ typedef struct { /* Command line options: */ char indentmode; /* 'b', 't', 'n', or '\0' */ char justify; /* 'l', 'c', 'r', or '\0' */ int killblank; /* -1 if not set */ + char *encoding; /* character encoding override for input and output text */ FILE *infile; /* where we get our input */ FILE *outfile; /* where we put our output */ } opt_t; diff --git a/src/generate.c b/src/generate.c index 65235b4..61f8127 100644 --- a/src/generate.c +++ b/src/generate.c @@ -28,7 +28,6 @@ #include #include -#include #include #include "shape.h" @@ -729,7 +728,7 @@ static int justify_line(line_t *line, int skew) #if defined(DEBUG) || 0 fprintf (stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d", - opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_locale(line->mbtext), + opt.justify ? 
opt.justify : '0', (int) line->num_chars, u32_strconv_to_output(line->mbtext), (int) line->len, line->text, (int) line->invis, skew); #endif @@ -998,7 +997,7 @@ int output_box(const sentry_t *thebox) concat_strings(obuf, LINE_MAX_BYTES + 1, 8, restored_indent, skip_left ? "" : thebox[BLEF].chars[j], hfill1, ti >= 0 && shift > 0 ? nspaces(shift) : "", - ti >= 0 ? u32_strconv_to_locale(mbtext_shifted) : "", + ti >= 0 ? u32_strconv_to_output(mbtext_shifted) : "", hfill2, nspaces(input.maxline - input.lines[ti].len - shift), thebox[BRIG].chars[j]); } diff --git a/src/regulex.c b/src/regulex.c index c2ebb16..b378325 100644 --- a/src/regulex.c +++ b/src/regulex.c @@ -26,9 +26,10 @@ #include #include #include -#include +#include #include "tools.h" +#include "unicode.h" #include "regulex.h" @@ -37,7 +38,10 @@ pcre2_code *compile_pattern(char *pattern) { int errornumber; PCRE2_SIZE erroroffset; - PCRE2_SPTR pattern32 = u32_strconv_from_locale(pattern); + PCRE2_SPTR pattern32 = u32_strconv_from_arg(pattern, config_encoding); + if (pattern32 == NULL) { + return NULL; + } pcre2_code *re = pcre2_compile( pattern32, /* the pattern */ @@ -51,7 +55,7 @@ pcre2_code *compile_pattern(char *pattern) PCRE2_UCHAR buffer[256]; pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n", - pattern, (int) erroroffset, u32_strconv_to_locale(buffer)); + pattern, (int) erroroffset, u32_strconv_to_output(buffer)); } return re; } @@ -60,7 +64,10 @@ pcre2_code *compile_pattern(char *pattern) uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global) { - PCRE2_SPTR replacement = u32_strconv_from_locale(replace); + PCRE2_SPTR replacement = u32_strconv_from_arg(replace, config_encoding); + if (replacement == NULL) { + return NULL; + } uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED | (global ? 
PCRE2_SUBSTITUTE_GLOBAL : 0); PCRE2_SIZE outlen = input_len * 2; /* estimated length of output buffer in characters, fine if too small */ @@ -101,7 +108,7 @@ uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, cons PCRE2_UCHAR buffer[256]; pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer)); /* buffer will normally contain "invalid replacement string" */ - fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_locale(buffer)); + fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_output(buffer)); BFREE(output); return NULL; } diff --git a/src/remove.c b/src/remove.c index 3088778..d162e46 100644 --- a/src/remove.c +++ b/src/remove.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include "shape.h" @@ -1098,7 +1097,7 @@ void output_input(const int trim_only) indent = 0; } - fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_locale(advance32(input.lines[j].mbtext, indent)), + fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_output(advance32(input.lines[j].mbtext, indent)), (input.final_newline || j < input.anz_lines - 1 ? 
"\n" : "")); BFREE (indentspc); } diff --git a/src/tools.c b/src/tools.c index 2398a12..5dd85c7 100644 --- a/src/tools.c +++ b/src/tools.c @@ -30,7 +30,6 @@ #include #include -#include #include #include #include @@ -514,7 +513,7 @@ void print_input_lines(const char *heading) fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n"); for (size_t i = 0; i < input.anz_lines; ++i) { fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i, - (int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext), + (int) input.lines[i].num_chars, u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].len, input.lines[i].text); fprintf(stderr, "\tTabs: ["); if (input.lines[i].tabpos != NULL) { diff --git a/src/unicode.c b/src/unicode.c index 57ef2c7..b49af3f 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -24,22 +24,42 @@ #include "config.h" #include +#include #include +#include +#include #include #include +#include "boxes.h" +#include "tools.h" #include "unicode.h" -const char *encoding; /* the character encoding that we use */ -const ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */ -const ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */ -const ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */ -const ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */ -const ucs4_t char_esc = 0x0000001b; /* ucs4_t character 0x1b (escape) */ -const ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */ +const char *config_encoding = "ISO_8859-15"; + +/* effective character encoding of input and output text */ +const char *encoding; + +/* ucs4_t character '\t' (tab) */ +const ucs4_t char_tab = 0x00000009; + +/* ucs4_t character ' ' (space) */ +const ucs4_t char_space = 0x00000020; + +/* ucs4_t character '\r' (carriage return) */ +const ucs4_t char_cr = 0x0000000d; + +/* ucs4_t character '\n' (newline) */ +const ucs4_t char_newline = 0x0000000a; + +/* ucs4_t 
character 0x1b (escape) */ +const ucs4_t char_esc = 0x0000001b; + +/* ucs4_t character '\0' (zero) */ +const ucs4_t char_nul = 0x00000000; @@ -176,4 +196,78 @@ uint32_t *advance32(uint32_t *s, const size_t offset) } + +uint32_t *u32_strconv_from_input(const char *src) /* decodes src from the global encoding by delegating to u32_strconv_from_arg() */ +{ + return u32_strconv_from_arg(src, encoding); +} + + + +uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding) /* converts src from sourceEncoding to UTF-32; yields NULL for NULL input or when the conversion fails */ +{ + if (src == NULL) { + return NULL; + } + if (src[0] == '\0') { /* empty input bypasses the converter; new_empty_string32() supplies the result */ + return new_empty_string32(); + } + + uint32_t *result = u32_strconv_from_encoding( + src, /* the source string to convert */ + sourceEncoding, /* the character encoding from which to convert */ + iconveh_question_mark); /* produce one question mark '?' per unconvertible character */ + + if (result == NULL) { /* conversion failed: report the cause on stderr, then pass the NULL on to the caller */ + fprintf(stderr, "%s: failed to convert from '%s' to UTF-32: %s\n", PROJECT, sourceEncoding, strerror(errno)); + } + return result; +} + + + +char *u32_strconv_to_output(const uint32_t *src) /* encodes src into the global encoding by delegating to u32_strconv_to_arg() */ +{ + return u32_strconv_to_arg(src, encoding); +} + + + +char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding) /* converts UTF-32 src to targetEncoding; yields NULL for NULL input or when the conversion fails */ +{ + if (src == NULL) { + return NULL; + } + if (is_empty(src)) { /* input that is_empty() reports as empty bypasses the converter; the result is a strdup-ed empty string */ + return strdup(""); + } + + char *result = u32_strconv_to_encoding( + src, /* the source string to convert */ + targetEncoding, /* the character encoding to which to convert */ + iconveh_question_mark); /* produce one question mark '?' 
per unconvertible character */ + + if (result == NULL) { /* conversion failed: report the cause on stderr, then pass the NULL on to the caller */ + fprintf(stderr, "%s: failed to convert from UTF-32 to '%s': %s\n", PROJECT, targetEncoding, strerror(errno)); + } + return result; +} + + + +const char *check_encoding(const char *manual_encoding, const char *system_encoding) /* picks manual_encoding when a trial conversion accepts it, system_encoding otherwise */ +{ + if (manual_encoding != NULL) { + uint32_t *unicode = u32_strconv_from_encoding(" ", manual_encoding, iconveh_error); /* trial conversion of a single space; a NULL result marks the encoding as unusable */ + if (unicode != NULL) { + BFREE(unicode); /* the probe result itself is not needed */ + return manual_encoding; + } + fprintf(stderr, "%s: Invalid character encoding: %s - falling back to %s\n", + PROJECT, manual_encoding, system_encoding); + } + return system_encoding; /* reached when no override was given or the override was rejected */ +} + + /*EOF*/ /* vim: set sw=4: */ diff --git a/src/unicode.h b/src/unicode.h index cbfc14f..d67da14 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -28,16 +28,31 @@ #include -extern const char *encoding; /* the character encoding that we use */ -extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */ -extern const ucs4_t char_space; /* ucs4_t character ' ' (space) */ -extern const ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */ -extern const ucs4_t char_newline; /* ucs4_t character '\n' (newline) */ -extern const ucs4_t char_esc; /* ucs4_t character 0x1b (escape) */ -extern const ucs4_t char_nul; /* ucs4_t character '\0' (zero) */ +/** The boxes config file is still encoded with a single-byte character set. Officially, it is ASCII! + * However, people might not conform to this, so we use ISO_8859-15 as a reasonable superset. 
*/ +extern const char *config_encoding; +/** the character encoding of input (and output) text */ +extern const char *encoding; +/** ucs4_t character '\t' (tab) */ +extern const ucs4_t char_tab; + +/** ucs4_t character ' ' (space) */ +extern const ucs4_t char_space; + +/** ucs4_t character '\r' (carriage return) */ +extern const ucs4_t char_cr; + +/** ucs4_t character '\n' (newline) */ +extern const ucs4_t char_newline; + +/** ucs4_t character 0x1b (escape) */ +extern const ucs4_t char_esc; + +/** ucs4_t character '\0' (zero) */ +extern const ucs4_t char_nul; int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char); @@ -75,6 +90,55 @@ uint32_t *advance_next32(const uint32_t *s, size_t *invis); */ uint32_t *advance32(uint32_t *s, const size_t offset); +/** + * Convert a string from the input/output encoding (`encoding` in this .h file) to UTF-32 internal representation. + * Memory will be allocated for the converted string. + * + * @param src string to convert, zero-terminated + * @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr) + */ +uint32_t *u32_strconv_from_input(const char *src); + +/** + * Convert a string from the given source encoding to UTF-32 internal representation. + * Memory will be allocated for the converted string. + * + * @param src string to convert, zero-terminated + * @param sourceEncoding the character encoding of `src` + * @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr) + */ +uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding); + +/** + * Convert a string from UTF-32 internal representation to input/output encoding (`encoding` in this .h file). + * Memory will be allocated for the converted string. 
+ * + * @param src UTF-32 string to convert, zero-terminated + * @return string in input/output encoding, or NULL on error (then an error message was already printed on stderr) + */ +char *u32_strconv_to_output(const uint32_t *src); + +/** + * Convert a string from UTF-32 internal representation to the given target encoding. + * Memory will be allocated for the converted string. + * + * @param src UTF-32 string to convert, zero-terminated + * @param targetEncoding the character encoding of the result + * @return string in target encoding, or NULL in case of error (then an error message was already printed on stderr) + */ +char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding); + +/** + * Check if the given `manual_encoding` can be used to convert anything. This should reveal invalid encoding names that + * have been specified on the command line. If no `manual_encoding` was specified, or if an invalid encoding is + * detected, we fall back to the system encoding. No new memory is allocated. + * + * @param manual_encoding the encoding set on the command line, may be NULL + * @param system_encoding the system encoding + * @return `manual_encoding` if it is set to a valid value, otherwise `system_encoding` + */ +const char *check_encoding(const char *manual_encoding, const char *system_encoding); + +#endif diff --git a/test/111_manual_encoding_iso.txt b/test/111_manual_encoding_iso.txt new file mode 100644 index 0000000..3a47b0b --- /dev/null +++ b/test/111_manual_encoding_iso.txt @@ -0,0 +1,24 @@ +:ARGS +-ac -n ISO_8859-15 +:INPUT + ä + äb + äbç + äbçd + äbçdé + äbçdéf + äbçdéfg + äbçdéfgh +:OUTPUT-FILTER +:EXPECTED + /**************/ + /* ä */ + /* äb */ + /* äbç */ + /* äbçd */ + /* äbçdé */ + /* äbçdéf */ + /* äbçdéfg */ + /* äbçdéfgh */ + /**************/ +:EOF