Add command line option -n to set character encoding of input text #1

2025-06-25 12:12:18 +02:00 · 2021-02-09 22:16:01 +01:00 · 2021-02-09 22:16:01 +01:00 · 44c2c526af
commit 44c2c526af
parent 4c656727ec
12 changed files with 262 additions and 38 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,13 @@
+* text=auto
+
+# shell scripts
+*.sh text eol=lf
+
+# Windows batch files
+*.bat text eol=crlf
+
+# the test cases
+/test/*.txt text eol=lf
+
+# special test case for testing ISO encoding
+/test/111_manual_encoding_iso.txt text working-tree-encoding=ISO_8859-15
--- a/doc/boxes.1.in
+++ b/doc/boxes.1.in
@ -10,7 +10,7 @@ boxes \- text mode box and comment drawing filter
 .SH SYNOPSIS
 .B boxes
 [\-hlmrv] [\-a\ format] [\-d\ design] [\-f\ file] [\-i\ indent] [\-k\ bool]
-[\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]]
+[\-n\ encoding] [\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]]
 .SH DESCRIPTION
 .I Boxes
 is a text filter which can draw any kind of box around its input text. Box
@ -185,6 +185,14 @@ padding, indentation, etc. for the mended box. Implies
 false.
 .\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 .TP 0.6i
+.B \-n \fIencoding\fP
+Character encoding. Overrides the character encoding of the input and output
+text. Choose from the list shown by \fIiconv \-l\fP. If an invalid character
+encoding is specified, the system encoding is used as a fallback. The default
+is to use the system encoding, which is normally the best course of action.
+So don't specify this option unless you have to.
+.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+.TP 0.6i
 .B \-p \fIstring\fP
 Padding. Specify padding in spaces around the input text block for all
 sides of the box. The argument string may not contain whitespace and must
@ -357,4 +365,5 @@ configuration file (takes precedence over system-wide configuration file)
 system\-wide configuration file
 .\" =======================================================================
 .SH "SEE ALSO"
-.I figlet(6)
+.BR figlet (6),
+.BR iconv (1)
--- a/src/Makefile
+++ b/src/Makefile
@ -79,7 +79,7 @@ lex.yy.c: lexer.l boxes.h
 	cat lexer.tmp.c >> lex.yy.c
 	rm lexer.tmp.c

-
+# TODO In the end, check declared dependencies again
 boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h
 tools.o: tools.c tools.h boxes.h shape.h config.h
 unicode.o: unicode.c unicode.h config.h
--- a/src/boxes.c
+++ b/src/boxes.c
@ -29,7 +29,6 @@
 #include <sys/types.h>
 #include <sys/stat.h>

-#include <uniconv.h>
 #include <unictype.h>
 #include <unistdio.h>
 #include <unistr.h>
@ -107,6 +106,7 @@ static void usage(FILE *st)
    fprintf(st, "        -k bool  leading/trailing blank line retention on removal\n");
    fprintf(st, "        -l       list available box designs w/ samples\n");
    fprintf(st, "        -m       mend box, i.e. remove it and redraw it afterwards\n");
+    fprintf(st, "        -n enc   Character encoding of input and output\n");
    fprintf(st, "        -p fmt   padding [default: none]\n");
    /* fprintf(st, "        -q       modify command for needs of the web UI (undocumented)\n"); */
    fprintf(st, "        -r       remove box\n");
@ -370,6 +370,7 @@ static int process_commandline(int argc, char *argv[])
    opt.tabstop = DEF_TABSTOP;
    opt.tabexp = 'e';
    opt.killblank = -1;
+    opt.encoding = NULL;
    for (idummy = 0; idummy < ANZ_SIDES; ++idummy) {
        opt.padding[idummy] = -1;
    }
@ -388,7 +389,7 @@ static int process_commandline(int argc, char *argv[])
     *  Parse Command Line
     */
    do {
-        oc = getopt(argc, argv, "a:c:d:f:hi:k:lmp:qrs:t:v");
+        oc = getopt(argc, argv, "a:c:d:f:hi:k:lmn:p:qrs:t:v");

        switch (oc) {

@ -580,6 +581,17 @@ static int process_commandline(int argc, char *argv[])
                opt.killblank = 0;
                break;

+            case 'n':
+                /*
+                 *  Character encoding
+                 */
+                opt.encoding = (char *) strdup(optarg);
+                if (opt.encoding == NULL) {
+                    perror(PROJECT);
+                    return 1;
+                }
+                break;
+
            case 'p':
                /*
                 *  Padding. format is ([ahvtrbl]n)+
@ -1257,6 +1269,9 @@ static int apply_substitutions(const int mode)
    /*
     *  Compile regular expressions
     */
+    #ifdef REGEXP_DEBUG
+        fprintf(stderr, "Compiling %d %s rule patterns\n", (int) anz_rules, mode ? "reversion" : "replacement");
+    #endif
    errno = 0;
    opt.design->current_rule = rules;
    for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
@ -1278,13 +1293,13 @@ static int apply_substitutions(const int mode)
        for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
            #ifdef REGEXP_DEBUG
            fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ",
-                    rules[j].prog, rules[j].repstr, u32_strconv_to_locale(input.lines[k].mbtext),
+                    rules[j].prog, rules[j].repstr, u32_strconv_to_output(input.lines[k].mbtext),
                    (int) input.lines[k].num_chars, rules[j].mode);
            #endif
            uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr,
                                              input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g');
            #ifdef REGEXP_DEBUG
-                fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_locale(newtext) : "NULL");
+                fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_output(newtext) : "NULL");
            #endif
            if (newtext == NULL) {
                return 1;
@ -1298,7 +1313,7 @@ static int apply_substitutions(const int mode)

            #ifdef REGEXP_DEBUG
                fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k,
-                    (int) input.lines[k].num_chars, u32_strconv_to_locale(input.lines[k].mbtext));
+                    (int) input.lines[k].num_chars, u32_strconv_to_output(input.lines[k].mbtext));
            #endif
        }
        opt.design->current_rule = NULL;
@ -1393,7 +1408,7 @@ static int read_all_input(const int use_stdin)
                input.lines = tmp;
            }

-            mbtemp = u32_strconv_from_locale(buf);
+            mbtemp = u32_strconv_from_input(buf);
            len_chars = u32_strlen(mbtemp);
            input.final_newline = has_linebreak(mbtemp, len_chars);
            input.lines[input.anz_lines].posmap = NULL;
@ -1484,7 +1499,7 @@ static int read_all_input(const int use_stdin)
        for (i = 0; i < input.anz_lines; ++i) {
            #ifdef DEBUG
                fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
-                    u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
+                    u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
            #endif
            if (input.lines[i].num_chars >= input.indent) {
                memmove(input.lines[i].text, input.lines[i].text + input.indent,
@ -1496,7 +1511,7 @@ static int read_all_input(const int use_stdin)
            }
            #ifdef DEBUG
                fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
-                    u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
+                    u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
            #endif
        }
        input.maxline -= input.indent;
@ -1557,7 +1572,7 @@ int main(int argc, char *argv[])
     * Store system character encoding
     */
    setlocale(LC_ALL, "");    /* switch from default "C" encoding to system encoding */
-    encoding = locale_charset();
+    encoding = check_encoding(opt.encoding, locale_charset());
    #ifdef DEBUG
        fprintf (stderr, "Character Encoding = %s\n", encoding);
    #endif
--- a/src/boxes.h.in
+++ b/src/boxes.h.in
@ -139,6 +139,7 @@ typedef struct {                         /* Command line options: */
    char      indentmode;                /* 'b', 't', 'n', or '\0' */
    char      justify;                   /* 'l', 'c', 'r', or '\0' */
    int       killblank;                 /* -1 if not set */
+    char     *encoding;                  /* character encoding override for input and output text */
    FILE     *infile;                    /* where we get our input */
    FILE     *outfile;                   /* where we put our output */
 } opt_t;
--- a/src/generate.c
+++ b/src/generate.c
@ -28,7 +28,6 @@
 #include <stdint.h>
 #include <string.h>

-#include <uniconv.h>
 #include <unistr.h>

 #include "shape.h"
@ -729,7 +728,7 @@ static int justify_line(line_t *line, int skew)

    #if defined(DEBUG) || 0
        fprintf (stderr, "justify_line(%c):  Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d",
-             opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_locale(line->mbtext),
+             opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_output(line->mbtext),
             (int) line->len, line->text, (int) line->invis, skew);
    #endif

@ -998,7 +997,7 @@ int output_box(const sentry_t *thebox)
                concat_strings(obuf, LINE_MAX_BYTES + 1, 8, restored_indent,
                               skip_left ? "" : thebox[BLEF].chars[j], hfill1,
                               ti >= 0 && shift > 0 ? nspaces(shift) : "",
-                               ti >= 0 ? u32_strconv_to_locale(mbtext_shifted) : "",
+                               ti >= 0 ? u32_strconv_to_output(mbtext_shifted) : "",
                               hfill2, nspaces(input.maxline - input.lines[ti].len - shift),
                               thebox[BRIG].chars[j]);
            }
--- a/src/regulex.c
+++ b/src/regulex.c
@ -26,9 +26,10 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <uniconv.h>
+#include <string.h>

 #include "tools.h"
+#include "unicode.h"
 #include "regulex.h"


@ -37,7 +38,10 @@ pcre2_code *compile_pattern(char *pattern)
 {
    int errornumber;
    PCRE2_SIZE erroroffset;
-    PCRE2_SPTR pattern32 = u32_strconv_from_locale(pattern);
+    PCRE2_SPTR pattern32 = u32_strconv_from_arg(pattern, config_encoding);
+    if (pattern32 == NULL) {
+        return NULL;
+    }

    pcre2_code *re = pcre2_compile(
            pattern32,               /* the pattern */
@ -51,7 +55,7 @@ pcre2_code *compile_pattern(char *pattern)
        PCRE2_UCHAR buffer[256];
        pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
        fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n",
-                pattern, (int) erroroffset, u32_strconv_to_locale(buffer));
+                pattern, (int) erroroffset, u32_strconv_to_output(buffer));
    }
    return re;
 }
@ -60,7 +64,10 @@ pcre2_code *compile_pattern(char *pattern)

 uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global)
 {
-    PCRE2_SPTR replacement = u32_strconv_from_locale(replace);
+    PCRE2_SPTR replacement = u32_strconv_from_arg(replace, config_encoding);
+    if (replacement == NULL) {
+        return NULL;
+    }
    uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED
            | (global ? PCRE2_SUBSTITUTE_GLOBAL : 0);
    PCRE2_SIZE outlen = input_len * 2;     /* estimated length of output buffer in characters, fine if too small */
@ -101,7 +108,7 @@ uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, cons
        PCRE2_UCHAR buffer[256];
        pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer));
        /* buffer will normally contain "invalid replacement string" */
-        fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_locale(buffer));
+        fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_output(buffer));
        BFREE(output);
        return NULL;
    }
--- a/src/remove.c
+++ b/src/remove.c
@ -26,7 +26,6 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include <uniconv.h>
 #include <unistr.h>

 #include "shape.h"
@ -1098,7 +1097,7 @@ void output_input(const int trim_only)
            indent = 0;
        }

-        fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_locale(advance32(input.lines[j].mbtext, indent)),
+        fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_output(advance32(input.lines[j].mbtext, indent)),
                (input.final_newline || j < input.anz_lines - 1 ? "\n" : ""));
        BFREE (indentspc);
    }
--- a/src/tools.c
+++ b/src/tools.c
@ -30,7 +30,6 @@
 #include <string.h>
 #include <strings.h>

-#include <uniconv.h>
 #include <unictype.h>
 #include <unistr.h>
 #include <unitypes.h>
@ -514,7 +513,7 @@ void print_input_lines(const char *heading)
    fprintf(stderr, "     [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
    for (size_t i = 0; i < input.anz_lines; ++i) {
        fprintf(stderr, "%4d [%02d] \"%s\"  [%02d] \"%s\"", (int) i,
-                (int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext),
+                (int) input.lines[i].num_chars, u32_strconv_to_output(input.lines[i].mbtext),
                (int) input.lines[i].len, input.lines[i].text);
        fprintf(stderr, "\tTabs: [");
        if (input.lines[i].tabpos != NULL) {
--- a/src/unicode.c
+++ b/src/unicode.c
@ -24,22 +24,42 @@

 #include "config.h"
 #include <errno.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>

+#include <uniconv.h>
 #include <unictype.h>
 #include <unistr.h>

+#include "boxes.h"
+#include "tools.h"
 #include "unicode.h"


-const char *encoding;                          /* the character encoding that we use */

-const ucs4_t char_tab     = 0x00000009;        /* ucs4_t character '\t' (tab)  */
-const ucs4_t char_space   = 0x00000020;        /* ucs4_t character ' '  (space) */
-const ucs4_t char_cr      = 0x0000000d;        /* ucs4_t character '\r' (carriage return) */
-const ucs4_t char_newline = 0x0000000a;        /* ucs4_t character '\n' (newline) */
-const ucs4_t char_esc     = 0x0000001b;        /* ucs4_t character 0x1b (escape)  */
-const ucs4_t char_nul     = 0x00000000;        /* ucs4_t character '\0' (zero) */
+const char *config_encoding = "ISO_8859-15";
+
+/* effective character encoding of input and output text */
+const char *encoding;
+
+/* ucs4_t character '\t' (tab)  */
+const ucs4_t char_tab = 0x00000009;
+
+/* ucs4_t character ' '  (space) */
+const ucs4_t char_space = 0x00000020;
+
+/* ucs4_t character '\r' (carriage return) */
+const ucs4_t char_cr = 0x0000000d;
+
+/* ucs4_t character '\n' (newline) */
+const ucs4_t char_newline = 0x0000000a;
+
+/* ucs4_t character 0x1b (escape)  */
+const ucs4_t char_esc = 0x0000001b;
+
+/* ucs4_t character '\0' (zero) */
+const ucs4_t char_nul = 0x00000000;



@ -176,4 +196,78 @@ uint32_t *advance32(uint32_t *s, const size_t offset)
 }


+
+uint32_t *u32_strconv_from_input(const char *src)
+{
+    return u32_strconv_from_arg(src, encoding);
+}
+
+
+
+uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding)
+{
+    if (src == NULL) {
+        return NULL;
+    }
+    if (src[0] == '\0') {
+        return new_empty_string32();
+    }
+
+    uint32_t *result = u32_strconv_from_encoding(
+            src,                    /* the source string to convert */
+            sourceEncoding,         /* the character encoding from which to convert */
+            iconveh_question_mark); /* produce one question mark '?' per unconvertible character */
+
+    if (result == NULL) {
+        fprintf(stderr, "%s: failed to convert from '%s' to UTF-32: %s\n", PROJECT, sourceEncoding, strerror(errno));
+    }
+    return result;
+}
+
+
+
+char *u32_strconv_to_output(const uint32_t *src)
+{
+    return u32_strconv_to_arg(src, encoding);
+}
+
+
+
+char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding)
+{
+    if (src == NULL) {
+        return NULL;
+    }
+    if (is_empty(src)) {
+        return strdup("");
+    }
+
+    char *result = u32_strconv_to_encoding(
+            src,                    /* the source string to convert */
+            targetEncoding,         /* the character encoding to which to convert */
+            iconveh_question_mark); /* produce one question mark '?' per unconvertible character */
+
+    if (result == NULL) {
+        fprintf(stderr, "%s: failed to convert from UTF-32 to '%s': %s\n", PROJECT, targetEncoding, strerror(errno));
+    }
+    return result;
+}
+
+
+
+const char *check_encoding(const char *manual_encoding, const char *system_encoding)
+{
+    if (manual_encoding != NULL) {
+        uint32_t *unicode = u32_strconv_from_encoding(" ", manual_encoding, iconveh_error);
+        if (unicode != NULL) {
+            BFREE(unicode);
+            return manual_encoding;
+        }
+        fprintf(stderr, "%s: Invalid character encoding: %s - falling back to %s\n",
+                PROJECT, manual_encoding, system_encoding);
+    }
+    return system_encoding;
+}
+
+
 /*EOF*/                                                  /* vim: set sw=4: */
--- a/src/unicode.h
+++ b/src/unicode.h
@ -28,16 +28,31 @@
 #include <unitypes.h>


-extern const char *encoding;                   /* the character encoding that we use */

-extern const ucs4_t char_tab;                  /* ucs4_t character '\t' (tab)  */
-extern const ucs4_t char_space;                /* ucs4_t character ' '  (space) */
-extern const ucs4_t char_cr;                   /* ucs4_t character '\r' (carriage return) */
-extern const ucs4_t char_newline;              /* ucs4_t character '\n' (newline) */
-extern const ucs4_t char_esc;                  /* ucs4_t character 0x1b (escape)  */
-extern const ucs4_t char_nul;                  /* ucs4_t character '\0' (zero) */
+/** The boxes config file is still encoded with a single-byte character set. Officially, it is ASCII!
+ *  However, people might not conform to this, so we use ISO_8859-15 as a reasonable superset. */
+extern const char *config_encoding;

+/** the character encoding of input (and output) text */
+extern const char *encoding;

+/** ucs4_t character '\t' (tab)  */
+extern const ucs4_t char_tab;
+
+/** ucs4_t character ' '  (space) */
+extern const ucs4_t char_space;
+
+/** ucs4_t character '\r' (carriage return) */
+extern const ucs4_t char_cr;
+
+/** ucs4_t character '\n' (newline) */
+extern const ucs4_t char_newline;
+
+/** ucs4_t character 0x1b (escape)  */
+extern const ucs4_t char_esc;
+
+/** ucs4_t character '\0' (zero) */
+extern const ucs4_t char_nul;

 int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);

@ -75,6 +90,55 @@ uint32_t *advance_next32(const uint32_t *s, size_t *invis);
 */
 uint32_t *advance32(uint32_t *s, const size_t offset);

+/**
+ * Convert a string from the input/output encoding (`encoding` in this .h file) to UTF-32 internal representation.
+ * Memory will be allocated for the converted string.
+ *
+ * @param <src> string to convert, zero-terminated
+ * @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
+ */
+uint32_t *u32_strconv_from_input(const char *src);
+
+/**
+ * Convert a string from the given source encoding to UTF-32 internal representation.
+ * Memory will be allocated for the converted string.
+ *
+ * @param <src> string to convert, zero-terminated
+ * @param <sourceEncoding> the character encoding of <src>
+ * @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
+ */
+uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding);
+
+/**
+ * Convert a string from UTF-32 internal representation to input/output encoding (`encoding` in this .h file).
+ * Memory will be allocated for the converted string.
+ *
+ * @param <src> UTF-32 string to convert, zero-terminated
+ * @return string in input/output encoding, or NULL on error (then an error message was already printed on stderr)
+ */
+char *u32_strconv_to_output(const uint32_t *src);
+
+/**
+ * Convert a string from UTF-32 internal representation to the given target encoding.
+ * Memory will be allocated for the converted string.
+ *
+ * @param <src> UTF-32 string to convert, zero-terminated
+ * @param <targetEncoding> the character encoding of the result
+ * @return string in target encoding, or NULL in case of error (then an error message was already printed on stderr)
+ */
+char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding);
+
+/**
+ * Check if the given <manual_encoding> can be used to convert anything. This should reveal invalid encoding names that
+ * have been specified on the command line. If no <manual_encoding> was specified, or if an invalid encoding is
+ * detected, we fall back to the system encoding. No new memory is allocated.
+ *
+ * @param <manual_encoding> the encoding set on the command line, may be NULL
+ * @param <system_encoding> the system encoding
+ * @return <manual_encoding> if it is set to a valid value, <system_encoding> otherwise
+ */
+const char *check_encoding(const char *manual_encoding, const char *system_encoding);
+

 #endif

--- a/test/111_manual_encoding_iso.txt
+++ b/test/111_manual_encoding_iso.txt
@ -0,0 +1,24 @@
+:ARGS
+-ac -n ISO_8859-15
+:INPUT
+             ä
+      äb
+      äbç
+    äbçd
+    äbçdé
+    äbçdéf
+     äbçdéfg
+    äbçdéfgh
+:OUTPUT-FILTER
+:EXPECTED
+    /**************/
+    /*     ä      */
+    /*     äb     */
+    /*    äbç     */
+    /*    äbçd    */
+    /*   äbçdé    */
+    /*   äbçdéf   */
+    /*  äbçdéfg   */
+    /*  äbçdéfgh  */
+    /**************/
+:EOF