Add command line option -n to set character encoding of input text #1

This commit is contained in:
Thomas Jensen 2021-02-09 22:16:01 +01:00
parent 4c656727ec
commit 44c2c526af
No known key found for this signature in database
GPG Key ID: A4ACEE270D0FB7DB
12 changed files with 262 additions and 38 deletions

13
.gitattributes vendored Normal file
View File

@ -0,0 +1,13 @@
* text=auto
# shell scripts
*.sh text eol=lf
# Windows batch files
*.bat text eol=crlf
# the test cases
/test/*.txt text eol=lf
# special test case for testing ISO encoding
/test/111_manual_encoding_iso.txt text working-tree-encoding=ISO_8859-15

View File

@ -10,7 +10,7 @@ boxes \- text mode box and comment drawing filter
.SH SYNOPSIS
.B boxes
[\-hlmrv] [\-a\ format] [\-d\ design] [\-f\ file] [\-i\ indent] [\-k\ bool]
[\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]]
[\-n\ encoding] [\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]]
.SH DESCRIPTION
.I Boxes
is a text filter which can draw any kind of box around its input text. Box
@ -185,6 +185,14 @@ padding, indentation, etc. for the mended box. Implies
false.
.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
.TP 0.6i
.B \-n \fIencoding\fP
Character encoding. Overrides the character encoding of the input and output
text. Choose from the list shown by \fIiconv \-l\fP. If an invalid character
encoding is specified here, the system encoding is used as a fallback. The default
is to use the system encoding, which is normally the best course of action.
So don't specify this option unless you have to.
.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
.TP 0.6i
.B \-p \fIstring\fP
Padding. Specify padding in spaces around the input text block for all
sides of the box. The argument string may not contain whitespace and must
@ -357,4 +365,5 @@ configuration file (takes precedence over system-wide configuration file)
system\-wide configuration file
.\" =======================================================================
.SH "SEE ALSO"
.I figlet(6)
.BR figlet (6),
.BR iconv (1)

View File

@ -79,7 +79,7 @@ lex.yy.c: lexer.l boxes.h
cat lexer.tmp.c >> lex.yy.c
rm lexer.tmp.c
# TODO In the end, check declared dependencies again
boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h
tools.o: tools.c tools.h boxes.h shape.h config.h
unicode.o: unicode.c unicode.h config.h

View File

@ -29,7 +29,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <uniconv.h>
#include <unictype.h>
#include <unistdio.h>
#include <unistr.h>
@ -107,6 +106,7 @@ static void usage(FILE *st)
fprintf(st, " -k bool leading/trailing blank line retention on removal\n");
fprintf(st, " -l list available box designs w/ samples\n");
fprintf(st, " -m mend box, i.e. remove it and redraw it afterwards\n");
fprintf(st, " -n enc Character encoding of input and output\n");
fprintf(st, " -p fmt padding [default: none]\n");
/* fprintf(st, " -q modify command for needs of the web UI (undocumented)\n"); */
fprintf(st, " -r remove box\n");
@ -370,6 +370,7 @@ static int process_commandline(int argc, char *argv[])
opt.tabstop = DEF_TABSTOP;
opt.tabexp = 'e';
opt.killblank = -1;
opt.encoding = NULL;
for (idummy = 0; idummy < ANZ_SIDES; ++idummy) {
opt.padding[idummy] = -1;
}
@ -388,7 +389,7 @@ static int process_commandline(int argc, char *argv[])
* Parse Command Line
*/
do {
oc = getopt(argc, argv, "a:c:d:f:hi:k:lmp:qrs:t:v");
oc = getopt(argc, argv, "a:c:d:f:hi:k:lmn:p:qrs:t:v");
switch (oc) {
@ -580,6 +581,17 @@ static int process_commandline(int argc, char *argv[])
opt.killblank = 0;
break;
case 'n':
/*
* Character encoding
*/
opt.encoding = (char *) strdup(optarg);
if (opt.encoding == NULL) {
perror(PROJECT);
return 1;
}
break;
case 'p':
/*
* Padding. format is ([ahvtrbl]n)+
@ -1257,6 +1269,9 @@ static int apply_substitutions(const int mode)
/*
* Compile regular expressions
*/
#ifdef REGEXP_DEBUG
fprintf(stderr, "Compiling %d %s rule patterns\n", (int) anz_rules, mode ? "reversion" : "replacement");
#endif
errno = 0;
opt.design->current_rule = rules;
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
@ -1278,13 +1293,13 @@ static int apply_substitutions(const int mode)
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
#ifdef REGEXP_DEBUG
fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ",
rules[j].prog, rules[j].repstr, u32_strconv_to_locale(input.lines[k].mbtext),
rules[j].prog, rules[j].repstr, u32_strconv_to_output(input.lines[k].mbtext),
(int) input.lines[k].num_chars, rules[j].mode);
#endif
uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr,
input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g');
#ifdef REGEXP_DEBUG
fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_locale(newtext) : "NULL");
fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_output(newtext) : "NULL");
#endif
if (newtext == NULL) {
return 1;
@ -1298,7 +1313,7 @@ static int apply_substitutions(const int mode)
#ifdef REGEXP_DEBUG
fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k,
(int) input.lines[k].num_chars, u32_strconv_to_locale(input.lines[k].mbtext));
(int) input.lines[k].num_chars, u32_strconv_to_output(input.lines[k].mbtext));
#endif
}
opt.design->current_rule = NULL;
@ -1393,7 +1408,7 @@ static int read_all_input(const int use_stdin)
input.lines = tmp;
}
mbtemp = u32_strconv_from_locale(buf);
mbtemp = u32_strconv_from_input(buf);
len_chars = u32_strlen(mbtemp);
input.final_newline = has_linebreak(mbtemp, len_chars);
input.lines[input.anz_lines].posmap = NULL;
@ -1484,7 +1499,7 @@ static int read_all_input(const int use_stdin)
for (i = 0; i < input.anz_lines; ++i) {
#ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
#endif
if (input.lines[i].num_chars >= input.indent) {
memmove(input.lines[i].text, input.lines[i].text + input.indent,
@ -1496,7 +1511,7 @@ static int read_all_input(const int use_stdin)
}
#ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
#endif
}
input.maxline -= input.indent;
@ -1557,7 +1572,7 @@ int main(int argc, char *argv[])
* Store system character encoding
*/
setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */
encoding = locale_charset();
encoding = check_encoding(opt.encoding, locale_charset());
#ifdef DEBUG
fprintf (stderr, "Character Encoding = %s\n", encoding);
#endif

View File

@ -139,6 +139,7 @@ typedef struct { /* Command line options: */
char indentmode; /* 'b', 't', 'n', or '\0' */
char justify; /* 'l', 'c', 'r', or '\0' */
int killblank; /* -1 if not set */
char *encoding; /* character encoding override for input and output text */
FILE *infile; /* where we get our input */
FILE *outfile; /* where we put our output */
} opt_t;

View File

@ -28,7 +28,6 @@
#include <stdint.h>
#include <string.h>
#include <uniconv.h>
#include <unistr.h>
#include "shape.h"
@ -729,7 +728,7 @@ static int justify_line(line_t *line, int skew)
#if defined(DEBUG) || 0
fprintf (stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d",
opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_locale(line->mbtext),
opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_output(line->mbtext),
(int) line->len, line->text, (int) line->invis, skew);
#endif
@ -998,7 +997,7 @@ int output_box(const sentry_t *thebox)
concat_strings(obuf, LINE_MAX_BYTES + 1, 8, restored_indent,
skip_left ? "" : thebox[BLEF].chars[j], hfill1,
ti >= 0 && shift > 0 ? nspaces(shift) : "",
ti >= 0 ? u32_strconv_to_locale(mbtext_shifted) : "",
ti >= 0 ? u32_strconv_to_output(mbtext_shifted) : "",
hfill2, nspaces(input.maxline - input.lines[ti].len - shift),
thebox[BRIG].chars[j]);
}

View File

@ -26,9 +26,10 @@
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <uniconv.h>
#include <string.h>
#include "tools.h"
#include "unicode.h"
#include "regulex.h"
@ -37,7 +38,10 @@ pcre2_code *compile_pattern(char *pattern)
{
int errornumber;
PCRE2_SIZE erroroffset;
PCRE2_SPTR pattern32 = u32_strconv_from_locale(pattern);
PCRE2_SPTR pattern32 = u32_strconv_from_arg(pattern, config_encoding);
if (pattern32 == NULL) {
return NULL;
}
pcre2_code *re = pcre2_compile(
pattern32, /* the pattern */
@ -51,7 +55,7 @@ pcre2_code *compile_pattern(char *pattern)
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n",
pattern, (int) erroroffset, u32_strconv_to_locale(buffer));
pattern, (int) erroroffset, u32_strconv_to_output(buffer));
}
return re;
}
@ -60,7 +64,10 @@ pcre2_code *compile_pattern(char *pattern)
uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global)
{
PCRE2_SPTR replacement = u32_strconv_from_locale(replace);
PCRE2_SPTR replacement = u32_strconv_from_arg(replace, config_encoding);
if (replacement == NULL) {
return NULL;
}
uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED
| (global ? PCRE2_SUBSTITUTE_GLOBAL : 0);
PCRE2_SIZE outlen = input_len * 2; /* estimated length of output buffer in characters, fine if too small */
@ -101,7 +108,7 @@ uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, cons
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer));
/* buffer will normally contain "invalid replacement string" */
fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_locale(buffer));
fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_output(buffer));
BFREE(output);
return NULL;
}

View File

@ -26,7 +26,6 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <uniconv.h>
#include <unistr.h>
#include "shape.h"
@ -1098,7 +1097,7 @@ void output_input(const int trim_only)
indent = 0;
}
fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_locale(advance32(input.lines[j].mbtext, indent)),
fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_output(advance32(input.lines[j].mbtext, indent)),
(input.final_newline || j < input.anz_lines - 1 ? "\n" : ""));
BFREE (indentspc);
}

View File

@ -30,7 +30,6 @@
#include <string.h>
#include <strings.h>
#include <uniconv.h>
#include <unictype.h>
#include <unistr.h>
#include <unitypes.h>
@ -514,7 +513,7 @@ void print_input_lines(const char *heading)
fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
for (size_t i = 0; i < input.anz_lines; ++i) {
fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i,
(int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext),
(int) input.lines[i].num_chars, u32_strconv_to_output(input.lines[i].mbtext),
(int) input.lines[i].len, input.lines[i].text);
fprintf(stderr, "\tTabs: [");
if (input.lines[i].tabpos != NULL) {

View File

@ -24,22 +24,42 @@
#include "config.h"
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <uniconv.h>
#include <unictype.h>
#include <unistr.h>
#include "boxes.h"
#include "tools.h"
#include "unicode.h"
const char *encoding; /* the character encoding that we use */
const ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */
const ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */
const ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */
const ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */
const ucs4_t char_esc = 0x0000001b; /* ucs4_t character 0x1b (escape) */
const ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */
const char *config_encoding = "ISO_8859-15";
/* effective character encoding of input and output text */
const char *encoding;
/* ucs4_t character '\t' (tab) */
const ucs4_t char_tab = 0x00000009;
/* ucs4_t character ' ' (space) */
const ucs4_t char_space = 0x00000020;
/* ucs4_t character '\r' (carriage return) */
const ucs4_t char_cr = 0x0000000d;
/* ucs4_t character '\n' (newline) */
const ucs4_t char_newline = 0x0000000a;
/* ucs4_t character 0x1b (escape) */
const ucs4_t char_esc = 0x0000001b;
/* ucs4_t character '\0' (zero) */
const ucs4_t char_nul = 0x00000000;
@ -176,4 +196,78 @@ uint32_t *advance32(uint32_t *s, const size_t offset)
}
/*
 * Convert a string given in the effective input/output encoding (the global
 * `encoding`) to UTF-32.
 *
 * This is a convenience wrapper: it forwards `src` together with `encoding` to
 * u32_strconv_from_arg() and hands back whatever that function returns.
 *
 * src      the zero-terminated string to convert
 * returns  the result of u32_strconv_from_arg(src, encoding)
 */
uint32_t *u32_strconv_from_input(const char *src)
{
    uint32_t *converted = u32_strconv_from_arg(src, encoding);
    return converted;
}
/*
 * Convert a string from the given source encoding to UTF-32.
 *
 * src             the zero-terminated string to convert, may be NULL
 * sourceEncoding  name of the character encoding in which `src` is given
 * returns         NULL if `src` is NULL;
 *                 the result of new_empty_string32() if `src` is the empty string;
 *                 otherwise the result of u32_strconv_from_encoding(). When that
 *                 result is NULL, a message is printed on stderr first.
 */
uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding)
{
    uint32_t *converted = NULL;

    if (src != NULL) {
        if (src[0] == '\0') {
            /* empty input is not passed to the converter at all */
            converted = new_empty_string32();
        }
        else {
            /* iconveh_question_mark: produce one question mark '?' per unconvertible character */
            converted = u32_strconv_from_encoding(src, sourceEncoding, iconveh_question_mark);
            if (converted == NULL) {
                const int conv_errno = errno;   /* pick up the error code before doing any output */
                fprintf(stderr, "%s: failed to convert from '%s' to UTF-32: %s\n",
                        PROJECT, sourceEncoding, strerror(conv_errno));
            }
        }
    }
    return converted;
}
/*
 * Convert a UTF-32 string to the effective input/output encoding (the global
 * `encoding`).
 *
 * This is a convenience wrapper: it forwards `src` together with `encoding` to
 * u32_strconv_to_arg() and hands back whatever that function returns.
 *
 * src      the zero-terminated UTF-32 string to convert
 * returns  the result of u32_strconv_to_arg(src, encoding)
 */
char *u32_strconv_to_output(const uint32_t *src)
{
    char *converted = u32_strconv_to_arg(src, encoding);
    return converted;
}
/*
 * Convert a UTF-32 string to the given target encoding.
 *
 * src             the zero-terminated UTF-32 string to convert, may be NULL
 * targetEncoding  name of the character encoding of the result
 * returns         NULL if `src` is NULL;
 *                 a strdup()ed empty string if is_empty(src) holds;
 *                 otherwise the result of u32_strconv_to_encoding(). When that
 *                 result is NULL, a message is printed on stderr first.
 */
char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding)
{
    char *converted = NULL;

    if (src != NULL) {
        if (is_empty(src)) {
            /* empty input is not passed to the converter at all */
            converted = strdup("");
        }
        else {
            /* iconveh_question_mark: produce one question mark '?' per unconvertible character */
            converted = u32_strconv_to_encoding(src, targetEncoding, iconveh_question_mark);
            if (converted == NULL) {
                const int conv_errno = errno;   /* pick up the error code before doing any output */
                fprintf(stderr, "%s: failed to convert from UTF-32 to '%s': %s\n",
                        PROJECT, targetEncoding, strerror(conv_errno));
            }
        }
    }
    return converted;
}
/*
 * Decide which character encoding to use for input and output text.
 *
 * The manually specified encoding is probed by running a trial conversion of a
 * single space character through u32_strconv_from_encoding() with the
 * iconveh_error handler. If the probe yields a result, the manual encoding is
 * accepted; otherwise a message is printed on stderr and the system encoding
 * is used instead.
 *
 * manual_encoding  the encoding requested by the user, or NULL if none was given
 * system_encoding  the encoding to fall back to
 * returns          `manual_encoding` if it was given and the probe succeeded,
 *                  `system_encoding` otherwise. Only the pointers passed in are
 *                  returned; the probe result is released again via BFREE.
 */
const char *check_encoding(const char *manual_encoding, const char *system_encoding)
{
    /* nothing requested on the command line: nothing to check */
    if (manual_encoding == NULL) {
        return system_encoding;
    }

    uint32_t *probe = u32_strconv_from_encoding(" ", manual_encoding, iconveh_error);
    if (probe == NULL) {
        fprintf(stderr, "%s: Invalid character encoding: %s - falling back to %s\n",
                PROJECT, manual_encoding, system_encoding);
        return system_encoding;
    }

    /* the converted probe text itself is of no interest */
    BFREE(probe);
    return manual_encoding;
}
/*EOF*/ /* vim: set sw=4: */

View File

@ -28,16 +28,31 @@
#include <unitypes.h>
extern const char *encoding; /* the character encoding that we use */
extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */
extern const ucs4_t char_space; /* ucs4_t character ' ' (space) */
extern const ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */
extern const ucs4_t char_newline; /* ucs4_t character '\n' (newline) */
extern const ucs4_t char_esc; /* ucs4_t character 0x1b (escape) */
extern const ucs4_t char_nul; /* ucs4_t character '\0' (zero) */
/** The boxes config file is still encoded with a single-byte character set. Officially, it is ASCII!
* However, people might not conform to this, so we use ISO_8859-15 as a reasonable superset. */
extern const char *config_encoding;
/** the character encoding of input (and output) text */
extern const char *encoding;
/** ucs4_t character '\t' (tab) */
extern const ucs4_t char_tab;
/** ucs4_t character ' ' (space) */
extern const ucs4_t char_space;
/** ucs4_t character '\r' (carriage return) */
extern const ucs4_t char_cr;
/** ucs4_t character '\n' (newline) */
extern const ucs4_t char_newline;
/** ucs4_t character 0x1b (escape) */
extern const ucs4_t char_esc;
/** ucs4_t character '\0' (zero) */
extern const ucs4_t char_nul;
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);
@ -75,6 +90,55 @@ uint32_t *advance_next32(const uint32_t *s, size_t *invis);
*/
uint32_t *advance32(uint32_t *s, const size_t offset);
/**
* Convert a string from the input/output encoding (`encoding` in this .h file) to UTF-32 internal representation.
* Memory will be allocated for the converted string.
*
* @param <src> string to convert, zero-terminated
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
*/
uint32_t *u32_strconv_from_input(const char *src);
/**
* Convert a string from the given source encoding to UTF-32 internal representation.
* Memory will be allocated for the converted string.
*
* @param <src> string to convert, zero-terminated
* @param <sourceEncoding> the character encoding of <src>
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
*/
uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding);
/**
* Convert a string from UTF-32 internal representation to input/output encoding (`encoding` in this .h file).
* Memory will be allocated for the converted string.
*
* @param <src> UTF-32 string to convert, zero-terminated
* @return string in input/output encoding, or NULL on error (then an error message was already printed on stderr)
*/
char *u32_strconv_to_output(const uint32_t *src);
/**
* Convert a string from UTF-32 internal representation to the given target encoding.
* Memory will be allocated for the converted string.
*
* @param <src> UTF-32 string to convert, zero-terminated
* @param <targetEncoding> the character encoding of the result
* @return string in target encoding, or NULL in case of error (then an error message was already printed on stderr)
*/
char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding);
/**
* Check if the given <manual_encoding> can be used to convert anything. This should reveal invalid encoding names that
* have been specified on the command line. If no <manual_encoding> was specified, or if an invalid encoding is
* detected, we fall back to the system encoding. No new memory is allocated.
*
* @param <manual_encoding> the encoding set on the command line, may be NULL
* @param <system_encoding> the system encoding
* @return <manual_encoding> if it is set to a valid value, <system_encoding> otherwise
*/
const char *check_encoding(const char *manual_encoding, const char *system_encoding);
#endif

View File

@ -0,0 +1,24 @@
:ARGS
-ac -n ISO_8859-15
:INPUT
ä
äb
äbç
äbçd
äbçdé
äbçdéf
äbçdéfg
äbçdéfgh
:OUTPUT-FILTER
:EXPECTED
/**************/
/* ä */
/* äb */
/* äbç */
/* äbçd */
/* äbçdé */
/* äbçdéf */
/* äbçdéfg */
/* äbçdéfgh */
/**************/
:EOF