Add command line option -n to set character encoding of input text #1

This commit is contained in:
Thomas Jensen 2021-02-09 22:16:01 +01:00
parent 4c656727ec
commit 44c2c526af
No known key found for this signature in database
GPG Key ID: A4ACEE270D0FB7DB
12 changed files with 262 additions and 38 deletions

13
.gitattributes vendored Normal file
View File

@ -0,0 +1,13 @@
* text=auto
# shell scripts
*.sh text eol=lf
# Windows batch files
*.bat text eol=crlf
# the test cases
/test/*.txt text eol=lf
# special test case for testing ISO encoding
/test/111_manual_encoding_iso.txt text working-tree-encoding=ISO_8859-15

View File

@ -10,7 +10,7 @@ boxes \- text mode box and comment drawing filter
.SH SYNOPSIS .SH SYNOPSIS
.B boxes .B boxes
[\-hlmrv] [\-a\ format] [\-d\ design] [\-f\ file] [\-i\ indent] [\-k\ bool] [\-hlmrv] [\-a\ format] [\-d\ design] [\-f\ file] [\-i\ indent] [\-k\ bool]
[\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]] [\-n\ encoding] [\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]]
.SH DESCRIPTION .SH DESCRIPTION
.I Boxes .I Boxes
is a text filter which can draw any kind of box around its input text. Box is a text filter which can draw any kind of box around its input text. Box
@ -185,6 +185,14 @@ padding, indentation, etc. for the mended box. Implies
false. false.
.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
.TP 0.6i .TP 0.6i
.B \-n \fIencoding\fP
Character encoding. Overrides the character encoding of the input and output
text. Choose from the list shown by \fIiconv \-l\fP. If an invalid character
encoding is specified here, the system encoding is used as a fallback. The default
is to use the system encoding, which is normally the best course of action.
So don't specify this option unless you have to.
.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
.TP 0.6i
.B \-p \fIstring\fP .B \-p \fIstring\fP
Padding. Specify padding in spaces around the input text block for all Padding. Specify padding in spaces around the input text block for all
sides of the box. The argument string may not contain whitespace and must sides of the box. The argument string may not contain whitespace and must
@ -357,4 +365,5 @@ configuration file (takes precedence over system-wide configuration file)
system\-wide configuration file system\-wide configuration file
.\" ======================================================================= .\" =======================================================================
.SH "SEE ALSO" .SH "SEE ALSO"
.I figlet(6) .BR figlet (6),
.BR iconv (1)

View File

@ -79,7 +79,7 @@ lex.yy.c: lexer.l boxes.h
cat lexer.tmp.c >> lex.yy.c cat lexer.tmp.c >> lex.yy.c
rm lexer.tmp.c rm lexer.tmp.c
# TODO In the end, check declared dependencies again
boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h
tools.o: tools.c tools.h boxes.h shape.h config.h tools.o: tools.c tools.h boxes.h shape.h config.h
unicode.o: unicode.c unicode.h config.h unicode.o: unicode.c unicode.h config.h

View File

@ -29,7 +29,6 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <uniconv.h>
#include <unictype.h> #include <unictype.h>
#include <unistdio.h> #include <unistdio.h>
#include <unistr.h> #include <unistr.h>
@ -107,6 +106,7 @@ static void usage(FILE *st)
fprintf(st, " -k bool leading/trailing blank line retention on removal\n"); fprintf(st, " -k bool leading/trailing blank line retention on removal\n");
fprintf(st, " -l list available box designs w/ samples\n"); fprintf(st, " -l list available box designs w/ samples\n");
fprintf(st, " -m mend box, i.e. remove it and redraw it afterwards\n"); fprintf(st, " -m mend box, i.e. remove it and redraw it afterwards\n");
fprintf(st, " -n enc Character encoding of input and output\n");
fprintf(st, " -p fmt padding [default: none]\n"); fprintf(st, " -p fmt padding [default: none]\n");
/* fprintf(st, " -q modify command for needs of the web UI (undocumented)\n"); */ /* fprintf(st, " -q modify command for needs of the web UI (undocumented)\n"); */
fprintf(st, " -r remove box\n"); fprintf(st, " -r remove box\n");
@ -370,6 +370,7 @@ static int process_commandline(int argc, char *argv[])
opt.tabstop = DEF_TABSTOP; opt.tabstop = DEF_TABSTOP;
opt.tabexp = 'e'; opt.tabexp = 'e';
opt.killblank = -1; opt.killblank = -1;
opt.encoding = NULL;
for (idummy = 0; idummy < ANZ_SIDES; ++idummy) { for (idummy = 0; idummy < ANZ_SIDES; ++idummy) {
opt.padding[idummy] = -1; opt.padding[idummy] = -1;
} }
@ -388,7 +389,7 @@ static int process_commandline(int argc, char *argv[])
* Parse Command Line * Parse Command Line
*/ */
do { do {
oc = getopt(argc, argv, "a:c:d:f:hi:k:lmp:qrs:t:v"); oc = getopt(argc, argv, "a:c:d:f:hi:k:lmn:p:qrs:t:v");
switch (oc) { switch (oc) {
@ -580,6 +581,17 @@ static int process_commandline(int argc, char *argv[])
opt.killblank = 0; opt.killblank = 0;
break; break;
case 'n':
/*
* Character encoding
*/
opt.encoding = (char *) strdup(optarg);
if (opt.encoding == NULL) {
perror(PROJECT);
return 1;
}
break;
case 'p': case 'p':
/* /*
* Padding. format is ([ahvtrbl]n)+ * Padding. format is ([ahvtrbl]n)+
@ -1257,6 +1269,9 @@ static int apply_substitutions(const int mode)
/* /*
* Compile regular expressions * Compile regular expressions
*/ */
#ifdef REGEXP_DEBUG
fprintf(stderr, "Compiling %d %s rule patterns\n", (int) anz_rules, mode ? "reversion" : "replacement");
#endif
errno = 0; errno = 0;
opt.design->current_rule = rules; opt.design->current_rule = rules;
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) { for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
@ -1278,13 +1293,13 @@ static int apply_substitutions(const int mode)
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) { for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
#ifdef REGEXP_DEBUG #ifdef REGEXP_DEBUG
fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ", fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ",
rules[j].prog, rules[j].repstr, u32_strconv_to_locale(input.lines[k].mbtext), rules[j].prog, rules[j].repstr, u32_strconv_to_output(input.lines[k].mbtext),
(int) input.lines[k].num_chars, rules[j].mode); (int) input.lines[k].num_chars, rules[j].mode);
#endif #endif
uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr, uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr,
input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g'); input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g');
#ifdef REGEXP_DEBUG #ifdef REGEXP_DEBUG
fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_locale(newtext) : "NULL"); fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_output(newtext) : "NULL");
#endif #endif
if (newtext == NULL) { if (newtext == NULL) {
return 1; return 1;
@ -1298,7 +1313,7 @@ static int apply_substitutions(const int mode)
#ifdef REGEXP_DEBUG #ifdef REGEXP_DEBUG
fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k, fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k,
(int) input.lines[k].num_chars, u32_strconv_to_locale(input.lines[k].mbtext)); (int) input.lines[k].num_chars, u32_strconv_to_output(input.lines[k].mbtext));
#endif #endif
} }
opt.design->current_rule = NULL; opt.design->current_rule = NULL;
@ -1393,7 +1408,7 @@ static int read_all_input(const int use_stdin)
input.lines = tmp; input.lines = tmp;
} }
mbtemp = u32_strconv_from_locale(buf); mbtemp = u32_strconv_from_input(buf);
len_chars = u32_strlen(mbtemp); len_chars = u32_strlen(mbtemp);
input.final_newline = has_linebreak(mbtemp, len_chars); input.final_newline = has_linebreak(mbtemp, len_chars);
input.lines[input.anz_lines].posmap = NULL; input.lines[input.anz_lines].posmap = NULL;
@ -1484,7 +1499,7 @@ static int read_all_input(const int use_stdin)
for (i = 0; i < input.anz_lines; ++i) { for (i = 0; i < input.anz_lines; ++i) {
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i, fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars); u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
#endif #endif
if (input.lines[i].num_chars >= input.indent) { if (input.lines[i].num_chars >= input.indent) {
memmove(input.lines[i].text, input.lines[i].text + input.indent, memmove(input.lines[i].text, input.lines[i].text + input.indent,
@ -1496,7 +1511,7 @@ static int read_all_input(const int use_stdin)
} }
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i, fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars); u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
#endif #endif
} }
input.maxline -= input.indent; input.maxline -= input.indent;
@ -1557,7 +1572,7 @@ int main(int argc, char *argv[])
* Store system character encoding * Store system character encoding
*/ */
setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */ setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */
encoding = locale_charset(); encoding = check_encoding(opt.encoding, locale_charset());
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Character Encoding = %s\n", encoding); fprintf (stderr, "Character Encoding = %s\n", encoding);
#endif #endif

View File

@ -139,6 +139,7 @@ typedef struct { /* Command line options: */
char indentmode; /* 'b', 't', 'n', or '\0' */ char indentmode; /* 'b', 't', 'n', or '\0' */
char justify; /* 'l', 'c', 'r', or '\0' */ char justify; /* 'l', 'c', 'r', or '\0' */
int killblank; /* -1 if not set */ int killblank; /* -1 if not set */
char *encoding; /* character encoding override for input and output text */
FILE *infile; /* where we get our input */ FILE *infile; /* where we get our input */
FILE *outfile; /* where we put our output */ FILE *outfile; /* where we put our output */
} opt_t; } opt_t;

View File

@ -28,7 +28,6 @@
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <uniconv.h>
#include <unistr.h> #include <unistr.h>
#include "shape.h" #include "shape.h"
@ -729,7 +728,7 @@ static int justify_line(line_t *line, int skew)
#if defined(DEBUG) || 0 #if defined(DEBUG) || 0
fprintf (stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d", fprintf (stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d",
opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_locale(line->mbtext), opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_output(line->mbtext),
(int) line->len, line->text, (int) line->invis, skew); (int) line->len, line->text, (int) line->invis, skew);
#endif #endif
@ -998,7 +997,7 @@ int output_box(const sentry_t *thebox)
concat_strings(obuf, LINE_MAX_BYTES + 1, 8, restored_indent, concat_strings(obuf, LINE_MAX_BYTES + 1, 8, restored_indent,
skip_left ? "" : thebox[BLEF].chars[j], hfill1, skip_left ? "" : thebox[BLEF].chars[j], hfill1,
ti >= 0 && shift > 0 ? nspaces(shift) : "", ti >= 0 && shift > 0 ? nspaces(shift) : "",
ti >= 0 ? u32_strconv_to_locale(mbtext_shifted) : "", ti >= 0 ? u32_strconv_to_output(mbtext_shifted) : "",
hfill2, nspaces(input.maxline - input.lines[ti].len - shift), hfill2, nspaces(input.maxline - input.lines[ti].len - shift),
thebox[BRIG].chars[j]); thebox[BRIG].chars[j]);
} }

View File

@ -26,9 +26,10 @@
#include <errno.h> #include <errno.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <uniconv.h> #include <string.h>
#include "tools.h" #include "tools.h"
#include "unicode.h"
#include "regulex.h" #include "regulex.h"
@ -37,7 +38,10 @@ pcre2_code *compile_pattern(char *pattern)
{ {
int errornumber; int errornumber;
PCRE2_SIZE erroroffset; PCRE2_SIZE erroroffset;
PCRE2_SPTR pattern32 = u32_strconv_from_locale(pattern); PCRE2_SPTR pattern32 = u32_strconv_from_arg(pattern, config_encoding);
if (pattern32 == NULL) {
return NULL;
}
pcre2_code *re = pcre2_compile( pcre2_code *re = pcre2_compile(
pattern32, /* the pattern */ pattern32, /* the pattern */
@ -51,7 +55,7 @@ pcre2_code *compile_pattern(char *pattern)
PCRE2_UCHAR buffer[256]; PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n", fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n",
pattern, (int) erroroffset, u32_strconv_to_locale(buffer)); pattern, (int) erroroffset, u32_strconv_to_output(buffer));
} }
return re; return re;
} }
@ -60,7 +64,10 @@ pcre2_code *compile_pattern(char *pattern)
uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global) uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global)
{ {
PCRE2_SPTR replacement = u32_strconv_from_locale(replace); PCRE2_SPTR replacement = u32_strconv_from_arg(replace, config_encoding);
if (replacement == NULL) {
return NULL;
}
uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED
| (global ? PCRE2_SUBSTITUTE_GLOBAL : 0); | (global ? PCRE2_SUBSTITUTE_GLOBAL : 0);
PCRE2_SIZE outlen = input_len * 2; /* estimated length of output buffer in characters, fine if too small */ PCRE2_SIZE outlen = input_len * 2; /* estimated length of output buffer in characters, fine if too small */
@ -101,7 +108,7 @@ uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, cons
PCRE2_UCHAR buffer[256]; PCRE2_UCHAR buffer[256];
pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer)); pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer));
/* buffer will normally contain "invalid replacement string" */ /* buffer will normally contain "invalid replacement string" */
fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_locale(buffer)); fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_output(buffer));
BFREE(output); BFREE(output);
return NULL; return NULL;
} }

View File

@ -26,7 +26,6 @@
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <uniconv.h>
#include <unistr.h> #include <unistr.h>
#include "shape.h" #include "shape.h"
@ -1098,7 +1097,7 @@ void output_input(const int trim_only)
indent = 0; indent = 0;
} }
fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_locale(advance32(input.lines[j].mbtext, indent)), fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_output(advance32(input.lines[j].mbtext, indent)),
(input.final_newline || j < input.anz_lines - 1 ? "\n" : "")); (input.final_newline || j < input.anz_lines - 1 ? "\n" : ""));
BFREE (indentspc); BFREE (indentspc);
} }

View File

@ -30,7 +30,6 @@
#include <string.h> #include <string.h>
#include <strings.h> #include <strings.h>
#include <uniconv.h>
#include <unictype.h> #include <unictype.h>
#include <unistr.h> #include <unistr.h>
#include <unitypes.h> #include <unitypes.h>
@ -514,7 +513,7 @@ void print_input_lines(const char *heading)
fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n"); fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
for (size_t i = 0; i < input.anz_lines; ++i) { for (size_t i = 0; i < input.anz_lines; ++i) {
fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i, fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i,
(int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars, u32_strconv_to_output(input.lines[i].mbtext),
(int) input.lines[i].len, input.lines[i].text); (int) input.lines[i].len, input.lines[i].text);
fprintf(stderr, "\tTabs: ["); fprintf(stderr, "\tTabs: [");
if (input.lines[i].tabpos != NULL) { if (input.lines[i].tabpos != NULL) {

View File

@ -24,22 +24,42 @@
#include "config.h" #include "config.h"
#include <errno.h> #include <errno.h>
#include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <uniconv.h>
#include <unictype.h> #include <unictype.h>
#include <unistr.h> #include <unistr.h>
#include "boxes.h"
#include "tools.h"
#include "unicode.h" #include "unicode.h"
const char *encoding; /* the character encoding that we use */
const ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */ const char *config_encoding = "ISO_8859-15";
const ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */
const ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */ /* effective character encoding of input and output text */
const ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */ const char *encoding;
const ucs4_t char_esc = 0x0000001b; /* ucs4_t character 0x1b (escape) */
const ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */ /* ucs4_t character '\t' (tab) */
const ucs4_t char_tab = 0x00000009;
/* ucs4_t character ' ' (space) */
const ucs4_t char_space = 0x00000020;
/* ucs4_t character '\r' (carriage return) */
const ucs4_t char_cr = 0x0000000d;
/* ucs4_t character '\n' (newline) */
const ucs4_t char_newline = 0x0000000a;
/* ucs4_t character 0x1b (escape) */
const ucs4_t char_esc = 0x0000001b;
/* ucs4_t character '\0' (zero) */
const ucs4_t char_nul = 0x00000000;
@ -176,4 +196,78 @@ uint32_t *advance32(uint32_t *s, const size_t offset)
} }
/*
 * Convert a zero-terminated string to UTF-32, treating it as text in the
 * global `encoding`. All of the work is delegated to u32_strconv_from_arg(),
 * whose result is handed back unchanged.
 */
uint32_t *u32_strconv_from_input(const char *src)
{
    uint32_t *converted = u32_strconv_from_arg(src, encoding);
    return converted;
}
/*
 * Convert a zero-terminated string from the given source encoding to UTF-32.
 *
 * src             string to convert; NULL yields NULL
 * sourceEncoding  name of the character encoding that src is written in
 *
 * Returns the converted string, or NULL if src was NULL or the conversion
 * failed. A conversion failure is reported on stderr before returning.
 */
uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding)
{
    uint32_t *converted = NULL;

    if (src != NULL) {
        if (src[0] == '\0') {
            /* nothing to convert - hand out an empty UTF-32 string directly */
            converted = new_empty_string32();
        }
        else {
            /* unconvertible characters each become one question mark '?' */
            converted = u32_strconv_from_encoding(src, sourceEncoding, iconveh_question_mark);
            if (converted == NULL) {
                fprintf(stderr, "%s: failed to convert from '%s' to UTF-32: %s\n",
                        PROJECT, sourceEncoding, strerror(errno));
            }
        }
    }
    return converted;
}
/*
 * Convert a zero-terminated UTF-32 string to the global `encoding`.
 * All of the work is delegated to u32_strconv_to_arg(), whose result is
 * handed back unchanged.
 */
char *u32_strconv_to_output(const uint32_t *src)
{
    char *converted = u32_strconv_to_arg(src, encoding);
    return converted;
}
/*
 * Convert a zero-terminated UTF-32 string to the given target encoding.
 *
 * src             UTF-32 string to convert; NULL yields NULL
 * targetEncoding  name of the character encoding of the result
 *
 * Returns the converted string, or NULL if src was NULL or the conversion
 * failed. A conversion failure is reported on stderr before returning.
 */
char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding)
{
    char *converted = NULL;

    if (src != NULL) {
        if (is_empty(src)) {
            /* nothing to convert - return a fresh empty string */
            converted = strdup("");
        }
        else {
            /* unconvertible characters each become one question mark '?' */
            converted = u32_strconv_to_encoding(src, targetEncoding, iconveh_question_mark);
            if (converted == NULL) {
                fprintf(stderr, "%s: failed to convert from UTF-32 to '%s': %s\n",
                        PROJECT, targetEncoding, strerror(errno));
            }
        }
    }
    return converted;
}
/*
 * Decide which character encoding to use.
 *
 * manual_encoding  encoding name requested by the user, may be NULL
 * system_encoding  encoding to use when no usable manual encoding is given
 *
 * The manual encoding is probed by converting a single blank with it. If the
 * probe succeeds, manual_encoding is returned. If the probe fails, a message
 * is printed on stderr and system_encoding is returned. A NULL
 * manual_encoding silently yields system_encoding. The returned pointer is
 * always one of the two arguments.
 */
const char *check_encoding(const char *manual_encoding, const char *system_encoding)
{
    if (manual_encoding == NULL) {
        return system_encoding;
    }

    uint32_t *probe = u32_strconv_from_encoding(" ", manual_encoding, iconveh_error);
    if (probe == NULL) {
        fprintf(stderr, "%s: Invalid character encoding: %s - falling back to %s\n",
                PROJECT, manual_encoding, system_encoding);
        return system_encoding;
    }

    /* the probe result itself is not needed, only the fact that it worked */
    BFREE(probe);
    return manual_encoding;
}
/*EOF*/ /* vim: set sw=4: */ /*EOF*/ /* vim: set sw=4: */

View File

@ -28,16 +28,31 @@
#include <unitypes.h> #include <unitypes.h>
extern const char *encoding; /* the character encoding that we use */
extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */ /** The boxes config file is still encoded with a single-byte character set. Officially, it is ASCII!
extern const ucs4_t char_space; /* ucs4_t character ' ' (space) */ * However, people might not conform to this, so we use ISO_8859-15 as a reasonable superset. */
extern const ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */ extern const char *config_encoding;
extern const ucs4_t char_newline; /* ucs4_t character '\n' (newline) */
extern const ucs4_t char_esc; /* ucs4_t character 0x1b (escape) */
extern const ucs4_t char_nul; /* ucs4_t character '\0' (zero) */
/** the character encoding of input (and output) text */
extern const char *encoding;
/** ucs4_t character '\t' (tab) */
extern const ucs4_t char_tab;
/** ucs4_t character ' ' (space) */
extern const ucs4_t char_space;
/** ucs4_t character '\r' (carriage return) */
extern const ucs4_t char_cr;
/** ucs4_t character '\n' (newline) */
extern const ucs4_t char_newline;
/** ucs4_t character 0x1b (escape) */
extern const ucs4_t char_esc;
/** ucs4_t character '\0' (zero) */
extern const ucs4_t char_nul;
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char); int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);
@ -75,6 +90,55 @@ uint32_t *advance_next32(const uint32_t *s, size_t *invis);
*/ */
uint32_t *advance32(uint32_t *s, const size_t offset); uint32_t *advance32(uint32_t *s, const size_t offset);
/**
* Convert a string from the input/output encoding (`encoding` in this .h file) to UTF-32 internal representation.
* Memory will be allocated for the converted string.
*
* @param <src> string to convert, zero-terminated
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
*/
uint32_t *u32_strconv_from_input(const char *src);
/**
* Convert a string from the given source encoding to UTF-32 internal representation.
* Memory will be allocated for the converted string.
*
* @param <src> string to convert, zero-terminated
* @param <sourceEncoding> the character encoding of <src>
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
*/
uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding);
/**
* Convert a string from UTF-32 internal representation to input/output encoding (`encoding` in this .h file).
* Memory will be allocated for the converted string.
*
* @param <src> UTF-32 string to convert, zero-terminated
* @return string in input/output encoding, or NULL on error (then an error message was already printed on stderr)
*/
char *u32_strconv_to_output(const uint32_t *src);
/**
* Convert a string from UTF-32 internal representation to the given target encoding.
* Memory will be allocated for the converted string.
*
* @param <src> UTF-32 string to convert, zero-terminated
* @param <targetEncoding> the character encoding of the result
* @return string in target encoding, or NULL in case of error (then an error message was already printed on stderr)
*/
char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding);
/**
* Check if the given <manual_encoding> can be used to convert anything. This should reveal invalid encoding names that
* have been specified on the command line. If no <manual_encoding> was specified, or if an invalid encoding is
* detected, we fall back to the system encoding. No new memory is allocated.
*
* @param <manual_encoding> the encoding set on the command line, may be NULL
* @param <system_encoding> the system encoding
* @return <manual_encoding> if it is set to a valid value, <system_encoding> otherwise
*/
const char *check_encoding(const char *manual_encoding, const char *system_encoding);
#endif #endif

View File

@ -0,0 +1,24 @@
:ARGS
-ac -n ISO_8859-15
:INPUT
ä
äb
äbç
äbçd
äbçdé
äbçdéf
äbçdéfg
äbçdéfgh
:OUTPUT-FILTER
:EXPECTED
/**************/
/* ä */
/* äb */
/* äbç */
/* äbçd */
/* äbçdé */
/* äbçdéf */
/* äbçdéfg */
/* äbçdéfgh */
/**************/
:EOF