mirror of
https://github.com/ascii-boxes/boxes.git
synced 2025-06-20 17:58:56 +02:00
Add command line option -n to set character encoding of input text #1
This commit is contained in:
parent
4c656727ec
commit
44c2c526af
13
.gitattributes
vendored
Normal file
13
.gitattributes
vendored
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
* text=auto
|
||||||
|
|
||||||
|
# shell scripts
|
||||||
|
*.sh text eol=lf
|
||||||
|
|
||||||
|
# Windows batch files
|
||||||
|
*.bat text eol=crlf
|
||||||
|
|
||||||
|
# the test cases
|
||||||
|
/test/*.txt text eol=lf
|
||||||
|
|
||||||
|
# special test case for testing ISO encoding
|
||||||
|
/test/111_manual_encoding_iso.txt text working-tree-encoding=ISO_8859-15
|
@ -10,7 +10,7 @@ boxes \- text mode box and comment drawing filter
|
|||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
.B boxes
|
.B boxes
|
||||||
[\-hlmrv] [\-a\ format] [\-d\ design] [\-f\ file] [\-i\ indent] [\-k\ bool]
|
[\-hlmrv] [\-a\ format] [\-d\ design] [\-f\ file] [\-i\ indent] [\-k\ bool]
|
||||||
[\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]]
|
[\-n\ encoding] [\-p\ pad] [\-s\ size] [\-t\ tabopts] [infile [outfile]]
|
||||||
.SH DESCRIPTION
|
.SH DESCRIPTION
|
||||||
.I Boxes
|
.I Boxes
|
||||||
is a text filter which can draw any kind of box around its input text. Box
|
is a text filter which can draw any kind of box around its input text. Box
|
||||||
@ -185,6 +185,14 @@ padding, indentation, etc. for the mended box. Implies
|
|||||||
false.
|
false.
|
||||||
.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||||
.TP 0.6i
|
.TP 0.6i
|
||||||
|
.B \-n \fIencoding\fP
|
||||||
|
Character encoding. Overrides the character encoding of the input and output
|
||||||
|
text. Choose from the list shown by \fIiconv \-l\fP. If an invalid character
|
||||||
|
encoding is specified here, the system encoding is used as a fallback. The default
|
||||||
|
is to use the system encoding, which is normally the best course of action.
|
||||||
|
So don't specify this option unless you have to.
|
||||||
|
.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||||
|
.TP 0.6i
|
||||||
.B \-p \fIstring\fP
|
.B \-p \fIstring\fP
|
||||||
Padding. Specify padding in spaces around the input text block for all
|
Padding. Specify padding in spaces around the input text block for all
|
||||||
sides of the box. The argument string may not contain whitespace and must
|
sides of the box. The argument string may not contain whitespace and must
|
||||||
@ -357,4 +365,5 @@ configuration file (takes precedence over system-wide configuration file)
|
|||||||
system\-wide configuration file
|
system\-wide configuration file
|
||||||
.\" =======================================================================
|
.\" =======================================================================
|
||||||
.SH "SEE ALSO"
|
.SH "SEE ALSO"
|
||||||
.I figlet(6)
|
.BR figlet (6),
|
||||||
|
.BR iconv (1)
|
||||||
|
@ -79,7 +79,7 @@ lex.yy.c: lexer.l boxes.h
|
|||||||
cat lexer.tmp.c >> lex.yy.c
|
cat lexer.tmp.c >> lex.yy.c
|
||||||
rm lexer.tmp.c
|
rm lexer.tmp.c
|
||||||
|
|
||||||
|
# TODO In the end, check declared dependencies again
|
||||||
boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h
|
boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h
|
||||||
tools.o: tools.c tools.h boxes.h shape.h config.h
|
tools.o: tools.c tools.h boxes.h shape.h config.h
|
||||||
unicode.o: unicode.c unicode.h config.h
|
unicode.o: unicode.c unicode.h config.h
|
||||||
|
33
src/boxes.c
33
src/boxes.c
@ -29,7 +29,6 @@
|
|||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
|
||||||
#include <uniconv.h>
|
|
||||||
#include <unictype.h>
|
#include <unictype.h>
|
||||||
#include <unistdio.h>
|
#include <unistdio.h>
|
||||||
#include <unistr.h>
|
#include <unistr.h>
|
||||||
@ -107,6 +106,7 @@ static void usage(FILE *st)
|
|||||||
fprintf(st, " -k bool leading/trailing blank line retention on removal\n");
|
fprintf(st, " -k bool leading/trailing blank line retention on removal\n");
|
||||||
fprintf(st, " -l list available box designs w/ samples\n");
|
fprintf(st, " -l list available box designs w/ samples\n");
|
||||||
fprintf(st, " -m mend box, i.e. remove it and redraw it afterwards\n");
|
fprintf(st, " -m mend box, i.e. remove it and redraw it afterwards\n");
|
||||||
|
fprintf(st, " -n enc Character encoding of input and output\n");
|
||||||
fprintf(st, " -p fmt padding [default: none]\n");
|
fprintf(st, " -p fmt padding [default: none]\n");
|
||||||
/* fprintf(st, " -q modify command for needs of the web UI (undocumented)\n"); */
|
/* fprintf(st, " -q modify command for needs of the web UI (undocumented)\n"); */
|
||||||
fprintf(st, " -r remove box\n");
|
fprintf(st, " -r remove box\n");
|
||||||
@ -370,6 +370,7 @@ static int process_commandline(int argc, char *argv[])
|
|||||||
opt.tabstop = DEF_TABSTOP;
|
opt.tabstop = DEF_TABSTOP;
|
||||||
opt.tabexp = 'e';
|
opt.tabexp = 'e';
|
||||||
opt.killblank = -1;
|
opt.killblank = -1;
|
||||||
|
opt.encoding = NULL;
|
||||||
for (idummy = 0; idummy < ANZ_SIDES; ++idummy) {
|
for (idummy = 0; idummy < ANZ_SIDES; ++idummy) {
|
||||||
opt.padding[idummy] = -1;
|
opt.padding[idummy] = -1;
|
||||||
}
|
}
|
||||||
@ -388,7 +389,7 @@ static int process_commandline(int argc, char *argv[])
|
|||||||
* Parse Command Line
|
* Parse Command Line
|
||||||
*/
|
*/
|
||||||
do {
|
do {
|
||||||
oc = getopt(argc, argv, "a:c:d:f:hi:k:lmp:qrs:t:v");
|
oc = getopt(argc, argv, "a:c:d:f:hi:k:lmn:p:qrs:t:v");
|
||||||
|
|
||||||
switch (oc) {
|
switch (oc) {
|
||||||
|
|
||||||
@ -580,6 +581,17 @@ static int process_commandline(int argc, char *argv[])
|
|||||||
opt.killblank = 0;
|
opt.killblank = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 'n':
|
||||||
|
/*
|
||||||
|
* Character encoding
|
||||||
|
*/
|
||||||
|
opt.encoding = (char *) strdup(optarg);
|
||||||
|
if (opt.encoding == NULL) {
|
||||||
|
perror(PROJECT);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
case 'p':
|
case 'p':
|
||||||
/*
|
/*
|
||||||
* Padding. format is ([ahvtrbl]n)+
|
* Padding. format is ([ahvtrbl]n)+
|
||||||
@ -1257,6 +1269,9 @@ static int apply_substitutions(const int mode)
|
|||||||
/*
|
/*
|
||||||
* Compile regular expressions
|
* Compile regular expressions
|
||||||
*/
|
*/
|
||||||
|
#ifdef REGEXP_DEBUG
|
||||||
|
fprintf(stderr, "Compiling %d %s rule patterns\n", (int) anz_rules, mode ? "reversion" : "replacement");
|
||||||
|
#endif
|
||||||
errno = 0;
|
errno = 0;
|
||||||
opt.design->current_rule = rules;
|
opt.design->current_rule = rules;
|
||||||
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
|
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
|
||||||
@ -1278,13 +1293,13 @@ static int apply_substitutions(const int mode)
|
|||||||
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
|
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
|
||||||
#ifdef REGEXP_DEBUG
|
#ifdef REGEXP_DEBUG
|
||||||
fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ",
|
fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ",
|
||||||
rules[j].prog, rules[j].repstr, u32_strconv_to_locale(input.lines[k].mbtext),
|
rules[j].prog, rules[j].repstr, u32_strconv_to_output(input.lines[k].mbtext),
|
||||||
(int) input.lines[k].num_chars, rules[j].mode);
|
(int) input.lines[k].num_chars, rules[j].mode);
|
||||||
#endif
|
#endif
|
||||||
uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr,
|
uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr,
|
||||||
input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g');
|
input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g');
|
||||||
#ifdef REGEXP_DEBUG
|
#ifdef REGEXP_DEBUG
|
||||||
fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_locale(newtext) : "NULL");
|
fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_output(newtext) : "NULL");
|
||||||
#endif
|
#endif
|
||||||
if (newtext == NULL) {
|
if (newtext == NULL) {
|
||||||
return 1;
|
return 1;
|
||||||
@ -1298,7 +1313,7 @@ static int apply_substitutions(const int mode)
|
|||||||
|
|
||||||
#ifdef REGEXP_DEBUG
|
#ifdef REGEXP_DEBUG
|
||||||
fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k,
|
fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k,
|
||||||
(int) input.lines[k].num_chars, u32_strconv_to_locale(input.lines[k].mbtext));
|
(int) input.lines[k].num_chars, u32_strconv_to_output(input.lines[k].mbtext));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
opt.design->current_rule = NULL;
|
opt.design->current_rule = NULL;
|
||||||
@ -1393,7 +1408,7 @@ static int read_all_input(const int use_stdin)
|
|||||||
input.lines = tmp;
|
input.lines = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
mbtemp = u32_strconv_from_locale(buf);
|
mbtemp = u32_strconv_from_input(buf);
|
||||||
len_chars = u32_strlen(mbtemp);
|
len_chars = u32_strlen(mbtemp);
|
||||||
input.final_newline = has_linebreak(mbtemp, len_chars);
|
input.final_newline = has_linebreak(mbtemp, len_chars);
|
||||||
input.lines[input.anz_lines].posmap = NULL;
|
input.lines[input.anz_lines].posmap = NULL;
|
||||||
@ -1484,7 +1499,7 @@ static int read_all_input(const int use_stdin)
|
|||||||
for (i = 0; i < input.anz_lines; ++i) {
|
for (i = 0; i < input.anz_lines; ++i) {
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
|
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
|
||||||
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
|
u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
|
||||||
#endif
|
#endif
|
||||||
if (input.lines[i].num_chars >= input.indent) {
|
if (input.lines[i].num_chars >= input.indent) {
|
||||||
memmove(input.lines[i].text, input.lines[i].text + input.indent,
|
memmove(input.lines[i].text, input.lines[i].text + input.indent,
|
||||||
@ -1496,7 +1511,7 @@ static int read_all_input(const int use_stdin)
|
|||||||
}
|
}
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
|
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
|
||||||
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
|
u32_strconv_to_output(input.lines[i].mbtext), (int) input.lines[i].num_chars);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
input.maxline -= input.indent;
|
input.maxline -= input.indent;
|
||||||
@ -1557,7 +1572,7 @@ int main(int argc, char *argv[])
|
|||||||
* Store system character encoding
|
* Store system character encoding
|
||||||
*/
|
*/
|
||||||
setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */
|
setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */
|
||||||
encoding = locale_charset();
|
encoding = check_encoding(opt.encoding, locale_charset());
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf (stderr, "Character Encoding = %s\n", encoding);
|
fprintf (stderr, "Character Encoding = %s\n", encoding);
|
||||||
#endif
|
#endif
|
||||||
|
@ -139,6 +139,7 @@ typedef struct { /* Command line options: */
|
|||||||
char indentmode; /* 'b', 't', 'n', or '\0' */
|
char indentmode; /* 'b', 't', 'n', or '\0' */
|
||||||
char justify; /* 'l', 'c', 'r', or '\0' */
|
char justify; /* 'l', 'c', 'r', or '\0' */
|
||||||
int killblank; /* -1 if not set */
|
int killblank; /* -1 if not set */
|
||||||
|
char *encoding; /* character encoding override for input and output text */
|
||||||
FILE *infile; /* where we get our input */
|
FILE *infile; /* where we get our input */
|
||||||
FILE *outfile; /* where we put our output */
|
FILE *outfile; /* where we put our output */
|
||||||
} opt_t;
|
} opt_t;
|
||||||
|
@ -28,7 +28,6 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include <uniconv.h>
|
|
||||||
#include <unistr.h>
|
#include <unistr.h>
|
||||||
|
|
||||||
#include "shape.h"
|
#include "shape.h"
|
||||||
@ -729,7 +728,7 @@ static int justify_line(line_t *line, int skew)
|
|||||||
|
|
||||||
#if defined(DEBUG) || 0
|
#if defined(DEBUG) || 0
|
||||||
fprintf (stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d",
|
fprintf (stderr, "justify_line(%c): Input: real: (%02d) \"%s\", text: (%02d) \"%s\", invisible=%d, skew=%d",
|
||||||
opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_locale(line->mbtext),
|
opt.justify ? opt.justify : '0', (int) line->num_chars, u32_strconv_to_output(line->mbtext),
|
||||||
(int) line->len, line->text, (int) line->invis, skew);
|
(int) line->len, line->text, (int) line->invis, skew);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -998,7 +997,7 @@ int output_box(const sentry_t *thebox)
|
|||||||
concat_strings(obuf, LINE_MAX_BYTES + 1, 8, restored_indent,
|
concat_strings(obuf, LINE_MAX_BYTES + 1, 8, restored_indent,
|
||||||
skip_left ? "" : thebox[BLEF].chars[j], hfill1,
|
skip_left ? "" : thebox[BLEF].chars[j], hfill1,
|
||||||
ti >= 0 && shift > 0 ? nspaces(shift) : "",
|
ti >= 0 && shift > 0 ? nspaces(shift) : "",
|
||||||
ti >= 0 ? u32_strconv_to_locale(mbtext_shifted) : "",
|
ti >= 0 ? u32_strconv_to_output(mbtext_shifted) : "",
|
||||||
hfill2, nspaces(input.maxline - input.lines[ti].len - shift),
|
hfill2, nspaces(input.maxline - input.lines[ti].len - shift),
|
||||||
thebox[BRIG].chars[j]);
|
thebox[BRIG].chars[j]);
|
||||||
}
|
}
|
||||||
|
@ -26,9 +26,10 @@
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <uniconv.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "tools.h"
|
#include "tools.h"
|
||||||
|
#include "unicode.h"
|
||||||
#include "regulex.h"
|
#include "regulex.h"
|
||||||
|
|
||||||
|
|
||||||
@ -37,7 +38,10 @@ pcre2_code *compile_pattern(char *pattern)
|
|||||||
{
|
{
|
||||||
int errornumber;
|
int errornumber;
|
||||||
PCRE2_SIZE erroroffset;
|
PCRE2_SIZE erroroffset;
|
||||||
PCRE2_SPTR pattern32 = u32_strconv_from_locale(pattern);
|
PCRE2_SPTR pattern32 = u32_strconv_from_arg(pattern, config_encoding);
|
||||||
|
if (pattern32 == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
pcre2_code *re = pcre2_compile(
|
pcre2_code *re = pcre2_compile(
|
||||||
pattern32, /* the pattern */
|
pattern32, /* the pattern */
|
||||||
@ -51,7 +55,7 @@ pcre2_code *compile_pattern(char *pattern)
|
|||||||
PCRE2_UCHAR buffer[256];
|
PCRE2_UCHAR buffer[256];
|
||||||
pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
|
pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
|
||||||
fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n",
|
fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n",
|
||||||
pattern, (int) erroroffset, u32_strconv_to_locale(buffer));
|
pattern, (int) erroroffset, u32_strconv_to_output(buffer));
|
||||||
}
|
}
|
||||||
return re;
|
return re;
|
||||||
}
|
}
|
||||||
@ -60,7 +64,10 @@ pcre2_code *compile_pattern(char *pattern)
|
|||||||
|
|
||||||
uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global)
|
uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global)
|
||||||
{
|
{
|
||||||
PCRE2_SPTR replacement = u32_strconv_from_locale(replace);
|
PCRE2_SPTR replacement = u32_strconv_from_arg(replace, config_encoding);
|
||||||
|
if (replacement == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED
|
uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED
|
||||||
| (global ? PCRE2_SUBSTITUTE_GLOBAL : 0);
|
| (global ? PCRE2_SUBSTITUTE_GLOBAL : 0);
|
||||||
PCRE2_SIZE outlen = input_len * 2; /* estimated length of output buffer in characters, fine if too small */
|
PCRE2_SIZE outlen = input_len * 2; /* estimated length of output buffer in characters, fine if too small */
|
||||||
@ -101,7 +108,7 @@ uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, cons
|
|||||||
PCRE2_UCHAR buffer[256];
|
PCRE2_UCHAR buffer[256];
|
||||||
pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer));
|
pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer));
|
||||||
/* buffer will normally contain "invalid replacement string" */
|
/* buffer will normally contain "invalid replacement string" */
|
||||||
fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_locale(buffer));
|
fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_output(buffer));
|
||||||
BFREE(output);
|
BFREE(output);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,6 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <uniconv.h>
|
|
||||||
#include <unistr.h>
|
#include <unistr.h>
|
||||||
|
|
||||||
#include "shape.h"
|
#include "shape.h"
|
||||||
@ -1098,7 +1097,7 @@ void output_input(const int trim_only)
|
|||||||
indent = 0;
|
indent = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_locale(advance32(input.lines[j].mbtext, indent)),
|
fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_output(advance32(input.lines[j].mbtext, indent)),
|
||||||
(input.final_newline || j < input.anz_lines - 1 ? "\n" : ""));
|
(input.final_newline || j < input.anz_lines - 1 ? "\n" : ""));
|
||||||
BFREE (indentspc);
|
BFREE (indentspc);
|
||||||
}
|
}
|
||||||
|
@ -30,7 +30,6 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <strings.h>
|
#include <strings.h>
|
||||||
|
|
||||||
#include <uniconv.h>
|
|
||||||
#include <unictype.h>
|
#include <unictype.h>
|
||||||
#include <unistr.h>
|
#include <unistr.h>
|
||||||
#include <unitypes.h>
|
#include <unitypes.h>
|
||||||
@ -514,7 +513,7 @@ void print_input_lines(const char *heading)
|
|||||||
fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
|
fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
|
||||||
for (size_t i = 0; i < input.anz_lines; ++i) {
|
for (size_t i = 0; i < input.anz_lines; ++i) {
|
||||||
fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i,
|
fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i,
|
||||||
(int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext),
|
(int) input.lines[i].num_chars, u32_strconv_to_output(input.lines[i].mbtext),
|
||||||
(int) input.lines[i].len, input.lines[i].text);
|
(int) input.lines[i].len, input.lines[i].text);
|
||||||
fprintf(stderr, "\tTabs: [");
|
fprintf(stderr, "\tTabs: [");
|
||||||
if (input.lines[i].tabpos != NULL) {
|
if (input.lines[i].tabpos != NULL) {
|
||||||
|
108
src/unicode.c
108
src/unicode.c
@ -24,22 +24,42 @@
|
|||||||
|
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <uniconv.h>
|
||||||
#include <unictype.h>
|
#include <unictype.h>
|
||||||
#include <unistr.h>
|
#include <unistr.h>
|
||||||
|
|
||||||
|
#include "boxes.h"
|
||||||
|
#include "tools.h"
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
|
|
||||||
|
|
||||||
const char *encoding; /* the character encoding that we use */
|
|
||||||
|
|
||||||
const ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */
|
const char *config_encoding = "ISO_8859-15";
|
||||||
const ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */
|
|
||||||
const ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */
|
/* effective character encoding of input and output text */
|
||||||
const ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */
|
const char *encoding;
|
||||||
const ucs4_t char_esc = 0x0000001b; /* ucs4_t character 0x1b (escape) */
|
|
||||||
const ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */
|
/* ucs4_t character '\t' (tab) */
|
||||||
|
const ucs4_t char_tab = 0x00000009;
|
||||||
|
|
||||||
|
/* ucs4_t character ' ' (space) */
|
||||||
|
const ucs4_t char_space = 0x00000020;
|
||||||
|
|
||||||
|
/* ucs4_t character '\r' (carriage return) */
|
||||||
|
const ucs4_t char_cr = 0x0000000d;
|
||||||
|
|
||||||
|
/* ucs4_t character '\n' (newline) */
|
||||||
|
const ucs4_t char_newline = 0x0000000a;
|
||||||
|
|
||||||
|
/* ucs4_t character 0x1b (escape) */
|
||||||
|
const ucs4_t char_esc = 0x0000001b;
|
||||||
|
|
||||||
|
/* ucs4_t character '\0' (zero) */
|
||||||
|
const ucs4_t char_nul = 0x00000000;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -176,4 +196,78 @@ uint32_t *advance32(uint32_t *s, const size_t offset)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
uint32_t *u32_strconv_from_input(const char *src)
|
||||||
|
{
|
||||||
|
return u32_strconv_from_arg(src, encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding)
|
||||||
|
{
|
||||||
|
if (src == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (src[0] == '\0') {
|
||||||
|
return new_empty_string32();
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t *result = u32_strconv_from_encoding(
|
||||||
|
src, /* the source string to convert */
|
||||||
|
sourceEncoding, /* the character encoding from which to convert */
|
||||||
|
iconveh_question_mark); /* produce one question mark '?' per unconvertible character */
|
||||||
|
|
||||||
|
if (result == NULL) {
|
||||||
|
fprintf(stderr, "%s: failed to convert from '%s' to UTF-32: %s\n", PROJECT, sourceEncoding, strerror(errno));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
char *u32_strconv_to_output(const uint32_t *src)
|
||||||
|
{
|
||||||
|
return u32_strconv_to_arg(src, encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding)
|
||||||
|
{
|
||||||
|
if (src == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (is_empty(src)) {
|
||||||
|
return strdup("");
|
||||||
|
}
|
||||||
|
|
||||||
|
char *result = u32_strconv_to_encoding(
|
||||||
|
src, /* the source string to convert */
|
||||||
|
targetEncoding, /* the character encoding to which to convert */
|
||||||
|
iconveh_question_mark); /* produce one question mark '?' per unconvertible character */
|
||||||
|
|
||||||
|
if (result == NULL) {
|
||||||
|
fprintf(stderr, "%s: failed to convert from UTF-32 to '%s': %s\n", PROJECT, targetEncoding, strerror(errno));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
const char *check_encoding(const char *manual_encoding, const char *system_encoding)
|
||||||
|
{
|
||||||
|
if (manual_encoding != NULL) {
|
||||||
|
uint32_t *unicode = u32_strconv_from_encoding(" ", manual_encoding, iconveh_error);
|
||||||
|
if (unicode != NULL) {
|
||||||
|
BFREE(unicode);
|
||||||
|
return manual_encoding;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s: Invalid character encoding: %s - falling back to %s\n",
|
||||||
|
PROJECT, manual_encoding, system_encoding);
|
||||||
|
}
|
||||||
|
return system_encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*EOF*/ /* vim: set sw=4: */
|
/*EOF*/ /* vim: set sw=4: */
|
||||||
|
@ -28,16 +28,31 @@
|
|||||||
#include <unitypes.h>
|
#include <unitypes.h>
|
||||||
|
|
||||||
|
|
||||||
extern const char *encoding; /* the character encoding that we use */
|
|
||||||
|
|
||||||
extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */
|
/** The boxes config file is still encoded with a single-byte character set. Officially, it is ASCII!
|
||||||
extern const ucs4_t char_space; /* ucs4_t character ' ' (space) */
|
* However, people might not conform to this, so we use ISO_8859-15 as a reasonable superset. */
|
||||||
extern const ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */
|
extern const char *config_encoding;
|
||||||
extern const ucs4_t char_newline; /* ucs4_t character '\n' (newline) */
|
|
||||||
extern const ucs4_t char_esc; /* ucs4_t character 0x1b (escape) */
|
|
||||||
extern const ucs4_t char_nul; /* ucs4_t character '\0' (zero) */
|
|
||||||
|
|
||||||
|
/** the character encoding of input (and output) text */
|
||||||
|
extern const char *encoding;
|
||||||
|
|
||||||
|
/** ucs4_t character '\t' (tab) */
|
||||||
|
extern const ucs4_t char_tab;
|
||||||
|
|
||||||
|
/** ucs4_t character ' ' (space) */
|
||||||
|
extern const ucs4_t char_space;
|
||||||
|
|
||||||
|
/** ucs4_t character '\r' (carriage return) */
|
||||||
|
extern const ucs4_t char_cr;
|
||||||
|
|
||||||
|
/** ucs4_t character '\n' (newline) */
|
||||||
|
extern const ucs4_t char_newline;
|
||||||
|
|
||||||
|
/** ucs4_t character 0x1b (escape) */
|
||||||
|
extern const ucs4_t char_esc;
|
||||||
|
|
||||||
|
/** ucs4_t character '\0' (zero) */
|
||||||
|
extern const ucs4_t char_nul;
|
||||||
|
|
||||||
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);
|
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);
|
||||||
|
|
||||||
@ -75,6 +90,55 @@ uint32_t *advance_next32(const uint32_t *s, size_t *invis);
|
|||||||
*/
|
*/
|
||||||
uint32_t *advance32(uint32_t *s, const size_t offset);
|
uint32_t *advance32(uint32_t *s, const size_t offset);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert a string from the input/output encoding (`encoding` in this .h file) to UTF-32 internal representation.
|
||||||
|
* Memory will be allocated for the converted string.
|
||||||
|
*
|
||||||
|
* @param <src> string to convert, zero-terminated
|
||||||
|
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
|
||||||
|
*/
|
||||||
|
uint32_t *u32_strconv_from_input(const char *src);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert a string from the given source encoding to UTF-32 internal representation.
|
||||||
|
* Memory will be allocated for the converted string.
|
||||||
|
*
|
||||||
|
* @param <src> string to convert, zero-terminated
|
||||||
|
* @param <sourceEncoding> the character encoding of <src>
|
||||||
|
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
|
||||||
|
*/
|
||||||
|
uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert a string from UTF-32 internal representation to input/output encoding (`encoding` in this .h file).
|
||||||
|
* Memory will be allocated for the converted string.
|
||||||
|
*
|
||||||
|
* @param <src> UTF-32 string to convert, zero-terminated
|
||||||
|
* @return string in input/output encoding, or NULL on error (then an error message was already printed on stderr)
|
||||||
|
*/
|
||||||
|
char *u32_strconv_to_output(const uint32_t *src);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert a string from UTF-32 internal representation to the given target encoding.
|
||||||
|
* Memory will be allocated for the converted string.
|
||||||
|
*
|
||||||
|
* @param <src> UTF-32 string to convert, zero-terminated
|
||||||
|
* @param <targetEncoding> the character encoding of the result
|
||||||
|
* @return string in target encoding, or NULL in case of error (then an error message was already printed on stderr)
|
||||||
|
*/
|
||||||
|
char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if the given <manual_encoding> can be used to convert anything. This should reveal invalid encoding names that
|
||||||
|
* have been specified on the command line. If no <manual_encoding> was specified, or if an invalid encoding is
|
||||||
|
* detected, we fall back to the system encoding. No new memory is allocated.
|
||||||
|
*
|
||||||
|
* @param <manual_encoding> the encoding set on the command line, may be NULL
|
||||||
|
* @param <system_encoding> the system encoding
|
||||||
|
* @return <manual_encoding> if it is set to a valid value, <system_encoding> otherwise
|
||||||
|
*/
|
||||||
|
const char *check_encoding(const char *manual_encoding, const char *system_encoding);
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
24
test/111_manual_encoding_iso.txt
Normal file
24
test/111_manual_encoding_iso.txt
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
:ARGS
|
||||||
|
-ac -n ISO_8859-15
|
||||||
|
:INPUT
|
||||||
|
ä
|
||||||
|
äb
|
||||||
|
äbç
|
||||||
|
äbçd
|
||||||
|
äbçdé
|
||||||
|
äbçdéf
|
||||||
|
äbçdéfg
|
||||||
|
äbçdéfgh
|
||||||
|
:OUTPUT-FILTER
|
||||||
|
:EXPECTED
|
||||||
|
/**************/
|
||||||
|
/* ä */
|
||||||
|
/* äb */
|
||||||
|
/* äbç */
|
||||||
|
/* äbçd */
|
||||||
|
/* äbçdé */
|
||||||
|
/* äbçdéf */
|
||||||
|
/* äbçdéfg */
|
||||||
|
/* äbçdéfgh */
|
||||||
|
/**************/
|
||||||
|
:EOF
|
Loading…
x
Reference in New Issue
Block a user