mirror of
https://github.com/ascii-boxes/boxes.git
synced 2025-06-10 20:06:46 +02:00
Unicode-enable the reading of input #1
This commit is contained in:
parent
f2ddb6d6df
commit
a6a5898341
@ -26,9 +26,9 @@ GEN_HDR = parser.h boxes.h
|
||||
GEN_SRC = parser.c lex.yy.c
|
||||
GEN_FILES = $(GEN_SRC) $(GEN_HDR)
|
||||
ORIG_HDRCL = boxes.h.in config.h
|
||||
ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h
|
||||
ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h unicode.h
|
||||
ORIG_GEN = lexer.l parser.y
|
||||
ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c
|
||||
ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c unicode.c
|
||||
ORIG_SRC = $(ORIG_GEN) $(ORIG_NORM)
|
||||
ORIG_FILES = $(ORIG_SRC) $(ORIG_HDR)
|
||||
OTH_FILES = Makefile
|
||||
@ -47,7 +47,7 @@ debug: flags_$(BOXES_PLATFORM)
|
||||
|
||||
boxes: $(ALL_OBJ)
|
||||
$(MAKE) -C regexp CC=$(CC) libregexp.a
|
||||
$(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lregexp
|
||||
$(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lunistring -lpcre2-32 -lregexp
|
||||
if [ "$(STRIP)" = "true" ] ; then strip $(BOXES_EXECUTABLE_NAME) ; fi
|
||||
|
||||
boxes.exe: $(ALL_OBJ)
|
||||
@ -81,8 +81,9 @@ lex.yy.c: lexer.l boxes.h
|
||||
rm lexer.tmp.c
|
||||
|
||||
|
||||
boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h generate.h remove.h config.h
|
||||
boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h unicode.h generate.h remove.h config.h
|
||||
tools.o: tools.c tools.h boxes.h shape.h config.h
|
||||
unicode.o: unicode.c unicode.h config.h
|
||||
shape.o: shape.c shape.h boxes.h config.h tools.h
|
||||
generate.o: generate.c generate.h boxes.h shape.h tools.h config.h
|
||||
remove.o: remove.c remove.h boxes.h shape.h tools.h config.h
|
||||
|
243
src/boxes.c
243
src/boxes.c
@ -21,6 +21,7 @@
|
||||
#include "config.h"
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <locale.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@ -29,6 +30,7 @@
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <uniconv.h>
|
||||
#include <unictype.h>
|
||||
#include <unistdio.h>
|
||||
#include <unistr.h>
|
||||
#include <unitypes.h>
|
||||
@ -42,6 +44,7 @@
|
||||
#include "regexp.h"
|
||||
#include "generate.h"
|
||||
#include "remove.h"
|
||||
#include "unicode.h"
|
||||
|
||||
#ifdef __MINGW32__
|
||||
#include <windows.h>
|
||||
@ -92,14 +95,6 @@ int anz_designs = 0; /* no of designs after parsing */
|
||||
|
||||
opt_t opt; /* command line options */
|
||||
|
||||
char *encoding; /* the character encoding that we use */
|
||||
|
||||
ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab) */
|
||||
ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space) */
|
||||
ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return) */
|
||||
ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline) */
|
||||
ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero) */
|
||||
|
||||
input_t input = INPUT_INITIALIZER; /* input lines */
|
||||
|
||||
|
||||
@ -1175,12 +1170,12 @@ static int list_styles()
|
||||
|
||||
|
||||
|
||||
static int get_indent (const line_t *lines, const size_t lanz)
|
||||
static int get_indent (const line_t *lines, const size_t lines_size)
|
||||
/*
|
||||
* Determine indentation of given lines in spaces.
|
||||
*
|
||||
* lines the lines to examine
|
||||
* lanz number of lines to examine
|
||||
* lines_size number of lines to examine
|
||||
*
|
||||
* Lines are assumed to be free of trailing whitespace.
|
||||
*
|
||||
@ -1190,7 +1185,6 @@ static int get_indent (const line_t *lines, const size_t lanz)
|
||||
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||
*/
|
||||
{
|
||||
size_t j;
|
||||
int res = LINE_MAX_BYTES; /* result */
|
||||
int nonblank = 0; /* true if one non-blank line found */
|
||||
|
||||
@ -1198,24 +1192,26 @@ static int get_indent (const line_t *lines, const size_t lanz)
|
||||
fprintf(stderr, "%s: internal error\n", PROJECT);
|
||||
return -1;
|
||||
}
|
||||
if (lanz == 0)
|
||||
if (lines_size == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (j=0; j<lanz; ++j) {
|
||||
for (size_t j = 0; j < lines_size; ++j) {
|
||||
if (lines[j].len > 0) {
|
||||
size_t ispc;
|
||||
nonblank = 1;
|
||||
ispc = strspn (lines[j].text, " ");
|
||||
if ((int) ispc < res)
|
||||
size_t ispc = strspn(lines[j].text, " ");
|
||||
if ((int) ispc < res) {
|
||||
res = ispc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nonblank)
|
||||
if (nonblank) {
|
||||
return res; /* success */
|
||||
else
|
||||
} else {
|
||||
return 0; /* success, but only blank lines */
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1327,12 +1323,12 @@ static int apply_substitutions (const int mode)
|
||||
|
||||
|
||||
|
||||
static int has_linebreak (const char *s, const int len)
|
||||
static int has_linebreak (const uint32_t *s, const int len)
|
||||
/*
|
||||
* Determine if the given line of raw text is ended by a line break.
|
||||
*
|
||||
* s: the string to check
|
||||
* len: length of s
|
||||
* len: length of s in characters
|
||||
*
|
||||
* RETURNS: != 0 line break found
|
||||
* == 0 line break not found
|
||||
@ -1342,10 +1338,10 @@ static int has_linebreak (const char *s, const int len)
|
||||
{
|
||||
int result = 0;
|
||||
if (s != NULL && len > 0) {
|
||||
char the_last = s[len - 1];
|
||||
result = the_last == '\r' || the_last == '\n';
|
||||
ucs4_t the_last = s[len - 1];
|
||||
result = u32_cmp(&char_cr, &the_last, 1) == 0 || u32_cmp(&char_newline, &the_last, 1) == 0;
|
||||
#if defined(DEBUG)
|
||||
fprintf(stderr, "has_linebreak: (%d) %d\n", the_last, result);
|
||||
fprintf(stderr, "has_linebreak: (%#010x) %d\n", (int) the_last, result);
|
||||
#endif
|
||||
}
|
||||
return result;
|
||||
@ -1353,6 +1349,60 @@ static int has_linebreak (const char *s, const int len)
|
||||
|
||||
|
||||
|
||||
static size_t count_invisible_chars(const uint32_t *s, const size_t buflen, size_t *num_esc, char **ascii)
|
||||
{
|
||||
size_t invis = 0; /* counts invisible characters */
|
||||
int ansipos = 0; /* progression of ansi sequence */
|
||||
*num_esc = 0; /* counts the number of escape sequences found */
|
||||
|
||||
if (is_empty(s)) {
|
||||
(*ascii) = (char *) strdup("");
|
||||
return 0;
|
||||
}
|
||||
(*ascii) = (char *) calloc(buflen, sizeof(char));
|
||||
char *p = *ascii;
|
||||
|
||||
ucs4_t c;
|
||||
const uint32_t *rest = s;
|
||||
while ((rest = u32_next(&c, rest))) {
|
||||
if (ansipos == 0 && c == 0x0000001b) {
|
||||
/* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */
|
||||
ansipos++;
|
||||
invis++;
|
||||
(*num_esc)++;
|
||||
} else if (ansipos == 1 && c == '[') {
|
||||
/* Found '[' char after ESC. A CSI sequence has started. */
|
||||
ansipos++;
|
||||
invis++;
|
||||
} else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
|
||||
/* Found a byte designating the end of a two-byte escape sequence */
|
||||
invis++;
|
||||
ansipos = 0;
|
||||
} else if (ansipos == 2) {
|
||||
/* Inside CSI sequence - Keep counting bytes as invisible */
|
||||
invis++;
|
||||
|
||||
/* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */
|
||||
if (c >= 0x40 && c <= 0x7e) {
|
||||
ansipos = 0;
|
||||
}
|
||||
} else if (is_ascii_printable(c)) {
|
||||
*p = c & 0xff;
|
||||
++p;
|
||||
} else {
|
||||
int cols = uc_width(c, encoding);
|
||||
if (cols > 0) {
|
||||
memset(p, (int) 'x', cols);
|
||||
p += cols;
|
||||
}
|
||||
}
|
||||
}
|
||||
*p = '\0';
|
||||
return invis;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static int read_all_input (const int use_stdin)
|
||||
/*
|
||||
* Read entire input (possibly from stdin) and store it in 'input' array.
|
||||
@ -1370,15 +1420,9 @@ static int read_all_input (const int use_stdin)
|
||||
*/
|
||||
{
|
||||
char buf[LINE_MAX_BYTES + 2]; /* input buffer */
|
||||
size_t len_bytes;
|
||||
char c;
|
||||
size_t invis; /* counts invisible characters */
|
||||
int ansipos; /* progression of ansi sequence */
|
||||
size_t len_chars;
|
||||
size_t input_size = 0; /* number of elements allocated */
|
||||
line_t *tmp = NULL;
|
||||
char *temp = NULL; /* string resulting from tab exp. */
|
||||
uint8_t *mbtemp = NULL; /* temp string for preparing the multi-byte input */
|
||||
size_t newlen; /* line length after tab expansion */
|
||||
uint32_t *mbtemp = NULL; /* temp string for preparing the multi-byte input */
|
||||
size_t i;
|
||||
int rc;
|
||||
|
||||
@ -1391,11 +1435,10 @@ static int read_all_input (const int use_stdin)
|
||||
/*
|
||||
* Start reading
|
||||
*/
|
||||
while (fgets (buf, LINE_MAX_BYTES+1, opt.infile))
|
||||
{
|
||||
while (fgets(buf, LINE_MAX_BYTES + 1, opt.infile)) {
|
||||
if (input_size % 100 == 0) {
|
||||
input_size += 100;
|
||||
tmp = (line_t *) realloc (input.lines, input_size*sizeof(line_t));
|
||||
line_t *tmp = (line_t *) realloc(input.lines, input_size * sizeof(line_t));
|
||||
if (tmp == NULL) {
|
||||
perror(PROJECT);
|
||||
BFREE (input.lines);
|
||||
@ -1404,83 +1447,60 @@ static int read_all_input (const int use_stdin)
|
||||
input.lines = tmp;
|
||||
}
|
||||
|
||||
len_bytes = strlen(buf);
|
||||
mbtemp = u8_strconv_from_locale(buf);
|
||||
input.lines[input.anz_lines].len = u8_strwidth(mbtemp, encoding);
|
||||
input.lines[input.anz_lines].num_leading_blanks = 0;
|
||||
input.final_newline = has_linebreak(buf, len_bytes);
|
||||
mbtemp = u32_strconv_from_locale(buf);
|
||||
len_chars = u32_strlen(mbtemp);
|
||||
input.final_newline = has_linebreak(mbtemp, len_chars);
|
||||
|
||||
if (opt.r) {
|
||||
input.lines[input.anz_lines].len -= 1; /* TODO HERE */
|
||||
if (buf[input.lines[input.anz_lines].len] == '\n')
|
||||
buf[input.lines[input.anz_lines].len] = '\0';
|
||||
if (is_char_at(mbtemp, len_chars - 1, char_newline)) {
|
||||
set_char_at(mbtemp, len_chars - 1, char_nul);
|
||||
--len_chars;
|
||||
}
|
||||
}
|
||||
else {
|
||||
btrim (buf, &(input.lines[input.anz_lines].len));
|
||||
btrim32(mbtemp, &len_chars);
|
||||
}
|
||||
|
||||
if (input.lines[input.anz_lines].len > 0) {
|
||||
newlen = expand_tabs_into (buf,
|
||||
input.lines[input.anz_lines].len, opt.tabstop, &temp,
|
||||
/*
|
||||
* Expand tabs
|
||||
*/
|
||||
if (len_chars > 0) {
|
||||
uint32_t *temp = NULL;
|
||||
len_chars = expand_tabs_into(mbtemp, opt.tabstop, &temp,
|
||||
&(input.lines[input.anz_lines].tabpos),
|
||||
&(input.lines[input.anz_lines].tabpos_len));
|
||||
if (newlen == 0) {
|
||||
if (len_chars == 0) {
|
||||
perror(PROJECT);
|
||||
BFREE (input.lines);
|
||||
return 1;
|
||||
}
|
||||
input.lines[input.anz_lines].text = temp;
|
||||
input.lines[input.anz_lines].len = newlen;
|
||||
input.lines[input.anz_lines].mbtext = temp;
|
||||
temp = NULL;
|
||||
}
|
||||
else {
|
||||
input.lines[input.anz_lines].text = (char *) strdup (buf);
|
||||
input.lines[input.anz_lines].mbtext = mbtemp;
|
||||
}
|
||||
input.lines[input.anz_lines].num_chars = len_chars;
|
||||
|
||||
/*
|
||||
* Find ANSI CSI/ESC sequences
|
||||
*/
|
||||
invis = 0;
|
||||
ansipos = 0;
|
||||
for (i=0; i<input.lines[input.anz_lines].len; ++i) {
|
||||
c = input.lines[input.anz_lines].text[i];
|
||||
if (ansipos == 0 && c == 0x1b){
|
||||
/* Found an ESC char, count it as invisible and move 1 forward in the
|
||||
* detection of CSI sequences */
|
||||
ansipos++;
|
||||
invis++;
|
||||
} else if (ansipos == 1 && c == '[') {
|
||||
/* Found '[' char after ESC. A CSI sequence has started. */
|
||||
ansipos++;
|
||||
invis++;
|
||||
} else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
|
||||
/* Found a byte designating the end of a two-byte
|
||||
* escape sequence */
|
||||
invis++;
|
||||
ansipos = 0;
|
||||
} else if (ansipos == 2) {
|
||||
/* Inside CSI sequence - Keep counting bytes as invisible */
|
||||
invis++;
|
||||
|
||||
/* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */
|
||||
if (c >= 0x40 && c <= 0x7e)
|
||||
ansipos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Save the count of invisible chars and visible chars.
|
||||
* I'm happy about suggestions for a more elegant handling
|
||||
* of this and the use of .invis and .vischar (and .len)
|
||||
* in the other functions.
|
||||
*/
|
||||
size_t num_esc = 0;
|
||||
size_t invis = count_invisible_chars(input.lines[input.anz_lines].mbtext, strlen(buf), &num_esc,
|
||||
&(input.lines[input.anz_lines].text));
|
||||
input.lines[input.anz_lines].invis = invis;
|
||||
input.lines[input.anz_lines].vischar = input.lines[input.anz_lines].len - invis;
|
||||
input.lines[input.anz_lines].vischar = len_chars - invis;
|
||||
|
||||
/* u32_strwidth() does not count control characters, i.e. ESC characters, for which we must correct */
|
||||
input.lines[input.anz_lines].len =
|
||||
u32_strwidth(input.lines[input.anz_lines].mbtext, encoding) - invis + num_esc;
|
||||
input.lines[input.anz_lines].num_leading_blanks = 0;
|
||||
|
||||
/*
|
||||
* Update length of longest line
|
||||
*/
|
||||
if (input.lines[input.anz_lines].vischar > input.maxline) {
|
||||
input.maxline = input.lines[input.anz_lines].vischar;
|
||||
if (input.lines[input.anz_lines].len > input.maxline) {
|
||||
input.maxline = input.lines[input.anz_lines].len;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1499,26 +1519,34 @@ static int read_all_input (const int use_stdin)
|
||||
else {
|
||||
/* recalculate input statistics for redrawing the mended box */
|
||||
for (i = 0; i < input.anz_lines; ++i) {
|
||||
input.lines[i].len = strlen (input.lines[i].text);
|
||||
if (input.lines[i].len > input.maxline)
|
||||
size_t num_esc = 0;
|
||||
char *dummy;
|
||||
size_t invis = count_invisible_chars(input.lines[i].mbtext, strlen(input.lines[i].text), &num_esc, &dummy);
|
||||
BFREE(dummy);
|
||||
input.lines[i].len = u32_strwidth(input.lines[i].mbtext, encoding) - invis + num_esc;
|
||||
input.lines[i].num_chars = u32_strlen(input.lines[i].mbtext);
|
||||
if (input.lines[i].len > input.maxline) {
|
||||
input.maxline = input.lines[i].len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Exit if there was no input at all
|
||||
*/
|
||||
if (input.lines == NULL || input.lines[0].text == NULL)
|
||||
if (input.lines == NULL || input.lines[0].text == NULL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute indentation
|
||||
*/
|
||||
rc = get_indent(input.lines, input.anz_lines);
|
||||
if (rc >= 0)
|
||||
if (rc >= 0) {
|
||||
input.indent = (size_t) rc;
|
||||
else
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove indentation, unless we want to preserve it (when removing
|
||||
@ -1526,11 +1554,15 @@ static int read_all_input (const int use_stdin)
|
||||
*/
|
||||
if (opt.design->indentmode != 't' && opt.r == 0) {
|
||||
for (i = 0; i < input.anz_lines; ++i) {
|
||||
if (input.lines[i].len >= input.indent) {
|
||||
if (input.lines[i].num_chars >= input.indent) {
|
||||
memmove(input.lines[i].text, input.lines[i].text + input.indent,
|
||||
input.lines[i].len - input.indent + 1);
|
||||
input.lines[i].len -= input.indent;
|
||||
input.lines[i].vischar -= input.indent;
|
||||
|
||||
u32_move(input.lines[i].mbtext, input.lines[i].mbtext + input.indent,
|
||||
input.lines[i].num_chars - input.indent + 1);
|
||||
input.lines[i].num_chars -= input.indent;
|
||||
}
|
||||
}
|
||||
input.maxline -= input.indent;
|
||||
@ -1540,32 +1572,38 @@ static int read_all_input (const int use_stdin)
|
||||
* Apply regular expression substitutions
|
||||
*/
|
||||
if (opt.r == 0) {
|
||||
if (apply_substitutions(0) != 0)
|
||||
if (apply_substitutions(0) != 0) { // TODO HERE
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
/*
|
||||
* Debugging Code: Display contents of input structure
|
||||
*/
|
||||
fprintf (stderr, "Encoding: %s\n", encoding);
|
||||
fprintf (stderr, "Input Lines:\n");
|
||||
fprintf (stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
|
||||
for (i=0; i<input.anz_lines; ++i) {
|
||||
fprintf (stderr, "%3d [%02d] \"%s\"", i, input.lines[i].len,
|
||||
input.lines[i].text);
|
||||
fprintf (stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i,
|
||||
(int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext),
|
||||
(int) input.lines[i].len, input.lines[i].text);
|
||||
fprintf (stderr, "\tTabs: [");
|
||||
if (input.lines[i].tabpos != NULL) {
|
||||
size_t j;
|
||||
for (j=0; j<input.lines[i].tabpos_len; ++j) {
|
||||
fprintf (stderr, "%d", input.lines[i].tabpos[j]);
|
||||
fprintf (stderr, "%d", (int) input.lines[i].tabpos[j]);
|
||||
if (j < input.lines[i].tabpos_len - 1) {
|
||||
fprintf (stderr, ", ");
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf (stderr, "] (%d)\n", input.lines[i].tabpos_len);
|
||||
fprintf (stderr, "] (%d)", (int) input.lines[i].tabpos_len);
|
||||
fprintf (stderr, "\tvis=%d, invis=%d\n", (int) input.lines[i].vischar, (int) input.lines[i].invis);
|
||||
}
|
||||
fprintf (stderr, "\n Longest line: %d characters.\n", input.maxline);
|
||||
fprintf (stderr, " Indentation: %2d spaces.\n", input.indent);
|
||||
fprintf (stderr, "Final newline: %d.\n", input.final_newline);
|
||||
fprintf (stderr, " Longest line: %d columns\n", (int) input.maxline);
|
||||
fprintf (stderr, " Indentation: %2d spaces\n", (int) input.indent);
|
||||
fprintf (stderr, "Final newline: %s\n", input.final_newline ? "yes" : "no");
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
@ -1607,6 +1645,9 @@ int main (int argc, char *argv[])
|
||||
*/
|
||||
setlocale(LC_ALL, ""); /* switch from default "C" encoding to system encoding */
|
||||
encoding = locale_charset();
|
||||
#ifdef DEBUG
|
||||
fprintf (stderr, "Character Encoding = %s\n", encoding);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Parse config file, then reset design pointer
|
||||
|
@ -145,26 +145,19 @@ typedef struct { /* Command line options: */
|
||||
extern opt_t opt;
|
||||
|
||||
|
||||
extern char *encoding; /* the character encoding that we use */
|
||||
|
||||
extern ucs4_t char_tab; /* ucs4_t character '\t' (tab) */
|
||||
extern ucs4_t char_space; /* ucs4_t character ' ' (space) */
|
||||
extern ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */
|
||||
extern ucs4_t char_newline; /* ucs4_t character '\n' (newline) */
|
||||
extern ucs4_t char_nul; /* ucs4_t character '\0' (zero) */
|
||||
|
||||
|
||||
typedef struct {
|
||||
size_t len; /* length of text in columns (character positions in a text terminal) */
|
||||
char *text; /* ASCII line content, tabs expanded, multi-byte chars replaced with 'x' */
|
||||
uint8_t *mbtext; /* multi-byte (original) line content, tabs expanded. We use UTF-8 so that our old regex code can find ASCII characters in it. */
|
||||
size_t invis; /* number of characters part of an ansi sequence */
|
||||
size_t vischar; /* number of normal printable characters */
|
||||
size_t *tabpos; /* tab positions in expanded work strings */
|
||||
char *text; /* ASCII line content, tabs expanded, multi-byte chars replaced with one or more 'x' */
|
||||
uint32_t *mbtext; /* multi-byte (original) line content, tabs expanded. We use UTF-32 in order to enable pointer arithmetic. */
|
||||
size_t num_chars; /* number of characters in mbtext, visible + invisible */
|
||||
size_t invis; /* number of characters part of an ansi sequence (aka "invisible") */
|
||||
size_t vischar; /* number of normal printable characters (aka "visible") */
|
||||
size_t *tabpos; /* tab positions in expanded work strings, or NULL if not needed */
|
||||
size_t tabpos_len; /* number of tabs in a line */
|
||||
size_t num_leading_blanks; /* number of spaces at the start of the line after justification */
|
||||
} line_t;
|
||||
|
||||
|
||||
#ifndef FILE_LEXER_L
|
||||
typedef struct {
|
||||
line_t *lines;
|
||||
|
50
src/tools.c
50
src/tools.c
@ -30,11 +30,13 @@
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
|
||||
#include <unictype.h>
|
||||
#include <unistr.h>
|
||||
#include <unitypes.h>
|
||||
|
||||
#include "shape.h"
|
||||
#include "boxes.h"
|
||||
#include "unicode.h"
|
||||
#include "tools.h"
|
||||
|
||||
|
||||
@ -239,13 +241,12 @@ int empty_line(const line_t *line)
|
||||
|
||||
|
||||
|
||||
size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
|
||||
const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len)
|
||||
size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text,
|
||||
size_t **tabpos, size_t *tabpos_len)
|
||||
/*
|
||||
* Expand tab chars in input_buffer and store result in text.
|
||||
*
|
||||
* input_buffer Line of text with tab chars
|
||||
* in_len length of the string in input_buffer in characters
|
||||
* tabstop tab stop distance
|
||||
* text address of the pointer that will take the result
|
||||
* tabpos array of ints giving the positions of the first
|
||||
@ -263,17 +264,20 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
|
||||
{
|
||||
static uint32_t temp[LINE_MAX_BYTES * MAX_TABSTOP + 1]; /* work string */
|
||||
size_t io; /* character position in work string */
|
||||
size_t tabnum; /* index of the current tab */
|
||||
size_t tabnum = 0; /* index of the current tab */
|
||||
|
||||
*text = NULL;
|
||||
|
||||
if (opt.tabexp != 'k') {
|
||||
/* We need to know the exact tab positions only if expansion type 'k' is requested (keep tabs as much as they
|
||||
* were as possible). Else we'll just convert spaces and tabs without having to know where exactly the tabs
|
||||
* were in the first place. */
|
||||
*tabpos_len = 0;
|
||||
} else {
|
||||
ucs4_t puc;
|
||||
const uint32_t *rest = input_buffer;
|
||||
while (rest = u32_next(&puc, rest)) {
|
||||
if (u32_cmp(&char_tab, &puc, 1) == 0) {
|
||||
while ((rest = u32_next(&puc, rest))) {
|
||||
if (puc == char_tab) {
|
||||
(*tabpos_len)++;
|
||||
}
|
||||
}
|
||||
@ -289,8 +293,8 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
|
||||
ucs4_t puc;
|
||||
const uint32_t *rest = input_buffer;
|
||||
io = 0;
|
||||
while (rest = u32_next(&puc, rest)) {
|
||||
if (u32_cmp(&char_tab, &puc, 1) == 0) { /* Is it a tab char? */
|
||||
while ((rest = u32_next(&puc, rest))) {
|
||||
if (puc == char_tab) {
|
||||
if (*tabpos_len > 0) {
|
||||
(*tabpos)[tabnum++] = io;
|
||||
}
|
||||
@ -299,7 +303,7 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
|
||||
io += num_spc;
|
||||
}
|
||||
else {
|
||||
u32_set(temp + io, puc, 1);
|
||||
set_char_at(temp, io, puc);
|
||||
++io;
|
||||
}
|
||||
}
|
||||
@ -335,6 +339,34 @@ void btrim(char *text, size_t *len)
|
||||
|
||||
|
||||
|
||||
void btrim32(uint32_t *text, size_t *len)
|
||||
/*
|
||||
* Remove trailing whitespace from line (unicode version).
|
||||
*
|
||||
* text string to trim
|
||||
* len pointer to the length of the string in characters
|
||||
*
|
||||
* Both the string and the length will be modified as trailing whitespace is removed.
|
||||
*
|
||||
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||
*/
|
||||
{
|
||||
int idx = (int) (*len - 1);
|
||||
|
||||
for (; idx >= 0; --idx) {
|
||||
ucs4_t c = text[idx];
|
||||
if (uc_is_c_whitespace(c) || uc_is_property_white_space(c) || uc_is_property_bidi_whitespace(c)) {
|
||||
set_char_at(text, idx, char_nul);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
*len = idx + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
char *my_strnrstr(const char *s1, const char *s2, const size_t s2_len, int skip)
|
||||
/*
|
||||
* Return pointer to last occurrence of string s2 in string s1.
|
||||
|
@ -43,11 +43,13 @@ void regerror(char *msg);
|
||||
|
||||
int empty_line(const line_t *line);
|
||||
|
||||
size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
|
||||
const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len);
|
||||
size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text,
|
||||
size_t **tabpos, size_t *tabpos_len);
|
||||
|
||||
void btrim(char *text, size_t *len);
|
||||
|
||||
void btrim32(uint32_t *text, size_t *len);
|
||||
|
||||
char *my_strnrstr(const char *s1, const char *s2, const size_t s2_len,
|
||||
int skip);
|
||||
|
||||
|
96
src/unicode.c
Normal file
96
src/unicode.c
Normal file
@ -0,0 +1,96 @@
|
||||
/*
|
||||
* boxes - Command line filter to draw/remove ASCII boxes around text
|
||||
* Copyright (C) 1999 Thomas Jensen and the boxes contributors
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License, version 2, as published
|
||||
* by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*
|
||||
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||
*/
|
||||
|
||||
/*
|
||||
* Functions and constants for handling unicode strings with libunistring.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <unictype.h>
|
||||
#include <unistr.h>
|
||||
#include <unitypes.h>
|
||||
|
||||
#include "unicode.h"
|
||||
|
||||
|
||||
const char *encoding; /* the character encoding that we use; main() in boxes.c assigns it from locale_charset() */
|
||||
|
||||
const ucs4_t char_tab = 0x00000009; /* ucs4_t character '\t' (tab), code point U+0009 */
|
||||
const ucs4_t char_space = 0x00000020; /* ucs4_t character ' ' (space), code point U+0020 */
|
||||
const ucs4_t char_cr = 0x0000000d; /* ucs4_t character '\r' (carriage return), code point U+000D */
|
||||
const ucs4_t char_newline = 0x0000000a; /* ucs4_t character '\n' (newline), code point U+000A */
|
||||
const ucs4_t char_nul = 0x00000000; /* ucs4_t character '\0' (zero), used as string terminator */
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Check whether the character at the given index has the given value.
|
||||
*
|
||||
* @param <text> the string to check
|
||||
* @param <idx> the index position of the character to check
|
||||
* @param <expected_char> the expected character value
|
||||
* @return flag indicating whether the character has the expected value
|
||||
*/
|
||||
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char)
|
||||
{
|
||||
return text != NULL && u32_cmp(text + idx, &expected_char, 1) == 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Set the character at the given index to the given value.
|
||||
*
|
||||
* @param <text> the string to modify
|
||||
* @param <idx> the index position of the character to modify
|
||||
* @param <char_to_set> the new character value
|
||||
*/
|
||||
void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set)
|
||||
{
|
||||
u32_set(text + idx, char_to_set, 1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Determine if a string is NULL/empty or not.
|
||||
*
|
||||
* @param <text> the string to check
|
||||
* @return > 0: the string is empty or NULL
|
||||
* 0: the string contains at least 1 character
|
||||
*/
|
||||
int is_empty(const uint32_t *text)
|
||||
{
|
||||
return text == NULL || is_char_at(text, 0, char_nul);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Tell whether a character is a printable ASCII character, which means its code point is in the range
 * from 0x20 (space) to 0x7e (tilde), both inclusive.
 *
 * @param <c> the character to check
 * @return 1 if the character is printable ASCII, 0 otherwise
 */
int is_ascii_printable(const ucs4_t c)
{
    if (c < 0x20) {
        return 0;
    }
    return (c <= 0x7e) ? 1 : 0;
}
|
||||
|
||||
|
||||
|
||||
/*EOF*/ /* vim: set sw=4: */
|
49
src/unicode.h
Normal file
49
src/unicode.h
Normal file
@ -0,0 +1,49 @@
|
||||
/*
|
||||
* boxes - Command line filter to draw/remove ASCII boxes around text
|
||||
* Copyright (C) 1999 Thomas Jensen and the boxes contributors
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License, version 2, as published
|
||||
* by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*
|
||||
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||
*/
|
||||
|
||||
/*
|
||||
* Functions and constants for handling unicode strings with libunistring.
|
||||
*/
|
||||
|
||||
#ifndef UNICODE_H
|
||||
#define UNICODE_H
|
||||
|
||||
extern const char *encoding; /* the character encoding that we use */
|
||||
|
||||
extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */
|
||||
extern const ucs4_t char_space; /* ucs4_t character ' ' (space) */
|
||||
extern const ucs4_t char_cr; /* ucs4_t character '\r' (carriage return) */
|
||||
extern const ucs4_t char_newline; /* ucs4_t character '\n' (newline) */
|
||||
extern const ucs4_t char_nul; /* ucs4_t character '\0' (zero) */
|
||||
|
||||
|
||||
|
||||
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);
|
||||
|
||||
void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set);
|
||||
|
||||
int is_empty(const uint32_t *text);
|
||||
|
||||
int is_ascii_printable(const ucs4_t c);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
/*EOF*/ /* vim: set cindent sw=4: */
|
Loading…
x
Reference in New Issue
Block a user