Add unicode-awareness in box removal functionality #1

This commit is contained in:
Thomas Jensen 2021-02-06 18:17:00 +01:00
parent c41a4e881e
commit 5dad2d1137
No known key found for this signature in database
GPG Key ID: A4ACEE270D0FB7DB
7 changed files with 330 additions and 275 deletions

View File

@ -51,7 +51,7 @@ shapes {
} }
delim ?" delim ?"
replace "\*/" with "*\/" # quote closing comment tags replace "\*/" with "*\\/" # escape closing comment tags
reverse "\*\\/" to "*/" reverse "\*\\/" to "*/"
padding { padding {
@ -945,8 +945,8 @@ tags "programming, comment, deprecated"
sample sample
%{-----------------------------------------------------------------+ %{-----------------------------------------------------------------+
| IBM Net.Data Macro Sample - Perl and SQL Backends | | IBM Net.Data Macro Comment |
| Thomas Jensen, February 17, 1998 (Tuesday, 16:40h) | | Created February 17, 1998 (Tuesday, 16:40h) |
+-----------------------------------------------------------------%} +-----------------------------------------------------------------%}
ends ends
@ -1875,7 +1875,7 @@ shapes {
} }
replace "(.)(?!$)" with "$1 " replace "(.)(?!$)" with "$1 "
reverse "^( *)([^ ]*) " to "$1$2" # $1 to leave indentation untouched # TODO reverse "([^ ]) " to "$1"
padding { padding {
horiz 2 horiz 2

View File

@ -1218,88 +1218,6 @@ static int get_indent(const line_t *lines, const size_t lines_size)
/**
 * Analyze the multi-byte string in order to determine its metrics:
 * - number of visible columns it occupies
 * - number of escape characters (== number of escape sequences)
 * - the ASCII equivalent of the string
 * - the number of invisible characters in the string
 *
 * The ASCII equivalent holds one byte per visible column: printable ASCII
 * characters are copied as they are, every other character with a positive
 * column width is represented by that many 'x' bytes. Characters which are
 * part of an escape sequence are counted as invisible and not copied.
 *
 * @param <s> the multi-byte string to analyze
 * @param <num_esc> pointer to where the number of escape sequences should be stored
 * @param <ascii> pointer to where the ASCII equivalent of the string should be stored.
 *      The buffer is newly allocated here and handed to the caller. It is set to NULL
 *      if the buffer could not be allocated.
 * @returns the number of invisible characters in <s>, or 0 if no memory was available
 */
static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii)
{
    size_t invis = 0;      /* counts invisible column positions */
    int ansipos = 0;       /* progression of ansi sequence */
    *num_esc = 0;          /* counts the number of escape sequences found */

    if (is_empty(s)) {
        (*ascii) = (char *) strdup("");
        return 0;
    }

    /*
     * At most one byte is written per visible column, so the string width is an upper
     * bound for the payload. The terminating NUL needs one byte on top of that, or a
     * string without any escape sequences would overflow the buffer by one byte.
     */
    size_t buflen = (size_t) u32_strwidth(s, encoding) + 1;
    (*ascii) = (char *) calloc(buflen, sizeof(char));
    if (*ascii == NULL) {
        perror("boxes");
        return 0;
    }

    char *p = *ascii;
    ucs4_t c;
    const uint32_t *rest = s;

    while ((rest = u32_next(&c, rest))) {
        if (ansipos == 0 && c == char_esc) {
            /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */
            ansipos++;
            invis++;
            (*num_esc)++;
        } else if (ansipos == 1 && c == '[') {
            /* Found '[' char after ESC. A CSI sequence has started. */
            ansipos++;
            invis++;
        } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
            /* Found a byte designating the end of a two-byte escape sequence */
            invis++;
            ansipos = 0;
        } else if (ansipos == 2) {
            /* Inside CSI sequence - Keep counting bytes as invisible */
            invis++;

            /* A char between 0x40 and 0x7e signals the end of a CSI or escape sequence */
            if (c >= 0x40 && c <= 0x7e) {
                ansipos = 0;
            }
        } else if (is_ascii_printable(c)) {
            *p = c & 0xff;
            ++p;
        } else {
            /* not plain ASCII: stand in with one 'x' per column the character occupies */
            int cols = uc_width(c, encoding);
            if (cols > 0) {
                memset(p, (int) 'x', cols);
                p += cols;
            }
        }
    }

    *p = '\0';
    return invis;
}
/**
 * Recompute the derived fields of a line from its multi-byte text `mbtext`:
 * the ASCII equivalent (`text`), the invisible character count (`invis`),
 * the length in columns (`len`) and the number of characters (`num_chars`).
 * The previous `text` buffer is released via BFREE and replaced by the new one.
 *
 * @param <line> the line whose statistics are to be refreshed
 */
static void analyze_line_ascii(line_t *line)
{
    char *ascii_equiv;
    size_t esc_count = 0;
    const size_t hidden = count_invisible_chars(line->mbtext, &esc_count, &ascii_equiv);

    /* swap in the freshly built ASCII representation */
    BFREE(line->text);
    line->text = ascii_equiv;

    line->invis = hidden;
    line->num_chars = u32_strlen(line->mbtext);

    /* u32_strwidth() does not count control characters, i.e. ESC characters, for which we must correct */
    line->len = u32_strwidth(line->mbtext, encoding) - hidden + esc_count;
}
static int apply_substitutions(const int mode) static int apply_substitutions(const int mode)
/* /*
* Apply regular expression substitutions to input text. * Apply regular expression substitutions to input text.
@ -1377,9 +1295,6 @@ static int apply_substitutions(const int mode)
input.lines[k].mbtext_org = newtext; input.lines[k].mbtext_org = newtext;
analyze_line_ascii(input.lines + k); analyze_line_ascii(input.lines + k);
if (input.lines[k].len > input.maxline) {
input.maxline = input.lines[k].len;
}
#ifdef REGEXP_DEBUG #ifdef REGEXP_DEBUG
fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k, fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k,
@ -1481,12 +1396,17 @@ static int read_all_input(const int use_stdin)
mbtemp = u32_strconv_from_locale(buf); mbtemp = u32_strconv_from_locale(buf);
len_chars = u32_strlen(mbtemp); len_chars = u32_strlen(mbtemp);
input.final_newline = has_linebreak(mbtemp, len_chars); input.final_newline = has_linebreak(mbtemp, len_chars);
input.lines[input.anz_lines].posmap = NULL;
if (opt.r) { if (opt.r) {
if (is_char_at(mbtemp, len_chars - 1, char_newline)) { if (is_char_at(mbtemp, len_chars - 1, char_newline)) {
set_char_at(mbtemp, len_chars - 1, char_nul); set_char_at(mbtemp, len_chars - 1, char_nul);
--len_chars; --len_chars;
} }
if (is_char_at(mbtemp, len_chars - 1, char_cr)) {
set_char_at(mbtemp, len_chars - 1, char_nul);
--len_chars;
}
} }
else { else {
btrim32(mbtemp, &len_chars); btrim32(mbtemp, &len_chars);
@ -1518,11 +1438,9 @@ static int read_all_input(const int use_stdin)
/* /*
* Build ASCII equivalent of the multi-byte string, update line stats * Build ASCII equivalent of the multi-byte string, update line stats
*/ */
input.lines[input.anz_lines].text = NULL; /* we haven't used it yet! */
analyze_line_ascii(input.lines + input.anz_lines); analyze_line_ascii(input.lines + input.anz_lines);
input.lines[input.anz_lines].num_leading_blanks = 0; input.lines[input.anz_lines].num_leading_blanks = 0;
if (input.lines[input.anz_lines].len > input.maxline) {
input.maxline = input.lines[input.anz_lines].len;
}
++input.anz_lines; ++input.anz_lines;
} }
@ -1538,9 +1456,6 @@ static int read_all_input(const int use_stdin)
/* recalculate input statistics for redrawing the mended box */ /* recalculate input statistics for redrawing the mended box */
for (i = 0; i < input.anz_lines; ++i) { for (i = 0; i < input.anz_lines; ++i) {
analyze_line_ascii(input.lines + i); analyze_line_ascii(input.lines + i);
if (input.lines[i].len > input.maxline) {
input.maxline = input.lines[i].len;
}
} }
} }
@ -1567,6 +1482,10 @@ static int read_all_input(const int use_stdin)
*/ */
if (opt.design->indentmode != 't' && opt.r == 0) { if (opt.design->indentmode != 't' && opt.r == 0) {
for (i = 0; i < input.anz_lines; ++i) { for (i = 0; i < input.anz_lines; ++i) {
#ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
#endif
if (input.lines[i].num_chars >= input.indent) { if (input.lines[i].num_chars >= input.indent) {
memmove(input.lines[i].text, input.lines[i].text + input.indent, memmove(input.lines[i].text, input.lines[i].text + input.indent,
input.lines[i].len - input.indent + 1); input.lines[i].len - input.indent + 1);
@ -1575,6 +1494,10 @@ static int read_all_input(const int use_stdin)
input.lines[i].mbtext = advance32(input.lines[i].mbtext, input.indent); input.lines[i].mbtext = advance32(input.lines[i].mbtext, input.indent);
input.lines[i].num_chars -= input.indent; input.lines[i].num_chars -= input.indent;
} }
#ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_locale(input.lines[i].mbtext), (int) input.lines[i].num_chars);
#endif
} }
input.maxline -= input.indent; input.maxline -= input.indent;
} }
@ -1588,33 +1511,9 @@ static int read_all_input(const int use_stdin)
} }
} }
#if 0 #ifdef DEBUG
/*
* Debugging Code: Display contents of input structure
*/
fprintf (stderr, "Encoding: %s\n", encoding); fprintf (stderr, "Encoding: %s\n", encoding);
fprintf (stderr, "Input Lines:\n"); print_input_lines(NULL);
fprintf (stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
for (i=0; i<input.anz_lines; ++i) {
fprintf (stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i,
(int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext),
(int) input.lines[i].len, input.lines[i].text);
fprintf (stderr, "\tTabs: [");
if (input.lines[i].tabpos != NULL) {
size_t j;
for (j=0; j<input.lines[i].tabpos_len; ++j) {
fprintf (stderr, "%d", (int) input.lines[i].tabpos[j]);
if (j < input.lines[i].tabpos_len - 1) {
fprintf (stderr, ", ");
}
}
}
fprintf (stderr, "] (%d)", (int) input.lines[i].tabpos_len);
fprintf (stderr, "\tinvisible=%d\n", (int) input.lines[i].invis);
}
fprintf (stderr, " Longest line: %d columns\n", (int) input.maxline);
fprintf (stderr, " Indentation: %2d spaces\n", (int) input.indent);
fprintf (stderr, "Final newline: %s\n", input.final_newline ? "yes" : "no");
#endif #endif
return 0; return 0;

View File

@ -26,7 +26,7 @@
#define BOXES_H #define BOXES_H
/* #define DEBUG 1 */ /* #define DEBUG 1 */
#define REGEXP_DEBUG 1 /* #define REGEXP_DEBUG 1 */
/* #define PARSER_DEBUG 1 */ /* #define PARSER_DEBUG 1 */
/* #define LEXER_DEBUG 1 */ /* #define LEXER_DEBUG 1 */
@ -158,6 +158,7 @@ typedef struct {
size_t *tabpos; /* tab positions in expanded work strings, or NULL if not needed */ size_t *tabpos; /* tab positions in expanded work strings, or NULL if not needed */
size_t tabpos_len; /* number of tabs in a line */ size_t tabpos_len; /* number of tabs in a line */
size_t num_leading_blanks; /* number of spaces at the start of the line after justification */ size_t num_leading_blanks; /* number of spaces at the start of the line after justification */
size_t *posmap; /* for each character in `text`, position of corresponding char in `mbtext`. Needed for box removal. */
} line_t; } line_t;

View File

@ -101,8 +101,8 @@ static int horiz_precalc(const sentry_t *sarr,
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "in horiz_precalc:\n "); fprintf (stderr, "in horiz_precalc:\n ");
fprintf (stderr, "opt.design->minwidth %d, input.maxline %d, target_width" fprintf (stderr, "opt.design->minwidth %d, input.maxline %d, target_width"
" %d, tnumsh %d, bnumsh %d\n", opt.design->minwidth, " %d, tnumsh %d, bnumsh %d\n", (int) opt.design->minwidth,
input.maxline, target_width, tnumsh, bnumsh); (int) input.maxline, (int) target_width, tnumsh, bnumsh);
#endif #endif
twidth = 0; twidth = 0;
@ -533,17 +533,17 @@ static int horiz_generate(sentry_t *tresult, sentry_t *bresult)
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Top side box rect width %d, height %d.\n", fprintf (stderr, "Top side box rect width %d, height %d.\n",
tresult->width, tresult->height); (int) tresult->width, (int) tresult->height);
fprintf (stderr, "Top columns to fill: %s %d, %s %d, %s %d.\n", fprintf (stderr, "Top columns to fill: %s %d, %s %d, %s %d.\n",
shape_name[north_side[1]], tiltf[0], shape_name[north_side[1]], (int) tiltf[0],
shape_name[north_side[2]], tiltf[1], shape_name[north_side[2]], (int) tiltf[1],
shape_name[north_side[3]], tiltf[2]); shape_name[north_side[3]], (int) tiltf[2]);
fprintf (stderr, "Bottom side box rect width %d, height %d.\n", fprintf (stderr, "Bottom side box rect width %d, height %d.\n",
bresult->width, bresult->height); (int) bresult->width, (int) bresult->height);
fprintf (stderr, "Bottom columns to fill: %s %d, %s %d, %s %d.\n", fprintf (stderr, "Bottom columns to fill: %s %d, %s %d, %s %d.\n",
shape_name[south_side[1]], biltf[0], shape_name[south_side[1]], (int) biltf[0],
shape_name[south_side[2]], biltf[1], shape_name[south_side[2]], (int) biltf[1],
shape_name[south_side[3]], biltf[2]); shape_name[south_side[3]], (int) biltf[2]);
#endif #endif
tresult->chars = (char **) calloc(tresult->height, sizeof(char *)); tresult->chars = (char **) calloc(tresult->height, sizeof(char *));
@ -561,21 +561,21 @@ static int horiz_generate(sentry_t *tresult, sentry_t *bresult)
return rc; return rc;
} }
#if defined(DEBUG) && 1 #ifdef DEBUG
{ {
/* /*
* Debugging code - Output horizontal sides of box * Debugging code - Output horizontal sides of box
*/ */
size_t j; size_t j;
fprintf (stderr, "TOP SIDE:\n"); fprintf(stderr, "TOP SIDE:\n");
for (j=0; j<tresult->height; ++j) { for (j = 0; j < tresult->height; ++j) {
fprintf (stderr, " %2d: \'%s\'\n", j, fprintf(stderr, " %2d: \'%s\'\n", (int) j,
tresult->chars[j]? tresult->chars[j] : "(null)"); tresult->chars[j] ? tresult->chars[j] : "(null)");
} }
fprintf (stderr, "BOTTOM SIDE:\n"); fprintf(stderr, "BOTTOM SIDE:\n");
for (j=0; j<bresult->height; ++j) { for (j = 0; j < bresult->height; ++j) {
fprintf (stderr, " %2d: \'%s\'\n", j, fprintf(stderr, " %2d: \'%s\'\n", (int) j,
bresult->chars[j]? bresult->chars[j] : "(null)"); bresult->chars[j] ? bresult->chars[j] : "(null)");
} }
} }
#endif #endif
@ -616,18 +616,18 @@ static int vert_generate(sentry_t *lresult, sentry_t *rresult)
opt.design->shape[NE].height + opt.design->shape[SE].height; opt.design->shape[NE].height + opt.design->shape[SE].height;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Left side box rect width %d, height %d, vspace %d.\n", fprintf(stderr, "Left side box rect width %d, height %d, vspace %d.\n",
lresult->width, lresult->height, vspace); (int) lresult->width, (int) lresult->height, (int) vspace);
fprintf (stderr, "Left lines to fill: %s %d, %s %d, %s %d.\n", fprintf(stderr, "Left lines to fill: %s %d, %s %d, %s %d.\n",
shape_name[west_side[1]], leftiltf[0], shape_name[west_side[1]], (int) leftiltf[0],
shape_name[west_side[2]], leftiltf[1], shape_name[west_side[2]], (int) leftiltf[1],
shape_name[west_side[3]], leftiltf[2]); shape_name[west_side[3]], (int) leftiltf[2]);
fprintf (stderr, "Right side box rect width %d, height %d, vspace %d.\n", fprintf(stderr, "Right side box rect width %d, height %d, vspace %d.\n",
rresult->width, rresult->height, vspace); (int) rresult->width, (int) rresult->height, (int) vspace);
fprintf (stderr, "Right lines to fill: %s %d, %s %d, %s %d.\n", fprintf(stderr, "Right lines to fill: %s %d, %s %d, %s %d.\n",
shape_name[east_side[1]], rightiltf[0], shape_name[east_side[1]], (int) rightiltf[0],
shape_name[east_side[2]], rightiltf[1], shape_name[east_side[2]], (int) rightiltf[1],
shape_name[east_side[3]], rightiltf[2]); shape_name[east_side[3]], (int) rightiltf[2]);
#endif #endif
lresult->chars = (char **) calloc(lresult->height, sizeof(char *)); lresult->chars = (char **) calloc(lresult->height, sizeof(char *));
@ -648,15 +648,15 @@ static int vert_generate(sentry_t *lresult, sentry_t *rresult)
* Debugging code - Output left and right side of box * Debugging code - Output left and right side of box
*/ */
size_t j; size_t j;
fprintf (stderr, "LEFT SIDE:\n"); fprintf(stderr, "LEFT SIDE:\n");
for (j=0; j<lresult->height; ++j) { for (j = 0; j < lresult->height; ++j) {
fprintf (stderr, " %2d: \'%s\'\n", j, fprintf(stderr, " %2d: \'%s\'\n", (int) j,
lresult->chars[j]? lresult->chars[j] : "(null)"); lresult->chars[j] ? lresult->chars[j] : "(null)");
} }
fprintf (stderr, "RIGHT SIDE:\n"); fprintf(stderr, "RIGHT SIDE:\n");
for (j=0; j<rresult->height; ++j) { for (j = 0; j < rresult->height; ++j) {
fprintf (stderr, " %2d: \'%s\'\n", j, fprintf(stderr, " %2d: \'%s\'\n", (int) j,
rresult->chars[j]? rresult->chars[j] : "(null)"); rresult->chars[j] ? rresult->chars[j] : "(null)");
} }
} }
#endif #endif
@ -933,10 +933,10 @@ int output_box(const sentry_t *thebox)
hfill2[hpr] = '\0'; hfill2[hpr] = '\0';
#if defined(DEBUG) #if defined(DEBUG)
fprintf (stderr, "Alignment: hfill %d hpl %d hpr %d, vfill %d " fprintf(stderr, "Alignment: hfill %d hpl %d hpr %d, vfill %d vfill1 %d vfill2 %d.\n",
"vfill1 %d vfill2 %d.\n", hfill, hpl, hpr, vfill, vfill1, vfill2); (int) hfill, (int) hpl, (int) hpr, (int) vfill, (int) vfill1, (int) vfill2);
fprintf (stderr, " hfill1 = \"%s\"; hfill2 = \"%s\"; " fprintf(stderr, " hfill1 = \"%s\"; hfill2 = \"%s\"; indentspc = \"%s\";\n",
"indentspc = \"%s\";\n", hfill1, hfill2, indentspc); hfill1, hfill2, indentspc);
#endif #endif
/* /*
@ -955,10 +955,10 @@ int output_box(const sentry_t *thebox)
if (empty_side(opt.design->shape, BLEF)) { if (empty_side(opt.design->shape, BLEF)) {
skip_left = opt.design->shape[NW].width; skip_left = opt.design->shape[NW].width;
} /* could simply be 1, though */ } /* could simply be 1, though */
#if defined(DEBUG) #if defined(DEBUG)
fprintf (stderr, "skip_start = %d; skip_end = %d; skip_left = %d; " fprintf(stderr, "skip_start = %d; skip_end = %d; skip_left = %d; nol = %d;\n",
"nol = %d;\n", skip_start, skip_end, skip_left, nol); (int) skip_start, (int) skip_end, (int) skip_left, (int) nol);
#endif #endif
/* /*
* Generate actual output * Generate actual output

View File

@ -26,9 +26,13 @@
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <uniconv.h>
#include <unistr.h>
#include "shape.h" #include "shape.h"
#include "boxes.h" #include "boxes.h"
#include "tools.h" #include "tools.h"
#include "unicode.h"
#include "remove.h" #include "remove.h"
@ -72,8 +76,8 @@ static int best_match(const line_t *line,
nume += opt.design->shape[ESE].height; nume += opt.design->shape[ESE].height;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Number of WEST side shape lines: %d\n", numw); fprintf (stderr, "Number of WEST side shape lines: %d\n", (int) numw);
fprintf (stderr, "Number of EAST side shape lines: %d\n", nume); fprintf (stderr, "Number of EAST side shape lines: %d\n", (int) nume);
#endif #endif
/* /*
@ -156,10 +160,10 @@ static int best_match(const line_t *line,
k = 0; k = 0;
cs = opt.design->shape + east_side[++w]; cs = opt.design->shape + east_side[++w];
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "\nj %d, k %d, w %d, cs->chars[k] = \"%s\"\n", fprintf(stderr, "\nj %d, k %d, w %d, cs->chars[k] = \"%s\"\n",
j, k, w, cs->chars[k]?cs->chars[k]:"(null)"); (int) j, (int) k, w, cs->chars[k] ? cs->chars[k] : "(null)");
#endif #endif
chkline.text = cs->chars[k]; chkline.text = cs->chars[k];
chkline.len = cs->width; chkline.len = cs->width;
@ -248,8 +252,8 @@ static int hmm(const int aside, const size_t follow,
int rc; int rc;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "hmm (%s, %d, \'%c\', \'%c\', %d)\n", fprintf(stderr, "hmm (%s, %d, \'%c\', \'%c\', %d)\n",
aside==BTOP?"BTOP":"BBOT", follow, p[0], *ecs, cnt); aside == BTOP ? "BTOP" : "BBOT", (int) follow, p[0], *ecs, cnt);
#endif #endif
if (p > ecs) { /* last shape tried was too long */ if (p > ecs) { /* last shape tried was too long */
@ -358,11 +362,10 @@ static int detect_horiz(const int aside, size_t *hstart, size_t *hend)
&& line >= input.lines; ++lcnt) { && line >= input.lines; ++lcnt) {
goeast = gowest = 0; goeast = gowest = 0;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "----- Processing line index %2d ----------" fprintf(stderr, "----- Processing line index %2d -----------------------------------------------\n",
"-------------------------------------\n", (int) (aside == BTOP ? lcnt : input.anz_lines - lcnt - 1));
aside == BTOP? lcnt: input.anz_lines - lcnt - 1); #endif
#endif
do { do {
/* /*
@ -412,14 +415,13 @@ static int detect_horiz(const int aside, size_t *hstart, size_t *hend)
/* Now, wcs is either NULL (if west side is empty) */ /* Now, wcs is either NULL (if west side is empty) */
/* or not NULL (if west side is not empty). In any case, p */ /* or not NULL (if west side is not empty). In any case, p */
/* points to where we start searching for the east corner. */ /* points to where we start searching for the east corner. */
#ifdef DEBUG #ifdef DEBUG
if (wcs) { if (wcs) {
fprintf (stderr, "West corner shape matched at " fprintf(stderr, "West corner shape matched at position %d.\n", (int) (wcs - line->text));
"position %d.\n", wcs - line->text); } else {
} else { fprintf(stderr, "West box side is empty.\n");
fprintf (stderr, "West box side is empty.\n"); }
} #endif
#endif
/* /*
* Look for east corner shape * Look for east corner shape
*/ */
@ -444,9 +446,9 @@ static int detect_horiz(const int aside, size_t *hstart, size_t *hend)
continue; continue;
} }
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "East corner shape matched at position %d.\n", ecs-line->text); fprintf(stderr, "East corner shape matched at position %d.\n", (int) (ecs - line->text));
#endif #endif
/* /*
* Check if text between corner shapes is valid * Check if text between corner shapes is valid
@ -455,9 +457,9 @@ static int detect_horiz(const int aside, size_t *hstart, size_t *hend)
if (!mmok) { if (!mmok) {
++goeast; ++goeast;
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Text between corner shapes is %s.\n", mmok? "VALID": "NOT valid"); fprintf(stderr, "Text between corner shapes is %s.\n", mmok ? "VALID" : "NOT valid");
#endif #endif
} while (!mmok); } while (!mmok);
/* /*
@ -800,6 +802,21 @@ static design_t *detect_design()
/**
 * Append spaces to the end of a line and refresh its statistics.
 *
 * The multi-byte buffer `mbtext_org` is grown to hold the additional characters,
 * `mbtext` is pointed at the start of that buffer, `num_chars` is increased by
 * <n>, and the remaining fields are recalculated by analyze_line_ascii().
 * NOTE(review): the spaces are written at offset `num_chars` from the start of
 * `mbtext_org`, so this presumably expects `mbtext == mbtext_org` on entry - confirm.
 *
 * If the buffer cannot be grown, an error is reported on stderr and the line is
 * left exactly as it was.
 *
 * @param <line> the line to extend
 * @param <n> the number of spaces to append; nothing happens if this is 0
 */
static void add_spaces_to_line(line_t *line, const size_t n)
{
    if (n == 0) {
        return;
    }

    /* Keep the old pointer until realloc() has succeeded, so that a failure neither leaks nor loses the text. */
    uint32_t *grown = (uint32_t *) realloc(line->mbtext_org, (line->num_chars + n + 1) * sizeof(uint32_t));
    if (grown == NULL) {
        perror("boxes");
        return;
    }
    line->mbtext_org = grown;
    line->mbtext = line->mbtext_org;

    u32_set(line->mbtext + line->num_chars, char_space, n);
    set_char_at(line->mbtext, line->num_chars + n, char_nul);
    line->num_chars += n;

    analyze_line_ascii(line);
}
int remove_box() int remove_box()
/* /*
* Remove box from input. * Remove box from input.
@ -810,12 +827,12 @@ int remove_box()
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/ */
{ {
size_t textstart = 0; /* index of 1st line of box body */ size_t textstart = 0; /* index of 1st line of box body */
size_t textend = 0; /* index of 1st line of south side */ size_t textend = 0; /* index of 1st line of south side */
size_t boxstart = 0; /* index of 1st line of box */ size_t boxstart = 0; /* index of 1st line of box */
size_t boxend = 0; /* index of 1st line trailing the box */ size_t boxend = 0; /* index of 1st line trailing the box */
int m; /* true if a match was found */ int m; /* true if a match was found */
size_t j; /* loop counter */ size_t j; /* loop counter */
int did_something = 0; /* true if there was something to remove */ int did_something = 0; /* true if there was something to remove */
/* /*
@ -827,14 +844,12 @@ int remove_box()
design_t *tmp = detect_design(); design_t *tmp = detect_design();
if (tmp) { if (tmp) {
opt.design = tmp; opt.design = tmp;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Design autodetection: Removing box of " fprintf(stderr, "Design autodetection: Removing box of design \"%s\".\n", opt.design->name);
"design \"%s\".\n", opt.design->name); #endif
#endif
} }
else { else {
fprintf(stderr, "%s: Box design autodetection failed. Use -d " fprintf(stderr, "%s: Box design autodetection failed. Use -d option.\n", PROJECT);
"option.\n", PROJECT);
return 1; return 1;
} }
} }
@ -847,31 +862,14 @@ int remove_box()
* whose east sides consist of lots of spaces (the given value). So we * whose east sides consist of lots of spaces (the given value). So we
* add a number of spaces equal to the east side width. * add a number of spaces equal to the east side width.
*/ */
input.maxline += opt.design->shape[NE].width; const size_t normalized_len = input.maxline + opt.design->shape[NE].width;
for (j = 0; j < input.anz_lines; ++j) { for (j = 0; j < input.anz_lines; ++j) {
input.lines[j].text = (char *) add_spaces_to_line(input.lines + j, normalized_len - input.lines[j].len);
realloc(input.lines[j].text, input.maxline + input.lines[j].invis + 1);
if (input.lines[j].text == NULL) {
perror(PROJECT);
return 1;
}
memset(input.lines[j].text + input.lines[j].len, ' ',
input.maxline - input.lines[j].len + input.lines[j].invis);
input.lines[j].text[input.maxline] = '\0';
input.lines[j].len = input.maxline;
} }
#ifdef DEBUG
/* fprintf(stderr, "Normalized all lines to %d columns (maxline + east width).\n", (int) input.maxline);
* Debugging Code: Display contents of input structure print_input_lines(" (remove_box)");
*/ #endif
#if defined(DEBUG) && 1
for (j=0; j<input.anz_lines; ++j) {
fprintf (stderr, "%3d [%02d] \"%s\"\n", j, input.lines[j].len,
input.lines[j].text);
}
fprintf (stderr, "\nLongest line: %d characters.\n", input.maxline);
fprintf (stderr, " Indentation: %2d spaces.\n", input.indent);
#endif
/* /*
* Phase 1: Try to find out how many lines belong to the top of the box * Phase 1: Try to find out how many lines belong to the top of the box
@ -880,14 +878,14 @@ int remove_box()
textstart = 0; textstart = 0;
if (empty_side(opt.design->shape, BTOP)) { if (empty_side(opt.design->shape, BTOP)) {
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "----> Top box side is empty: boxstart == textstart == 0.\n"); fprintf(stderr, "----> Top box side is empty: boxstart == textstart == 0.\n");
#endif #endif
} }
else { else {
detect_horiz(BTOP, &boxstart, &textstart); detect_horiz(BTOP, &boxstart, &textstart);
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "----> First line of box is %d, ", boxstart); fprintf(stderr, "----> First line of box is %d, ", (int) boxstart);
fprintf (stderr, "first line of box body (text) is %d.\n", textstart); fprintf(stderr, "first line of box body (text) is %d.\n", (int) textstart);
#endif #endif
} }
@ -899,8 +897,7 @@ int remove_box()
textend = input.anz_lines; textend = input.anz_lines;
boxend = input.anz_lines; boxend = input.anz_lines;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "----> Bottom box side is empty: boxend == textend == %d.\n", fprintf(stderr, "----> Bottom box side is empty: boxend == textend == %d.\n", (int) input.anz_lines);
input.anz_lines);
#endif #endif
} }
else { else {
@ -912,8 +909,8 @@ int remove_box()
boxend = input.anz_lines; boxend = input.anz_lines;
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "----> Last line of box body (text) is %d, ", textend-1); fprintf(stderr, "----> Last line of box body (text) is %d, ", (int) (textend - 1));
fprintf (stderr, "last line of box is %d.\n", boxend-1); fprintf(stderr, "last line of box is %d.\n", (int) (boxend - 1));
#endif #endif
} }
@ -925,7 +922,7 @@ int remove_box()
char *p; char *p;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Calling best_match() for line %d:\n", j); fprintf(stderr, "Calling best_match() for line %d:\n", (int) j);
#endif #endif
m = best_match(input.lines + j, &ws, &we, &es, &ee); m = best_match(input.lines + j, &ws, &we, &es, &ee);
if (m < 0) { if (m < 0) {
@ -934,30 +931,33 @@ int remove_box()
} }
else if (m == 0) { else if (m == 0) {
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "\033[00;33;01mline %2d: no side match\033[00m\n", j); fprintf(stderr, "\033[00;33;01mline %2d: no side match\033[00m\n", (int) j);
#endif #endif
} }
else { else {
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "\033[00;33;01mline %2d: west: %d (\'%c\') to " fprintf(stderr, "\033[00;33;01mline %2d: west: %d (\'%c\') to %d (\'%c\') [len %d]; "
"%d (\'%c\') [len %d]; east: %d (\'%c\') to %d (\'%c\')" "east: %d (\'%c\') to %d (\'%c\') [len %d]\033[00m\n", (int) j,
" [len %d]\033[00m\n", j, (int) (ws ? ws - input.lines[j].text : 0), ws ? ws[0] : '?',
ws? ws-input.lines[j].text:0, ws?ws[0]:'?', (int) (we ? we - input.lines[j].text - 1 : 0), we ? we[-1] : '?',
we? we-input.lines[j].text-1:0, we?we[-1]:'?', (int) (ws && we ? (we - input.lines[j].text - (ws - input.lines[j].text)) : 0),
ws&&we? (we-input.lines[j].text-(ws-input.lines[j].text)):0, (int) (es ? es - input.lines[j].text : 0), es ? es[0] : '?',
es? es-input.lines[j].text:0, es?es[0]:'?', (int) (ee ? ee - input.lines[j].text - 1 : 0), ee ? ee[-1] : '?',
ee? ee-input.lines[j].text-1:0, ee?ee[-1]:'?', (int) (es && ee ? (ee - input.lines[j].text - (es - input.lines[j].text)) : 0));
es&&ee? (ee-input.lines[j].text-(es-input.lines[j].text)):0);
#endif #endif
if (ws && we) { if (ws && we) {
did_something = 1; did_something = 1;
for (p = ws; p < we; ++p) { for (p = ws; p < we; ++p) {
size_t idx = p - input.lines[j].text;
*p = ' '; *p = ' ';
set_char_at(input.lines[j].mbtext, input.lines[j].posmap[idx], char_space);
} }
} }
if (es && ee) { if (es && ee) {
for (p = es; p < ee; ++p) { for (p = es; p < ee; ++p) {
size_t idx = p - input.lines[j].text;
*p = ' '; *p = ' ';
set_char_at(input.lines[j].mbtext, input.lines[j].posmap[idx], char_space);
} }
} }
} }
@ -978,19 +978,23 @@ int remove_box()
} }
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "memmove (\"%s\", \"%s\", %d);\n", fprintf(stderr, "memmove(\"%s\", \"%s\", %d);\n",
input.lines[j].text, input.lines[j].text + c, input.lines[j].text, input.lines[j].text + c, (int) (input.lines[j].len - c + 1));
input.lines[j].len - c + 1);
#endif #endif
memmove(input.lines[j].text, input.lines[j].text + c, memmove(input.lines[j].text, input.lines[j].text + c,
input.lines[j].len - c + 1); /* +1 for zero byte */ input.lines[j].len - c + 1); /* +1 for zero byte */
input.lines[j].len -= c; input.lines[j].len -= c;
/* TODO the next line may kill an escape code to color the next char */
u32_move(input.lines[j].mbtext, input.lines[j].mbtext + input.lines[j].posmap[c],
input.lines[j].num_chars - c + 1); /* +1 for zero byte */
input.lines[j].num_chars -= c;
} }
} }
#ifdef DEBUG #ifdef DEBUG
if (!did_something) if (!did_something) {
fprintf (stderr, fprintf(stderr, "There is nothing to remove (did_something == 0).\n");
"There is nothing to remove (did_something == 0).\n"); }
#endif #endif
/* /*
@ -999,13 +1003,13 @@ int remove_box()
if (opt.killblank) { if (opt.killblank) {
while (empty_line(input.lines + textstart) && textstart < textend) { while (empty_line(input.lines + textstart) && textstart < textend) {
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Killing leading blank line in box body.\n"); fprintf(stderr, "Killing leading blank line in box body.\n");
#endif #endif
++textstart; ++textstart;
} }
while (empty_line(input.lines + textend - 1) && textend > textstart) { while (empty_line(input.lines + textend - 1) && textend > textstart) {
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "Killing trailing blank line in box body.\n"); fprintf(stderr, "Killing trailing blank line in box body.\n");
#endif #endif
--textend; --textend;
} }
@ -1034,19 +1038,13 @@ int remove_box()
} }
} }
memset(input.lines + input.anz_lines, 0, memset(input.lines + input.anz_lines, 0,
(BMAX (textstart - boxstart, 0) + BMAX (boxend - textend, 0)) * (BMAX (textstart - boxstart, (size_t) 0) + BMAX (boxend - textend, (size_t) 0)) * sizeof(line_t));
sizeof(line_t));
#ifdef DEBUG #ifdef DEBUG
#if 0 print_input_lines(" (remove_box) after box removal");
for (j=0; j<input.anz_lines; ++j) { fprintf(stderr, "Number of lines shrunk by %d.\n",
fprintf (stderr, "%3d [%02d] \"%s\"\n", j, input.lines[j].len, (int) (BMAX (textstart - boxstart, (size_t) 0) + BMAX (boxend - textend, (size_t) 0)));
input.lines[j].text);
}
#endif #endif
fprintf (stderr, "Number of lines shrunk by %d.\n",
BMAX (textstart - boxstart, 0) + BMAX (boxend - textend, 0));
#endif
return 0; /* all clear */ return 0; /* all clear */
} }
@ -1061,24 +1059,23 @@ void output_input(const int trim_only)
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/ */
{ {
size_t j;
size_t indent; size_t indent;
char *indentspc;
int ntabs, nspcs; int ntabs, nspcs;
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "output_input() - enter (trim_only=%d)\n", trim_only); fprintf(stderr, "output_input() - enter (trim_only=%d)\n", trim_only);
#endif #endif
for (j = 0; j < input.anz_lines; ++j) { for (size_t j = 0; j < input.anz_lines; ++j) {
if (input.lines[j].text == NULL) { if (input.lines[j].text == NULL) {
continue; continue;
} }
btrim(input.lines[j].text, &(input.lines[j].len)); btrim(input.lines[j].text, &(input.lines[j].len));
btrim32(input.lines[j].mbtext, &(input.lines[j].num_chars));
if (trim_only) { if (trim_only) {
continue; continue;
} }
indentspc = NULL; char *indentspc = NULL;
if (opt.tabexp == 'u') { if (opt.tabexp == 'u') {
indent = strspn(input.lines[j].text, " "); indent = strspn(input.lines[j].text, " ");
ntabs = indent / opt.tabstop; ntabs = indent / opt.tabstop;
@ -1101,7 +1098,7 @@ void output_input(const int trim_only)
indent = 0; indent = 0;
} }
fprintf(opt.outfile, "%s%s%s", indentspc, input.lines[j].text + indent, fprintf(opt.outfile, "%s%s%s", indentspc, u32_strconv_to_locale(advance32(input.lines[j].mbtext, indent)),
(input.final_newline || j < input.anz_lines - 1 ? "\n" : "")); (input.final_newline || j < input.anz_lines - 1 ? "\n" : ""));
BFREE (indentspc); BFREE (indentspc);
} }

View File

@ -30,9 +30,11 @@
#include <string.h> #include <string.h>
#include <strings.h> #include <strings.h>
#include <uniconv.h>
#include <unictype.h> #include <unictype.h>
#include <unistr.h> #include <unistr.h>
#include <unitypes.h> #include <unitypes.h>
#include <uniwidth.h>
#include "shape.h" #include "shape.h"
#include "boxes.h" #include "boxes.h"
@ -487,4 +489,152 @@ char *nspaces(const size_t n)
/**
 * Debugging Code: Display contents of input structure
 *
 * Writes one entry per input line to stderr, showing the character count, the text converted to the locale
 * encoding, the column count, the ASCII text, the tab positions, the number of invisible characters, and the
 * position map. Afterwards, the values of input.maxline, input.indent, and input.final_newline are printed.
 *
 * @param <heading> text appended to the "Input Lines" caption; may be NULL, in which case nothing is appended
 */
void print_input_lines(const char *heading)
{
    fprintf(stderr, "Input Lines%s:\n", heading != NULL ? heading : "");
    fprintf(stderr, " [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
    for (size_t i = 0; i < input.anz_lines; ++i) {
        /* u32_strconv_to_locale() hands back freshly allocated memory (or NULL on failure), so the result must
         * be kept in a variable in order to be checked and released after printing. */
        char *real_text = input.lines[i].mbtext != NULL ? u32_strconv_to_locale(input.lines[i].mbtext) : NULL;
        fprintf(stderr, "%4d [%02d] \"%s\" [%02d] \"%s\"", (int) i,
                (int) input.lines[i].num_chars, real_text != NULL ? real_text : "(null)",
                (int) input.lines[i].len, input.lines[i].text != NULL ? input.lines[i].text : "(null)");
        free(real_text);

        fprintf(stderr, "\tTabs: [");
        if (input.lines[i].tabpos != NULL) {
            for (size_t j = 0; j < input.lines[i].tabpos_len; ++j) {
                fprintf(stderr, "%d", (int) input.lines[i].tabpos[j]);
                if (j + 1 < input.lines[i].tabpos_len) {
                    fprintf(stderr, ", ");      /* separator between entries, but not after the last one */
                }
            }
        }
        fprintf(stderr, "] (%d)", (int) input.lines[i].tabpos_len);
        fprintf(stderr, "\tinvisible=%d\n", (int) input.lines[i].invis);

        fprintf(stderr, " posmap=");
        if (input.lines[i].posmap != NULL) {
            /* one position map entry is printed per column of the line (input.lines[i].len entries) */
            fprintf(stderr, "[");
            for (size_t j = 0; j < input.lines[i].len; j++) {
                fprintf(stderr, "%d%s", (int) input.lines[i].posmap[j], j == (input.lines[i].len - 1) ? "" : ", ");
            }
            fprintf(stderr, "]\n");
        } else {
            fprintf(stderr, "null\n");
        }
    }
    fprintf(stderr, " Longest line: %d columns\n", (int) input.maxline);
    fprintf(stderr, " Indentation: %2d spaces\n", (int) input.indent);
    fprintf(stderr, "Final newline: %s\n", input.final_newline ? "yes" : "no");
}
/**
 * Analyze the multi-byte string in order to determine its metrics:
 *   - number of visible columns it occupies
 *   - number of escape characters (== number of escape sequences)
 *   - the ASCII equivalent of the string
 *   - the number of invisible characters in the string
 *
 * The ASCII equivalent contains each printable ASCII character as is, and one 'x' per column for every other
 * character of positive width. Characters recognized as part of an escape sequence are counted as invisible and
 * do not appear in the ASCII equivalent.
 *
 * @param <s> the multi-byte string to analyze
 * @param <num_esc> pointer to where the number of escape sequences should be stored
 * @param <ascii> pointer to where the ASCII equivalent of the string should be stored. The string is newly
 *      allocated. NULL is stored if memory could not be allocated.
 * @param <posmap> pointer to the position map, which maps each position in <ascii> to a position in <s>. The map
 *      is newly allocated. NULL is stored if <s> is empty or memory could not be allocated.
 * @returns the number of invisible characters in <s>, or 0 (with <num_esc> also set to 0) if memory could not
 *      be allocated
 */
static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii, size_t **posmap)
{
    size_t invis = 0;      /* counts invisible column positions */
    int ansipos = 0;       /* progression of ansi sequence: 0 = outside, 1 = after ESC, 2 = inside CSI */
    *num_esc = 0;          /* counts the number of escape sequences found */

    if (is_empty(s)) {
        (*ascii) = (char *) strdup("");
        (*posmap) = NULL;
        return 0;
    }

    size_t buflen = (size_t) u32_strwidth(s, encoding) + 1;
    size_t map_size = BMAX((size_t) 5, buflen);
    size_t map_idx = 0;
    size_t *map = (size_t *) calloc(map_size, sizeof(size_t));   /* initial size, grown on demand below */
    char *buf = (char *) calloc(buflen, sizeof(char));    /* maybe a little too much, but certainly enough */
    char *p = buf;
    ucs4_t c;
    size_t mb_idx = 0;
    const uint32_t *rest = s;

    if (map == NULL || buf == NULL) {
        goto out_of_memory;
    }

    while ((rest = u32_next(&c, rest))) {
        if (map_idx >= map_size - 4) {
            /* Grow via a temporary so that the old map can still be released should realloc() fail. */
            size_t new_size = map_size * 2 + 1;
            size_t *new_map = (size_t *) realloc(map, new_size * sizeof(size_t));
            if (new_map == NULL) {
                goto out_of_memory;
            }
            map = new_map;
            map_size = new_size;
        }

        if (ansipos == 0 && c == char_esc) {
            /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */
            ansipos++;
            invis++;
            (*num_esc)++;
        } else if (ansipos == 1 && c == '[') {
            /* Found '[' char after ESC. A CSI sequence has started. */
            ansipos++;
            invis++;
        } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
            /* Found a byte designating the end of a two-byte escape sequence */
            invis++;
            ansipos = 0;
        } else if (ansipos == 2) {
            /* Inside CSI sequence - Keep counting bytes as invisible */
            invis++;

            /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */
            if (c >= 0x40 && c <= 0x7e) {
                ansipos = 0;
            }
        } else if (is_ascii_printable(c)) {
            *p = c & 0xff;
            map[map_idx++] = mb_idx;
            ++p;
        } else {
            /* Any other character is represented by one 'x' per column it occupies (if any), with every one
             * of those columns mapping back to the same position in <s>. */
            int cols = uc_width(c, encoding);
            if (cols > 0) {
                memset(p, (int) 'x', cols);
                for (int i = 0; i < cols; i++) {
                    map[map_idx++] = mb_idx;
                }
                p += cols;
            }
        }
        ++mb_idx;
    }
    *p = '\0';

    (*ascii) = buf;
    (*posmap) = map;
    return invis;

out_of_memory:
    /* Report the failure and hand back a consistent "nothing analyzed" result instead of dereferencing NULL. */
    perror("boxes");
    free(map);
    free(buf);
    *num_esc = 0;
    (*ascii) = NULL;
    (*posmap) = NULL;
    return 0;
}
/**
 * Recompute the derived fields of a line from its multi-byte text: the number of invisible characters, the number
 * of columns, the number of characters, the ASCII equivalent, and the position map. The previous ASCII text and
 * position map of the line are freed. If the line turns out to be longer than input.maxline, input.maxline is
 * raised accordingly.
 *
 * @param <line> the line to analyze; its mbtext is read, the other fields mentioned above are overwritten
 */
void analyze_line_ascii(line_t *line)
{
    size_t escape_count = 0;
    char *ascii_text = NULL;
    size_t *position_map = NULL;
    size_t invisible = count_invisible_chars(line->mbtext, &escape_count, &ascii_text, &position_map);

    /* replace the previous ASCII text and position map with the freshly computed ones */
    BFREE(line->text);
    line->text = ascii_text;
    BFREE(line->posmap);
    line->posmap = position_map;

    line->invis = invisible;
    line->num_chars = u32_strlen(line->mbtext);

    /* u32_strwidth() does not count control characters, i.e. ESC characters, so these must be added back
     * after the invisible characters have been subtracted */
    line->len = u32_strwidth(line->mbtext, encoding) - invisible + escape_count;

    if (input.maxline < line->len) {
        input.maxline = line->len;
    }
}
/*EOF*/ /* vim: set sw=4: */ /*EOF*/ /* vim: set sw=4: */

View File

@ -30,7 +30,11 @@
#include "boxes.h" #include "boxes.h"
#define BMAX(a, b) ((a)>(b)? (a):(b)) /* return the larger value */ #define BMAX(a, b) ({ /* return the larger value */ \
__typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a > _b ? _a : _b; \
})
#define BFREE(p) { /* free memory and clear pointer */ \ #define BFREE(p) { /* free memory and clear pointer */ \
if (p) { \ if (p) { \
@ -66,6 +70,10 @@ char *tabbify_indent(const size_t lineno, char *indentspc, const size_t indentsp
char *nspaces(const size_t n); char *nspaces(const size_t n);
void print_input_lines(const char *heading);
void analyze_line_ascii(line_t *line);
#endif #endif