boxes/src/input.c
2021-04-17 15:30:56 +02:00

351 lines
10 KiB
C

/*
* boxes - Command line filter to draw/remove ASCII boxes around text
* Copyright (c) 1999-2021 Thomas Jensen and the boxes contributors
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License, version 2, as published
* by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
/*
* Read and analyze input text.
*/
#include "config.h"
#include <errno.h>
#include <string.h>
#include <unistr.h>
#include <unitypes.h>
#include "boxes.h"
#include "regulex.h"
#include "tools.h"
#include "unicode.h"
#include "input.h"
/**
* Determine if the given line of raw text is ended by a line break.
* @param s the string to check
* @param len length of s in characters
* @returns != 0 if line break found;
* == 0 if line break not found
*/
static int has_linebreak(const uint32_t *s, const int len)
{
int result = 0;
if (s != NULL && len > 0) {
ucs4_t the_last = s[len - 1];
result = u32_cmp(&char_cr, &the_last, 1) == 0 || u32_cmp(&char_newline, &the_last, 1) == 0;
#if defined(DEBUG)
fprintf(stderr, "has_linebreak: (%#010x) %d\n", (int) the_last, result);
#endif
}
return result;
}
/**
* Determine indentation of given lines in spaces. Lines are assumed to be free of trailing whitespace.
* @param lines the lines to examine
* @param lines_size number of lines to examine
* @returns >= 0: indentation in spaces; < 0: error
*/
static int get_indent(const line_t *lines, const size_t lines_size)
{
int res = LINE_MAX_BYTES; /* result */
int nonblank = 0; /* true if one non-blank line found */
if (lines == NULL) {
fprintf(stderr, "%s: internal error\n", PROJECT);
return -1;
}
if (lines_size == 0) {
return 0;
}
for (size_t j = 0; j < lines_size; ++j) {
if (lines[j].len > 0) {
nonblank = 1;
size_t ispc = strspn(lines[j].text, " ");
if ((int) ispc < res) {
res = ispc;
}
}
}
if (nonblank) {
return res; /* success */
} else {
return 0; /* success, but only blank lines */
}
}
int apply_substitutions(input_t *result, const int mode)
{
size_t anz_rules;
reprule_t *rules;
size_t j, k;
if (opt.design == NULL) {
return 1;
}
if (mode == 0) {
anz_rules = opt.design->anz_reprules;
rules = opt.design->reprules;
}
else if (mode == 1) {
anz_rules = opt.design->anz_revrules;
rules = opt.design->revrules;
}
else {
fprintf(stderr, "%s: internal error\n", PROJECT);
return 2;
}
/*
* Compile regular expressions
*/
#ifdef REGEXP_DEBUG
fprintf(stderr, "Compiling %d %s rule patterns\n", (int) anz_rules, mode ? "reversion" : "replacement");
#endif
errno = 0;
opt.design->current_rule = rules;
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
rules[j].prog = compile_pattern(rules[j].search);
if (rules[j].prog == NULL) {
return 5;
}
}
opt.design->current_rule = NULL;
if (errno) {
return 3;
}
/*
* Apply regular expression substitutions to input lines
*/
for (k = 0; k < result->num_lines; ++k) {
opt.design->current_rule = rules;
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
#ifdef REGEXP_DEBUG
fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ",
rules[j].prog, rules[j].repstr, u32_strconv_to_output(result->lines[k].mbtext),
(int) result->lines[k].num_chars, rules[j].mode);
#endif
uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr,
result->lines[k].mbtext, result->lines[k].num_chars, rules[j].mode == 'g');
#ifdef REGEXP_DEBUG
fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_output(newtext) : "NULL");
#endif
if (newtext == NULL) {
return 1;
}
BFREE(result->lines[k].mbtext_org); /* original address allocated for mbtext */
result->lines[k].mbtext = newtext;
result->lines[k].mbtext_org = newtext;
analyze_line_ascii(result, result->lines + k);
#ifdef REGEXP_DEBUG
fprintf (stderr, "result->lines[%d] == {%d, \"%s\"}\n", (int) k,
(int) result->lines[k].num_chars, u32_strconv_to_output(result->lines[k].mbtext));
#endif
}
opt.design->current_rule = NULL;
}
/*
* If text indentation was part of the lines processed, indentation
* may now be different -> recalculate result->indent.
*/
if (opt.design->indentmode == 't') {
int rc;
rc = get_indent(result->lines, result->num_lines);
if (rc >= 0) {
result->indent = (size_t) rc;
} else {
return 4;
}
}
return 0;
}
static void trim_trailing_ws_carefully(uint32_t *mbtemp, size_t *len_chars)
{
if (opt.r) {
/* remove only trailing line breaks, but keep the space */
if (is_char_at(mbtemp, *len_chars - 1, char_newline)) {
set_char_at(mbtemp, *len_chars - 1, char_nul);
--(*len_chars);
}
if (is_char_at(mbtemp, *len_chars - 1, char_cr)) {
set_char_at(mbtemp, *len_chars - 1, char_nul);
--(*len_chars);
}
}
else {
/* remove all trailing whitespace, including unicode whitespace */
btrim32(mbtemp, len_chars);
}
}
input_t *read_all_input()
{
char buf[LINE_MAX_BYTES + 3]; /* static input buffer incl. newline + zero terminator */
size_t input_size = 0; /* number of elements allocated */
input_t *result = (input_t *) calloc(1, sizeof(input_t));
result->indent = LINE_MAX_BYTES;
while (fgets(buf, LINE_MAX_BYTES + 2, opt.infile))
{
if (result->num_lines % 100 == 0) {
input_size += 100;
line_t *tmp = (line_t *) realloc(result->lines, input_size * sizeof(line_t));
if (tmp == NULL) {
perror(PROJECT);
BFREE (result->lines);
return NULL;
}
result->lines = tmp;
}
memset(result->lines + result->num_lines, 0, sizeof(line_t));
uint32_t *mbtemp = u32_strconv_from_input(buf);
size_t len_chars = u32_strlen(mbtemp);
result->final_newline = has_linebreak(mbtemp, len_chars);
trim_trailing_ws_carefully(mbtemp, &len_chars);
/*
* Expand tabs
*/
if (len_chars > 0) {
uint32_t *temp = NULL;
len_chars = expand_tabs_into(mbtemp, opt.tabstop, &temp,
&(result->lines[result->num_lines].tabpos),
&(result->lines[result->num_lines].tabpos_len));
if (len_chars == 0) {
perror(PROJECT);
BFREE (result->lines);
return NULL;
}
result->lines[result->num_lines].mbtext = temp;
BFREE(mbtemp);
temp = NULL;
}
else {
result->lines[result->num_lines].mbtext = mbtemp;
}
result->lines[result->num_lines].mbtext_org = result->lines[result->num_lines].mbtext;
result->lines[result->num_lines].num_chars = len_chars;
++result->num_lines;
}
if (ferror(stdin)) {
perror(PROJECT);
BFREE (result->lines);
return NULL;
}
return result;
}
int analyze_input(input_t *result)
{
result->indent = LINE_MAX_BYTES;
result->maxline = 0;
/*
* Build ASCII equivalent of the multi-byte string, update line stats
*/
for (size_t i = 0; i < result->num_lines; ++i) {
analyze_line_ascii(result, result->lines + i);
}
/*
* Exit if there was no input at all
*/
if (result->lines == NULL || result->lines[0].text == NULL) {
return 0;
}
/*
* Compute indentation
*/
int rc = get_indent(result->lines, result->num_lines);
if (rc >= 0) {
result->indent = (size_t) rc;
} else {
return 1;
}
/*
* Remove indentation, unless we want to preserve it (when removing
* a box or if the user wants to retain it inside the box)
*/
if (opt.design->indentmode != 't' && opt.r == 0) {
for (size_t i = 0; i < result->num_lines; ++i) {
#ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_output(result->lines[i].mbtext), (int) result->lines[i].num_chars);
#endif
if (result->lines[i].num_chars >= result->indent) {
memmove(result->lines[i].text, result->lines[i].text + result->indent,
result->lines[i].len - result->indent + 1);
result->lines[i].len -= result->indent;
result->lines[i].mbtext = advance32(result->lines[i].mbtext, result->indent);
result->lines[i].num_chars -= result->indent;
}
#ifdef DEBUG
fprintf(stderr, "%2d: mbtext = \"%s\" (%d chars)\n", (int) i,
u32_strconv_to_output(result->lines[i].mbtext), (int) result->lines[i].num_chars);
#endif
}
result->maxline -= result->indent;
}
/*
* Apply regular expression substitutions
*/
if (opt.r == 0) {
if (apply_substitutions(result, 0) != 0) {
return 1;
}
}
#ifdef DEBUG
fprintf (stderr, "Effective encoding: %s\n", encoding);
print_input_lines(NULL);
#endif
return 0;
}
/*EOF*/ /* vim: set sw=4: */