Unicode-enable regex processing by introducing PCRE2 #1 #67

This commit is contained in:
Thomas Jensen 2021-02-05 11:01:38 +01:00
parent 6a3d0e715c
commit a579da13a5
No known key found for this signature in database
GPG Key ID: A4ACEE270D0FB7DB
16 changed files with 324 additions and 1610 deletions

View File

@ -48,7 +48,7 @@ infomsg:
replaceinfos: src/boxes.h doc/boxes.1
src/boxes.h: src/boxes.h.in src/regexp/regexp.h Makefile
src/boxes.h: src/boxes.h.in src/regulex.h src/shape.h Makefile
sed -e 's/--BVERSION--/$(BVERSION) $(GIT_STATUS)/; s/--GLOBALCONF--/$(subst /,\/,$(GLOBALCONF))/' src/boxes.h.in > src/boxes.h
doc/boxes.1: doc/boxes.1.in Makefile

View File

@ -1874,14 +1874,11 @@ shapes {
sw ("/*") s ("*") se ("*/")
}
replace "^( *)([^ ])" with "\\1\\2 "
replace "([^ ]) ([^ ])" with "\\1 \\2"
reverse "^( *)([^ ]*) " to "\\1\\2" # \1 to leave indentation untouched
reverse "([^ ]) ([^ ])" to "\\1 \\2"
replace "(.)(?!$)" with "$1 "
reverse "^( *)([^ ]*) " to "$1$2" # $1 to leave indentation untouched # TODO
padding {
left 2
right 1
horiz 2
}
elastic (n,e,s,w)
@ -2253,7 +2250,7 @@ shapes {
elastic (n, s, e, w)
padding {
left 1
left 2
}
END unicornsay

View File

@ -26,9 +26,9 @@ GEN_HDR = parser.h boxes.h
GEN_SRC = parser.c lex.yy.c
GEN_FILES = $(GEN_SRC) $(GEN_HDR)
ORIG_HDRCL = boxes.h.in config.h
ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h unicode.h
ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h unicode.h regulex.h
ORIG_GEN = lexer.l parser.y
ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c unicode.c
ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c unicode.c regulex.c
ORIG_SRC = $(ORIG_GEN) $(ORIG_NORM)
ORIG_FILES = $(ORIG_SRC) $(ORIG_HDR)
OTH_FILES = Makefile
@ -46,8 +46,7 @@ debug: flags_$(BOXES_PLATFORM)
$(MAKE) BOXES_PLATFORM=$(BOXES_PLATFORM) ALL_OBJ="$(ALL_OBJ)" CFLAGS_ADDTL="-g $(CFLAGS_ADDTL)" STRIP=false flags_$(BOXES_PLATFORM) $(BOXES_EXECUTABLE_NAME)
boxes: $(ALL_OBJ)
$(MAKE) -C regexp CC=$(CC) libregexp.a
$(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lunistring -lpcre2-32 -lregexp
$(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lunistring -lpcre2-32
if [ "$(STRIP)" = "true" ] ; then strip $(BOXES_EXECUTABLE_NAME) ; fi
boxes.exe: $(ALL_OBJ)
@ -56,22 +55,22 @@ boxes.exe: $(ALL_OBJ)
flags_unix:
$(eval CFLAGS := -I. -Iregexp -Wall -W $(CFLAGS_ADDTL))
$(eval LDFLAGS := -Lregexp $(LDFLAGS_ADDTL))
$(eval CFLAGS := -I. -Wall -W $(CFLAGS_ADDTL))
$(eval LDFLAGS := $(LDFLAGS_ADDTL))
$(eval BOXES_EXECUTABLE_NAME := boxes)
$(eval ALL_OBJ := $(GEN_SRC:.c=.o) $(ORIG_NORM:.c=.o))
flags_win32:
$(eval CFLAGS := -Os -s -m32 -I. -Iregexp -Wall -W $(CFLAGS_ADDTL))
$(eval CFLAGS := -Os -s -m32 -I. -Wall -W $(CFLAGS_ADDTL))
$(eval LDFLAGS := -s -m32)
$(eval BOXES_EXECUTABLE_NAME := boxes.exe)
$(eval ALL_OBJ := $(GEN_SRC:.c=.o) $(ORIG_NORM:.c=.o) regexp/regexp.o regexp/regsub.o misc/getopt.o)
$(eval ALL_OBJ := $(GEN_SRC:.c=.o) $(ORIG_NORM:.c=.o) misc/getopt.o)
flags_:
@echo Please call make from the top level directory.
exit 1
parser.c parser.h: parser.y boxes.h regexp/regexp.h
parser.c parser.h: parser.y boxes.h
$(YACC) -o parser.c -d parser.y
lex.yy.c: lexer.l boxes.h
@ -81,16 +80,15 @@ lex.yy.c: lexer.l boxes.h
rm lexer.tmp.c
boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h unicode.h generate.h remove.h config.h
boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h
tools.o: tools.c tools.h boxes.h shape.h config.h
unicode.o: unicode.c unicode.h config.h
shape.o: shape.c shape.h boxes.h config.h tools.h
generate.o: generate.c generate.h boxes.h shape.h tools.h config.h
remove.o: remove.c remove.h boxes.h shape.h tools.h config.h
regulex.o: regulex.c regulex.h config.h
lex.yy.o: lex.yy.c parser.h tools.h shape.h lexer.h config.h
parser.o: parser.c parser.h tools.h shape.h lexer.h config.h
regexp/regexp.o: regexp/regexp.c
regexp/regsub.o: regexp/regsub.c
misc/getopt.o: misc/getopt.c
@ -102,7 +100,6 @@ clean: flags_unix
rm -f $(ALL_OBJ)
rm -f $(GEN_FILES)
rm -f core boxes boxes.exe
$(MAKE) -C regexp clean
#EOF

View File

@ -41,8 +41,8 @@
#include "shape.h"
#include "boxes.h"
#include "tools.h"
#include "regexp.h"
#include "generate.h"
#include "regulex.h"
#include "remove.h"
#include "unicode.h"
@ -1218,6 +1218,73 @@ static int get_indent(const line_t *lines, const size_t lines_size)
/**
 * Analyze the UTF-32 string <s> with respect to ANSI escape sequences and produce its ASCII stand-in:
 *   - number of escape characters (== number of escape sequences started)
 *   - the ASCII equivalent of the string (escape sequences dropped, every non-ASCII character that
 *     occupies columns replaced with one 'x' per column)
 *   - the number of invisible characters in the string (characters which are part of an escape sequence)
 *
 * @param <s> the multi-byte string to analyze
 * @param <num_esc> pointer to where the number of escape sequences should be stored
 * @param <ascii> pointer to where the ASCII equivalent of the string should be stored. The string is newly
 *                allocated and must be freed by the caller. It is set to NULL if memory could not be allocated.
 * @returns the number of invisible characters in <s>
 */
static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii)
{
    size_t invis = 0;   /* counts invisible column positions */
    int ansipos = 0;    /* progression of ansi sequence: 0 = outside, 1 = ESC seen, 2 = inside CSI sequence */
    *num_esc = 0;       /* counts the number of escape sequences found */

    if (is_empty(s)) {
        (*ascii) = (char *) strdup("");
        return 0;
    }

    /*
     * One byte per visible column is written at most. Characters belonging to escape sequences are counted by
     * u32_strwidth() but never written, so this may be a little too much, but the '+ 1' for the terminating
     * NUL is required: a string without any escape sequences fills all of its columns.
     */
    size_t buflen = (size_t) u32_strwidth(s, encoding) + 1;
    (*ascii) = (char *) calloc(buflen, sizeof(char));
    if (*ascii == NULL) {
        fprintf(stderr, "boxes: out of memory\n");
        return 0;
    }

    char *p = *ascii;
    ucs4_t c;
    const uint32_t *rest = s;

    while ((rest = u32_next(&c, rest))) {
        if (ansipos == 0 && c == char_esc) {
            /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */
            ansipos++;
            invis++;
            (*num_esc)++;
        } else if (ansipos == 1 && c == '[') {
            /* Found '[' char after ESC. A CSI sequence has started. */
            ansipos++;
            invis++;
        } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
            /* Found a byte designating the end of a two-byte escape sequence */
            invis++;
            ansipos = 0;
        } else if (ansipos == 2) {
            /* Inside CSI sequence - Keep counting bytes as invisible */
            invis++;

            /* A char between 0x40 and 0x7e signals the end of a CSI or escape sequence */
            if (c >= 0x40 && c <= 0x7e) {
                ansipos = 0;
            }
        } else if (is_ascii_printable(c)) {
            *p = c & 0xff;
            ++p;
        } else {
            /* Non-ASCII character: stand in with one 'x' per column it occupies */
            int cols = uc_width(c, encoding);
            if (cols > 0) {
                memset(p, (int) 'x', cols);
                p += cols;
            }
        }
    }
    *p = '\0';
    return invis;
}
static int apply_substitutions(const int mode)
/*
* Apply regular expression substitutions to input text.
@ -1236,8 +1303,6 @@ static int apply_substitutions(const int mode)
size_t anz_rules;
reprule_t *rules;
size_t j, k;
char buf[LINE_MAX_BYTES * 2];
size_t buf_len; /* length of string in buf */
if (opt.design == NULL) {
return 1;
@ -1262,7 +1327,10 @@ static int apply_substitutions(const int mode)
errno = 0;
opt.design->current_rule = rules;
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
rules[j].prog = regcomp(rules[j].search);
rules[j].prog = compile_pattern(rules[j].search);
if (rules[j].prog == NULL) {
return 5;
}
}
opt.design->current_rule = NULL;
if (errno) {
@ -1276,37 +1344,37 @@ static int apply_substitutions(const int mode)
opt.design->current_rule = rules;
for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) {
#ifdef REGEXP_DEBUG
fprintf (stderr, "myregsub (0x%p, \"%s\", %d, \"%s\", buf, %d, \'%c\') == ",
rules[j].prog, input.lines[k].text,
input.lines[k].len, rules[j].repstr, LINE_MAX_BYTES*2,
rules[j].mode);
fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ",
rules[j].prog, rules[j].repstr, u32_strconv_to_locale(input.lines[k].mbtext),
(int) input.lines[k].num_chars, rules[j].mode);
#endif
errno = 0;
buf_len = myregsub(rules[j].prog, input.lines[k].text,
input.lines[k].len, rules[j].repstr, buf, LINE_MAX_BYTES * 2,
rules[j].mode);
uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr,
input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g');
#ifdef REGEXP_DEBUG
fprintf (stderr, "%d\n", buf_len);
fprintf (stderr, "\"%s\"\n", newtext ? u32_strconv_to_locale(newtext) : "NULL");
#endif
if (errno) {
if (newtext == NULL) {
return 1;
}
BFREE (input.lines[k].text);
input.lines[k].text = (char *) strdup(buf);
if (input.lines[k].text == NULL) {
perror(PROJECT);
return 1;
}
input.lines[k].len = buf_len;
BFREE(input.lines[k].mbtext_org); /* original address allocated for mbtext */
input.lines[k].mbtext = newtext;
input.lines[k].mbtext_org = newtext;
size_t num_esc = 0;
char *ascii; // TODO HERE extract into function analyze/asciify(line_t) ?
size_t invis = count_invisible_chars(input.lines[k].mbtext, &num_esc, &ascii);
input.lines[k].len = u32_strwidth(input.lines[k].mbtext, encoding) - invis + num_esc;
input.lines[k].num_chars = u32_strlen(input.lines[k].mbtext);
BFREE(input.lines[k].text);
input.lines[k].text = ascii;
if (input.lines[k].len > input.maxline) {
input.maxline = input.lines[k].len;
}
#ifdef REGEXP_DEBUG
fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", k, input.lines[k].len, input.lines[k].text);
fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k,
(int) input.lines[k].num_chars, u32_strconv_to_locale(input.lines[k].mbtext));
#endif
}
opt.design->current_rule = NULL;
@ -1357,60 +1425,6 @@ static int has_linebreak(const uint32_t *s, const int len)
/**
 * Walk the UTF-32 string <s>, count the characters that belong to ANSI escape sequences, and build an
 * ASCII stand-in of the remaining text in a freshly allocated buffer of <buflen> bytes.
 *
 * @param <s> the multi-byte string to analyze
 * @param <buflen> size in bytes of the buffer to allocate for the ASCII stand-in
 * @param <num_esc> receives the number of ESC characters which started an escape sequence
 * @param <ascii> receives the newly allocated ASCII stand-in
 * @returns the number of characters in <s> which are part of an escape sequence
 */
static size_t count_invisible_chars(const uint32_t *s, const size_t buflen, size_t *num_esc, char **ascii)
{
    size_t hidden = 0;          /* characters swallowed by escape sequences */
    int state = 0;              /* 0 = plain text, 1 = ESC seen, 2 = inside a CSI sequence */
    char *out;                  /* write position in the ASCII stand-in */
    const uint32_t *cursor;     /* read position in <s> */
    ucs4_t c;

    *num_esc = 0;

    if (is_empty(s)) {
        (*ascii) = (char *) strdup("");
        return 0;
    }

    (*ascii) = (char *) calloc(buflen, sizeof(char));
    out = *ascii;

    for (cursor = u32_next(&c, s); cursor != NULL; cursor = u32_next(&c, cursor)) {
        int is_text = 0;        /* set when c is not consumed by an escape sequence */

        switch (state) {
            case 0:
                if (c == char_esc) {
                    /* ESC opens a sequence; it is invisible itself */
                    state = 1;
                    ++hidden;
                    ++(*num_esc);
                } else {
                    is_text = 1;
                }
                break;

            case 1:
                if (c == '[') {
                    /* ESC [ starts a CSI sequence */
                    state = 2;
                    ++hidden;
                } else if (c >= 0x40 && c <= 0x5f) {
                    /* final byte of a two-byte escape sequence */
                    ++hidden;
                    state = 0;
                } else {
                    is_text = 1;
                }
                break;

            case 2:
                /* everything inside a CSI sequence is invisible; 0x40..0x7e terminates it */
                ++hidden;
                if (c >= 0x40 && c <= 0x7e) {
                    state = 0;
                }
                break;

            default:
                is_text = 1;
                break;
        }

        if (!is_text) {
            continue;
        }

        if (is_ascii_printable(c)) {
            *out++ = c & 0xff;
        } else {
            int cols = uc_width(c, encoding);
            if (cols > 0) {
                memset(out, (int) 'x', cols);
                out += cols;
            }
        }
    }

    *out = '\0';
    return hidden;
}
static int read_all_input(const int use_stdin)
/*
* Read entire input (possibly from stdin) and store it in 'input' array.
@ -1444,7 +1458,7 @@ static int read_all_input(const int use_stdin)
* Start reading
*/
while (fgets(buf, LINE_MAX_BYTES + 1, opt.infile)) {
if (input_size % 100 == 0) {
if (input.anz_lines % 100 == 0) {
input_size += 100;
line_t *tmp = (line_t *) realloc(input.lines, input_size * sizeof(line_t));
if (tmp == NULL) {
@ -1483,18 +1497,20 @@ static int read_all_input(const int use_stdin)
return 1;
}
input.lines[input.anz_lines].mbtext = temp;
BFREE(mbtemp);
temp = NULL;
}
else {
input.lines[input.anz_lines].mbtext = mbtemp;
}
input.lines[input.anz_lines].mbtext_org = input.lines[input.anz_lines].mbtext;
input.lines[input.anz_lines].num_chars = len_chars;
/*
* Find ANSI CSI/ESC sequences
*/
size_t num_esc = 0;
size_t invis = count_invisible_chars(input.lines[input.anz_lines].mbtext, strlen(buf), &num_esc,
size_t invis = count_invisible_chars(input.lines[input.anz_lines].mbtext, &num_esc,
&(input.lines[input.anz_lines].text));
input.lines[input.anz_lines].invis = invis;
/* u32_strwidth() does not count control characters, i.e. ESC characters, for which we must correct */
@ -1526,8 +1542,8 @@ static int read_all_input(const int use_stdin)
/* recalculate input statistics for redrawing the mended box */
for (i = 0; i < input.anz_lines; ++i) {
size_t num_esc = 0;
char *dummy;
size_t invis = count_invisible_chars(input.lines[i].mbtext, strlen(input.lines[i].text), &num_esc, &dummy);
char *dummy; // TODO extract into function
size_t invis = count_invisible_chars(input.lines[i].mbtext, &num_esc, &dummy);
BFREE(dummy);
input.lines[i].len = u32_strwidth(input.lines[i].mbtext, encoding) - invis + num_esc;
input.lines[i].num_chars = u32_strlen(input.lines[i].mbtext);
@ -1576,7 +1592,7 @@ static int read_all_input(const int use_stdin)
* Apply regular expression substitutions
*/
if (opt.r == 0) {
if (apply_substitutions(0) != 0) { // TODO
if (apply_substitutions(0) != 0) {
return 1;
}
}

View File

@ -25,13 +25,14 @@
#ifndef BOXES_H
#define BOXES_H
/* #define DEBUG */
/* #define REGEXP_DEBUG */
/* #define PARSER_DEBUG */
/* #define LEXER_DEBUG */
/* #define DEBUG 1 */
#define REGEXP_DEBUG 1
/* #define PARSER_DEBUG 1 */
/* #define LEXER_DEBUG 1 */
#include <unitypes.h>
#include "regexp/regexp.h"
#include "regulex.h"
#include "shape.h"
@ -80,11 +81,11 @@
typedef struct {
char *search;
char *repstr;
regexp *prog; /* compiled search pattern */
int line; /* line of definition in config file */
char mode; /* 'g' or 'o' */
char *search;
char *repstr;
pcre2_code *prog; /* compiled search pattern */
int line; /* line of definition in config file */
char mode; /* 'g' or 'o' */
} reprule_t;
@ -147,11 +148,12 @@ extern opt_t opt;
typedef struct {
size_t len; /* length of visible text in columns (visible character positions in a text terminal), which is the same as the length of the 'text' field */
char *text; /* ASCII line content, tabs expanded, multi-byte chars replaced with one or more 'x' */
char *text; /* ASCII line content, tabs expanded, ansi escapes removed, multi-byte chars replaced with one or more 'x' */
size_t invis; /* number of invisble columns/characters (part of an ansi sequence) */
uint32_t *mbtext; /* multi-byte (original) line content, tabs expanded. We use UTF-32 in order to enable pointer arithmetic. */
size_t num_chars; /* total number of characters in mbtext, visible + invisible */
uint32_t *mbtext_org; /* mbtext as originally allocated, so that we can free it again */
size_t *tabpos; /* tab positions in expanded work strings, or NULL if not needed */
size_t tabpos_len; /* number of tabs in a line */

View File

@ -1,49 +0,0 @@
#
# boxes - Command line filter to draw/remove ASCII boxes around text
# Copyright (C) 1999 Thomas Jensen and the boxes contributors
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License, version 2, as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#____________________________________________________________________________
#============================================================================
CFLAGS = -O -I. $(CFLAGS_ADDTL)
ALL_CL = regexp/regexp.c regexp/regsub.c
C_SRC = $(notdir $(ALL_CL))
ALLFILES = Makefile $(C_SRC) regexp.h regmagic.h
ALLOBJ = $(C_SRC:.c=.o)
.PHONY: clean build debug
build: libregexp.a
debug: libregexp.a
libregexp.a: $(ALLOBJ)
ar cr libregexp.a $(ALLOBJ)
regexp.o: regexp.c regmagic.h regexp.h ../config.h
regsub.o: regsub.c regmagic.h regexp.h ../config.h
.c.o:
$(CC) $(CFLAGS) -c $<
clean:
rm -f $(ALLOBJ) libregexp.a core
#EOF

File diff suppressed because it is too large Load Diff

View File

@ -1,30 +0,0 @@
/*
* Definitions etc. for regexp(3) routines.
*
* Caveat: this is V8 regexp(3) [actually, a reimplementation thereof],
* not the System V one.
*/
#ifndef REGEXP_H
#define REGEXP_H
#define NSUBEXP 10
typedef struct regexp {
char *startp[NSUBEXP];
char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
char *regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
extern regexp *regcomp();
extern int regexec();
/* extern size_t regsub(); */
extern size_t myregsub();
/* extern void regerror(); */
#endif /* REGEXP_H */

View File

@ -1,5 +0,0 @@
/*
* The first byte of the regexp internal "program" is actually this magic
* number; the start node begins in the second byte.
*/
#define MAGIC 0234

View File

@ -1,187 +0,0 @@
/*
* File: regsub.c
* Date created: Copyright (c) 1986 by University of Toronto.
* Author: Henry Spencer.
* Extensions and modifications by Thomas Jensen
* Language: K&R C (traditional)
* Purpose: Perform substitutions after a regexp match
* License: - Not derived from licensed software.
* - Permission is granted to anyone to use this
* software for any purpose on any computer system,
* and to redistribute it freely, subject to the
* following restrictions:
* 1. The author is not responsible for the
* consequences of use of this software, no matter
* how awful, even if they arise from defects in it.
* 2. The origin of this software must not be
* misrepresented, either by explicit claim or by
* omission.
* 3. Altered versions must be plainly marked as such,
* and must not be misrepresented as being the
* original software.
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
#include <stdio.h>
#include <string.h>
#include "regexp.h"
#include "regmagic.h"
#ifndef CHARBITS
#define UCHARAT(p) ((int)*(unsigned char *)(p))
#else
#define UCHARAT(p) ((int)*(p)&CHARBITS)
#endif
/*
- regsub - perform substitutions after a regexp match
*
* Expands the replacement template <source> into <dest>, using the match
* positions stored in prog->startp[] / prog->endp[]:
*   '&'            is replaced with the text of sub-expression 0
*   '\' + digit    is replaced with the text of that numbered sub-expression
*   '\\' and '\&'  yield a literal '\' or '&'
*   anything else  is copied verbatim
* Output is limited to <dest_size> bytes including the terminating NUL. When
* the limit is reached, the result is cut off, NUL-terminated, and
* dest_size-1 is returned. Returns 0 after reporting an error via regerror()
* if an argument is NULL or the program does not start with MAGIC.
*/
size_t /* RETURNS length of dest str */
regsub (prog, source, dest, dest_size)
regexp *prog;
char *source;
char *dest;
size_t dest_size; /* size of destination buffer */
{
register char *src;
register char *dst;
register char c;
register int no;
register int len;
size_t fill; /* current number of chars in dest */
if (prog == NULL || source == NULL || dest == NULL) {
regerror("NULL parm to regsub");
return 0;
}
if (UCHARAT(prog->program) != MAGIC) {
regerror("damaged regexp fed to regsub");
return 0;
}
src = source;
dst = dest;
fill = 0;
while ((c = *src++) != '\0') {
/* Determine which sub-expression (if any) the current template char refers to */
if (c == '&')
no = 0;
else if (c == '\\' && '0' <= *src && *src <= '9')
no = *src++ - '0';
else
no = -1;
if (no < 0) { /* Ordinary character. */
/* An escaped backslash or ampersand is emitted as the escaped char itself */
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
++fill;
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL) {
len = prog->endp[no] - prog->startp[no];
/* NOTE(review): len is int, dest_size-fill is size_t, so this comparison is done unsigned */
if (len < dest_size-fill) {
(void) strncpy(dst, prog->startp[no], len);
dst += len;
fill += len;
if (len != 0 && *(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return fill;
}
}
else {
/* Sub-expression does not fit: copy what fits, terminate, and give up */
(void) strncpy (dst, prog->startp[no], dest_size-fill);
dest[dest_size-1] = '\0';
return dest_size-1;
}
}
/* Buffer exhausted: overwrite the last byte with the terminator and stop */
if (fill >= dest_size) {
dest[dest_size-1] = '\0';
return dest_size-1;
}
}
*dst++ = '\0';
return fill;
}
/*
* myregsub - apply a search/replace rule to one line of text
*
* Runs regexec() on <orig> and writes the line with the matched part(s)
* replaced by the expansion of <repstr> (see regsub()) into <dest>. With
* mode 'g' the search is repeated behind each match; with any other mode
* only the first match is replaced. A match of zero length ends the loop.
* Text before, between and after the matches is copied unchanged. Output is
* limited to <dest_size> bytes including the terminating NUL; when the limit
* is reached the result is cut off and dest_size-1 is returned.
*/
size_t /* RETURNS length of str in destination buffer */
myregsub (prog, orig, orig_len, repstr, dest, dest_size, mode)
regexp *prog; /* pointers for matched regexp to original text */
char *orig; /* original input line */
size_t orig_len; /* length of original input line */
char *repstr; /* source buffer for replaced parts */
char *dest; /* destination buffer */
size_t dest_size; /* size of destination buffer */
char mode; /* 'g' or 'o' */
{
size_t fill; /* current number of chars in dest */
char *sp, *dp; /* source rover, destination rover */
int rc; /* received return codes */
size_t rest_size; /* remaining space in dest */
size_t partlen; /* temp length of a piece handled */
fill = 0;
sp = orig;
dp = dest;
rest_size = dest_size;
do {
/* regexec() returning 0 is treated as "nothing (more) to replace" */
rc = regexec (prog, sp);
if (!rc) break;
/* Copy the unmatched text between the current position and the start of the match */
partlen = prog->startp[0] - sp;
if (partlen < rest_size) {
strncpy (dp, sp, partlen);
fill += partlen;
sp = prog->startp[0];
dp += partlen;
rest_size -= partlen;
}
else {
strncpy (dp, sp, rest_size);
dest[dest_size-1] = '\0';
return dest_size - 1;
}
/* fprintf (stderr, "regsub (%p, \"%s\", \"%s\", %d);\n", */
/* prog, repstr, dp, rest_size); */
/* Append the expanded replacement, then continue reading behind the match */
fill += regsub (prog, repstr, dp, rest_size);
dp = dest + fill;
sp = prog->endp[0];
rest_size = dest_size - fill;
if (fill >= dest_size) {
dest[dest_size-1] = '\0';
return dest_size - 1;
}
/* fprintf (stderr, "dest = \"%s\";\n", dest); */
if (prog->startp[0] == prog->endp[0])
break; /* match "^" or "$" only once */
} while (mode == 'g');
/* Copy whatever remains of the original line behind the last match */
partlen = orig + orig_len - sp;
if (partlen < rest_size) {
strncpy (dp, sp, partlen);
fill += partlen;
dp[partlen] = '\0';
}
else {
strncpy (dp, sp, rest_size);
dest[dest_size-1] = '\0';
fill = dest_size - 1;
}
return fill;
}
/*EOF*/ /* vim: set sw=4: */

114
src/regulex.c Normal file
View File

@ -0,0 +1,114 @@
/*
* boxes - Command line filter to draw/remove ASCII boxes around text
* Copyright (C) 1999 Thomas Jensen and the boxes contributors
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License, version 2, as published
* by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
/*
* Convenience functions for PCRE2 regular expression processing
*/
#include "config.h"
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <uniconv.h>
#include "tools.h"
#include "regulex.h"
/*
 * Compile the given pattern into a PCRE2 regular expression.
 *
 * The pattern is converted from the locale encoding to UTF-32 and compiled with default options.
 * If conversion or compilation fails, a message is printed on stderr.
 *
 * @param <pattern> the search pattern in the locale encoding
 * @return the compiled pattern, or NULL on error
 */
pcre2_code *compile_pattern(char *pattern)
{
    int errornumber;
    PCRE2_SIZE erroroffset;

    if (pattern == NULL) {
        fprintf(stderr, "Regular expression pattern is missing\n");
        return NULL;
    }

    PCRE2_SPTR pattern32 = u32_strconv_from_locale(pattern);
    if (pattern32 == NULL) {
        fprintf(stderr, "Failed to convert regular expression pattern \"%s\" to UTF-32\n", pattern);
        return NULL;
    }

    pcre2_code *re = pcre2_compile(
            pattern32,               /* the pattern */
            PCRE2_ZERO_TERMINATED,   /* indicates pattern is zero-terminated */
            0,                       /* default options */
            &errornumber,
            &erroroffset,
            NULL);                   /* use default compile context */

    /* The compiled code does not refer to the pattern string, so the converted copy can go now */
    free((void *) pattern32);

    if (re == NULL) {
        PCRE2_UCHAR buffer[256] = {0};
        /* the buffer length is given in code units, not in bytes */
        pcre2_get_error_message(errornumber, buffer, sizeof(buffer) / sizeof(buffer[0]));
        char *msg = u32_strconv_to_locale(buffer);
        fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n",
                pattern, (int) erroroffset, msg != NULL ? msg : "(no error message available)");
        free(msg);
    }
    return re;
}
/*
 * Perform a regex replacement on the given string.
 *
 * The output buffer starts at twice the input length and is enlarged once to the size reported by PCRE2
 * if the result does not fit. On any error a message is printed on stderr.
 *
 * @param <search> the compiled pattern to search for
 * @param <replace> the replacement string in the locale encoding
 * @param <input> the string to which the replacements shall be applied
 * @param <input_len> the length of <input> in characters, not bytes
 * @param <global> flag indicating whether all occurrences shall be replaced (true) or just the first (false)
 * @return a newly allocated, zero-terminated copy of <input> with the replacements applied (to be freed by
 *      the caller), or NULL on error
 */
uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global)
{
    if (replace == NULL) {
        fprintf(stderr, "Error substituting: replacement string is missing\n");
        return NULL;
    }

    PCRE2_SPTR replacement = u32_strconv_from_locale(replace);
    if (replacement == NULL) {
        fprintf(stderr, "Failed to convert replacement string \"%s\" to UTF-32\n", replace);
        return NULL;
    }

    uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED
            | (global ? PCRE2_SUBSTITUTE_GLOBAL : 0);

    /* estimated length of output buffer in characters, fine if too small */
    PCRE2_SIZE bufsize = (input_len == 0) ? 16 : input_len * 2;
    uint32_t *output = (uint32_t *) malloc(sizeof(uint32_t) * bufsize);
    if (output == NULL) {
        fprintf(stderr, "out of memory\n");
        BFREE(replacement);
        return NULL;
    }

    int pcre2_rc;
    for (;;) {
        PCRE2_SIZE outlen = bufsize;   /* in: buffer size, out: result length or required size */
        pcre2_rc = pcre2_substitute(search, (PCRE2_SPTR) input, input_len,
                0,        /* start offset */
                options,
                NULL,     /* ptr to a match data block */
                NULL,     /* match context */
                replacement, PCRE2_ZERO_TERMINATED,
                output, &outlen);

        /* Stop unless PCRE2 asks for a larger buffer than the one we already have */
        if (pcre2_rc != PCRE2_ERROR_NOMEMORY || bufsize >= outlen) {
            break;
        }

        #ifdef REGEXP_DEBUG
            fprintf(stderr, "Reallocating output buffer from %lu to %lu UTF-32 chars\n",
                    (unsigned long) bufsize, (unsigned long) outlen);
        #endif
        uint32_t *grown = (uint32_t *) realloc(output, sizeof(uint32_t) * outlen);
        if (grown == NULL) {
            /* realloc() leaves the old block untouched on failure, so release it ourselves */
            fprintf(stderr, "out of memory\n");
            BFREE(output);
            BFREE(replacement);
            return NULL;
        }
        output = grown;
        bufsize = outlen;
    }
    BFREE(replacement);

    if (pcre2_rc < 0) {
        PCRE2_UCHAR buffer[256] = {0};
        /* the buffer length is given in code units, not in bytes */
        pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer) / sizeof(buffer[0]));
        /* buffer will normally contain "invalid replacement string" */
        char *msg = u32_strconv_to_locale(buffer);
        fprintf(stderr, "Error substituting \"%s\": %s\n", replace,
                msg != NULL ? msg : "(no error message available)");
        free(msg);
        BFREE(output);
        return NULL;
    }
    return output;
}
/*EOF*/ /* vim: set sw=4: */

67
src/regulex.h Normal file
View File

@ -0,0 +1,67 @@
/*
* boxes - Command line filter to draw/remove ASCII boxes around text
* Copyright (C) 1999 Thomas Jensen and the boxes contributors
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License, version 2, as published
* by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
/*
* Convenience functions for PCRE2 regular expression processing
*/
#ifndef REGULEX_H
#define REGULEX_H
/* Building under Windows: If you want to statically link this program against a non-dll .a file, you must define
* PCRE2_STATIC before including pcre2.h. */
#ifdef __MINGW32__
#define PCRE2_STATIC
#endif
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h. For a program that uses only one code unit
* width, setting it to 8, 16, or 32 makes it possible to use generic function names such as pcre2_compile(). Note that
* just changing 8 to 16 (for example) is not sufficient to convert this program to process 16-bit characters. Even in
* a fully 16-bit environment, where string-handling functions such as strcmp() and printf() work with 16-bit
* characters, the code for handling the table of named substrings will still need to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 32
#include <pcre2.h>
#include <unitypes.h>
/*
* Compile the given pattern into a PCRE2 regular expression.
*/
pcre2_code *compile_pattern(char *pattern);
/*
* Perform a regex replacement on the given string.
*
* @param <search> the compiled pattern to search for
* @param <replace> the replacement string
* @param <input> the string to which the replacements shall be applied
* @param <input_len> the length of <input> in characters, not bytes
* @param <global> flag indicating whether all occurrences shall be replaced (true) or just the first (false)
* @return a new string which is a copy of <input> with the replacements applied, or NULL on error
*/
uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global);
#endif
/*EOF*/ /* vim: set cindent sw=4: */

View File

@ -50,7 +50,7 @@ typedef struct {
char **chars;
size_t height;
size_t width;
int elastic; /* elastic is used only in orginial definition */
int elastic; /* elastic is used only in original definition */
} sentry_t;
#define SENTRY_INITIALIZER (sentry_t) {NULL, 0, 0, 0}

View File

@ -27,14 +27,16 @@
#include <unitypes.h>
#include "boxes.h"
#define BMAX(a, b) ((a)>(b)? (a):(b)) /* return the larger value */
#define BFREE(p) { /* free memory and clear pointer */ \
if (p) { \
free (p); \
(p) = NULL; \
} \
if (p) { \
free((void *) p); \
(p) = NULL; \
} \
}

View File

@ -28,7 +28,6 @@
#include <unictype.h>
#include <unistr.h>
#include <unitypes.h>
#include "unicode.h"

View File

@ -25,6 +25,9 @@
#ifndef UNICODE_H
#define UNICODE_H
#include <unitypes.h>
extern const char *encoding; /* the character encoding that we use */
extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */