diff --git a/Makefile b/Makefile index df5f8f7..82991d5 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,7 @@ infomsg: replaceinfos: src/boxes.h doc/boxes.1 -src/boxes.h: src/boxes.h.in src/regexp/regexp.h Makefile +src/boxes.h: src/boxes.h.in src/regulex.h src/shape.h Makefile sed -e 's/--BVERSION--/$(BVERSION) $(GIT_STATUS)/; s/--GLOBALCONF--/$(subst /,\/,$(GLOBALCONF))/' src/boxes.h.in > src/boxes.h doc/boxes.1: doc/boxes.1.in Makefile diff --git a/boxes-config b/boxes-config index f5a5dc0..2e2f7e3 100644 --- a/boxes-config +++ b/boxes-config @@ -1874,14 +1874,11 @@ shapes { sw ("/*") s ("*") se ("*/") } -replace "^( *)([^ ])" with "\\1\\2 " -replace "([^ ]) ([^ ])" with "\\1 \\2" -reverse "^( *)([^ ]*) " to "\\1\\2" # \1 to leave indentation untouched -reverse "([^ ]) ([^ ])" to "\\1 \\2" +replace "(.)(?!$)" with "$1 " +reverse "^( *)([^ ]*) " to "$1$2" # $1 to leave indentation untouched # TODO padding { - left 2 - right 1 + horiz 2 } elastic (n,e,s,w) @@ -2253,7 +2250,7 @@ shapes { elastic (n, s, e, w) padding { - left 1 + left 2 } END unicornsay @@ -2549,7 +2546,7 @@ revision "1.0" tags "artwork, box" sample - /\ /\ /\ /\ /\ + /\ /\ /\ /\ /\ |__|__|__|__|__| | | | | | | | | fence | | @@ -2599,7 +2596,7 @@ revision "1.0" tags "artwork, box" sample - /\ !!!!!!!!!!! /\ + /\ !!!!!!!!!!! /\ |! | |! | |! | important |! | |__| |__| @@ -2642,7 +2639,7 @@ revision "1.0" tags "artwork, box" sample - /\ /\ !!!!!!!!!!!! /\ /\ + /\ /\ !!!!!!!!!!!! /\ /\ |! ||! |!!!!!!!!!!!!|! ||! | |! ||! | |! ||! | |! ||! | important2 |! ||! | @@ -2689,7 +2686,7 @@ revision "1.0" tags "artwork, box" sample - /\ /\ /\ !!!!!!!!!!!! /\ /\ /\ + /\ /\ /\ !!!!!!!!!!!! /\ /\ /\ |! ||! ||! |!!!!!!!!!!!!|! ||! ||! | |! ||! ||! |!!!!!!!!!!!!|! ||! ||! | |! ||! ||! | |! ||! ||! 
| @@ -2861,7 +2858,7 @@ shapes { sw ("| |", "|__", " ", " ", " ", " ", " ", " ", " ") ssw ("_", "_", " ", " ", " ", " ", " ", " ", " ") - + s ("__________________________________", "__________________________________", " )__________|__|__________( ", diff --git a/src/Makefile b/src/Makefile index 3d580e8..cbc385b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -26,9 +26,9 @@ GEN_HDR = parser.h boxes.h GEN_SRC = parser.c lex.yy.c GEN_FILES = $(GEN_SRC) $(GEN_HDR) ORIG_HDRCL = boxes.h.in config.h -ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h unicode.h +ORIG_HDR = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h unicode.h regulex.h ORIG_GEN = lexer.l parser.y -ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c unicode.c +ORIG_NORM = boxes.c tools.c shape.c generate.c remove.c unicode.c regulex.c ORIG_SRC = $(ORIG_GEN) $(ORIG_NORM) ORIG_FILES = $(ORIG_SRC) $(ORIG_HDR) OTH_FILES = Makefile @@ -46,8 +46,7 @@ debug: flags_$(BOXES_PLATFORM) $(MAKE) BOXES_PLATFORM=$(BOXES_PLATFORM) ALL_OBJ="$(ALL_OBJ)" CFLAGS_ADDTL="-g $(CFLAGS_ADDTL)" STRIP=false flags_$(BOXES_PLATFORM) $(BOXES_EXECUTABLE_NAME) boxes: $(ALL_OBJ) - $(MAKE) -C regexp CC=$(CC) libregexp.a - $(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lunistring -lpcre2-32 -lregexp + $(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lunistring -lpcre2-32 if [ "$(STRIP)" = "true" ] ; then strip $(BOXES_EXECUTABLE_NAME) ; fi boxes.exe: $(ALL_OBJ) @@ -56,22 +55,22 @@ boxes.exe: $(ALL_OBJ) flags_unix: - $(eval CFLAGS := -I. -Iregexp -Wall -W $(CFLAGS_ADDTL)) - $(eval LDFLAGS := -Lregexp $(LDFLAGS_ADDTL)) + $(eval CFLAGS := -I. -Wall -W $(CFLAGS_ADDTL)) + $(eval LDFLAGS := $(LDFLAGS_ADDTL)) $(eval BOXES_EXECUTABLE_NAME := boxes) $(eval ALL_OBJ := $(GEN_SRC:.c=.o) $(ORIG_NORM:.c=.o)) flags_win32: - $(eval CFLAGS := -Os -s -m32 -I. -Iregexp -Wall -W $(CFLAGS_ADDTL)) + $(eval CFLAGS := -Os -s -m32 -I. 
-Wall -W $(CFLAGS_ADDTL)) $(eval LDFLAGS := -s -m32) $(eval BOXES_EXECUTABLE_NAME := boxes.exe) - $(eval ALL_OBJ := $(GEN_SRC:.c=.o) $(ORIG_NORM:.c=.o) regexp/regexp.o regexp/regsub.o misc/getopt.o) + $(eval ALL_OBJ := $(GEN_SRC:.c=.o) $(ORIG_NORM:.c=.o) misc/getopt.o) flags_: @echo Please call make from the top level directory. exit 1 -parser.c parser.h: parser.y boxes.h regexp/regexp.h +parser.c parser.h: parser.y boxes.h $(YACC) -o parser.c -d parser.y lex.yy.c: lexer.l boxes.h @@ -81,16 +80,15 @@ lex.yy.c: lexer.l boxes.h rm lexer.tmp.c -boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h unicode.h generate.h remove.h config.h +boxes.o: boxes.c boxes.h regulex.h shape.h tools.h unicode.h generate.h remove.h config.h tools.o: tools.c tools.h boxes.h shape.h config.h unicode.o: unicode.c unicode.h config.h shape.o: shape.c shape.h boxes.h config.h tools.h generate.o: generate.c generate.h boxes.h shape.h tools.h config.h remove.o: remove.c remove.h boxes.h shape.h tools.h config.h +regulex.o: regulex.c regulex.h config.h lex.yy.o: lex.yy.c parser.h tools.h shape.h lexer.h config.h parser.o: parser.c parser.h tools.h shape.h lexer.h config.h -regexp/regexp.o: regexp/regexp.c -regexp/regsub.o: regexp/regsub.c misc/getopt.o: misc/getopt.c @@ -102,7 +100,6 @@ clean: flags_unix rm -f $(ALL_OBJ) rm -f $(GEN_FILES) rm -f core boxes boxes.exe - $(MAKE) -C regexp clean #EOF diff --git a/src/boxes.c b/src/boxes.c index 6e85853..3150cd8 100644 --- a/src/boxes.c +++ b/src/boxes.c @@ -41,8 +41,8 @@ #include "shape.h" #include "boxes.h" #include "tools.h" -#include "regexp.h" #include "generate.h" +#include "regulex.h" #include "remove.h" #include "unicode.h" @@ -1218,6 +1218,73 @@ static int get_indent(const line_t *lines, const size_t lines_size) +/** + * Analyze the multi-byte string in order to determine its metrics: + * - number of visible columns it occupies + * - number of escape characters (== number of escape sequences) + * - the ASCII equivalent of the string + 
* - the number of invisible characters in the string + * + * @param the multi-byte string to analyze + * @param pointer to where the number of escape sequences should be stored + * @param pointer to where the ASCII equivalent of the string should be stored + * @returns the number of invisible characters in + */ +static size_t count_invisible_chars(const uint32_t *s, size_t *num_esc, char **ascii) +{ + size_t invis = 0; /* counts invisible column positions */ + int ansipos = 0; /* progression of ansi sequence */ + *num_esc = 0; /* counts the number of escape sequences found */ + + if (is_empty(s)) { + (*ascii) = (char *) strdup(""); + return 0; + } + size_t buflen = (size_t) u32_strwidth(s, encoding); + (*ascii) = (char *) calloc(buflen, sizeof(char)); /* maybe a little too much, but certainly enough */ + char *p = *ascii; + + ucs4_t c; + const uint32_t *rest = s; + while ((rest = u32_next(&c, rest))) { + if (ansipos == 0 && c == char_esc) { + /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */ + ansipos++; + invis++; + (*num_esc)++; + } else if (ansipos == 1 && c == '[') { + /* Found '[' char after ESC. A CSI sequence has started. */ + ansipos++; + invis++; + } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) { + /* Found a byte designating the end of a two-byte escape sequence */ + invis++; + ansipos = 0; + } else if (ansipos == 2) { + /* Inside CSI sequence - Keep counting bytes as invisible */ + invis++; + + /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */ + if (c >= 0x40 && c <= 0x7e) { + ansipos = 0; + } + } else if (is_ascii_printable(c)) { + *p = c & 0xff; + ++p; + } else { + int cols = uc_width(c, encoding); + if (cols > 0) { + memset(p, (int) 'x', cols); + p += cols; + } + } + } + *p = '\0'; + return invis; +} + + + static int apply_substitutions(const int mode) /* * Apply regular expression substitutions to input text. 
@@ -1236,8 +1303,6 @@ static int apply_substitutions(const int mode) size_t anz_rules; reprule_t *rules; size_t j, k; - char buf[LINE_MAX_BYTES * 2]; - size_t buf_len; /* length of string in buf */ if (opt.design == NULL) { return 1; @@ -1262,7 +1327,10 @@ static int apply_substitutions(const int mode) errno = 0; opt.design->current_rule = rules; for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) { - rules[j].prog = regcomp(rules[j].search); + rules[j].prog = compile_pattern(rules[j].search); + if (rules[j].prog == NULL) { + return 5; + } } opt.design->current_rule = NULL; if (errno) { @@ -1276,37 +1344,37 @@ static int apply_substitutions(const int mode) opt.design->current_rule = rules; for (j = 0; j < anz_rules; ++j, ++(opt.design->current_rule)) { #ifdef REGEXP_DEBUG - fprintf (stderr, "myregsub (0x%p, \"%s\", %d, \"%s\", buf, %d, \'%c\') == ", - rules[j].prog, input.lines[k].text, - input.lines[k].len, rules[j].repstr, LINE_MAX_BYTES*2, - rules[j].mode); + fprintf (stderr, "regex_replace(0x%p, \"%s\", \"%s\", %d, \'%c\') == ", + rules[j].prog, rules[j].repstr, u32_strconv_to_locale(input.lines[k].mbtext), + (int) input.lines[k].num_chars, rules[j].mode); #endif - errno = 0; - buf_len = myregsub(rules[j].prog, input.lines[k].text, - input.lines[k].len, rules[j].repstr, buf, LINE_MAX_BYTES * 2, - rules[j].mode); + uint32_t *newtext = regex_replace(rules[j].prog, rules[j].repstr, + input.lines[k].mbtext, input.lines[k].num_chars, rules[j].mode == 'g'); #ifdef REGEXP_DEBUG - fprintf (stderr, "%d\n", buf_len); + fprintf (stderr, "\"%s\"\n", newtext ? 
u32_strconv_to_locale(newtext) : "NULL"); #endif - if (errno) { + if (newtext == NULL) { return 1; } - BFREE (input.lines[k].text); - input.lines[k].text = (char *) strdup(buf); - if (input.lines[k].text == NULL) { - perror(PROJECT); - return 1; - } - - input.lines[k].len = buf_len; + BFREE(input.lines[k].mbtext_org); /* original address allocated for mbtext */ + input.lines[k].mbtext = newtext; + input.lines[k].mbtext_org = newtext; + size_t num_esc = 0; + char *ascii; // TODO HERE extract into function analyze/asciify(line_t) ? + size_t invis = count_invisible_chars(input.lines[k].mbtext, &num_esc, &ascii); + input.lines[k].len = u32_strwidth(input.lines[k].mbtext, encoding) - invis + num_esc; + input.lines[k].num_chars = u32_strlen(input.lines[k].mbtext); + BFREE(input.lines[k].text); + input.lines[k].text = ascii; if (input.lines[k].len > input.maxline) { input.maxline = input.lines[k].len; } #ifdef REGEXP_DEBUG - fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", k, input.lines[k].len, input.lines[k].text); + fprintf (stderr, "input.lines[%d] == {%d, \"%s\"}\n", (int) k, + (int) input.lines[k].num_chars, u32_strconv_to_locale(input.lines[k].mbtext)); #endif } opt.design->current_rule = NULL; @@ -1357,60 +1425,6 @@ static int has_linebreak(const uint32_t *s, const int len) -static size_t count_invisible_chars(const uint32_t *s, const size_t buflen, size_t *num_esc, char **ascii) -{ - size_t invis = 0; /* counts invisible column positions */ - int ansipos = 0; /* progression of ansi sequence */ - *num_esc = 0; /* counts the number of escape sequences found */ - - if (is_empty(s)) { - (*ascii) = (char *) strdup(""); - return 0; - } - (*ascii) = (char *) calloc(buflen, sizeof(char)); - char *p = *ascii; - - ucs4_t c; - const uint32_t *rest = s; - while ((rest = u32_next(&c, rest))) { - if (ansipos == 0 && c == char_esc) { - /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */ - ansipos++; - invis++; - (*num_esc)++; 
- } else if (ansipos == 1 && c == '[') { - /* Found '[' char after ESC. A CSI sequence has started. */ - ansipos++; - invis++; - } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) { - /* Found a byte designating the end of a two-byte escape sequence */ - invis++; - ansipos = 0; - } else if (ansipos == 2) { - /* Inside CSI sequence - Keep counting bytes as invisible */ - invis++; - - /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */ - if (c >= 0x40 && c <= 0x7e) { - ansipos = 0; - } - } else if (is_ascii_printable(c)) { - *p = c & 0xff; - ++p; - } else { - int cols = uc_width(c, encoding); - if (cols > 0) { - memset(p, (int) 'x', cols); - p += cols; - } - } - } - *p = '\0'; - return invis; -} - - - static int read_all_input(const int use_stdin) /* * Read entire input (possibly from stdin) and store it in 'input' array. @@ -1444,7 +1458,7 @@ static int read_all_input(const int use_stdin) * Start reading */ while (fgets(buf, LINE_MAX_BYTES + 1, opt.infile)) { - if (input_size % 100 == 0) { + if (input.anz_lines % 100 == 0) { input_size += 100; line_t *tmp = (line_t *) realloc(input.lines, input_size * sizeof(line_t)); if (tmp == NULL) { @@ -1483,18 +1497,20 @@ static int read_all_input(const int use_stdin) return 1; } input.lines[input.anz_lines].mbtext = temp; + BFREE(mbtemp); temp = NULL; } else { input.lines[input.anz_lines].mbtext = mbtemp; } + input.lines[input.anz_lines].mbtext_org = input.lines[input.anz_lines].mbtext; input.lines[input.anz_lines].num_chars = len_chars; /* * Find ANSI CSI/ESC sequences */ size_t num_esc = 0; - size_t invis = count_invisible_chars(input.lines[input.anz_lines].mbtext, strlen(buf), &num_esc, + size_t invis = count_invisible_chars(input.lines[input.anz_lines].mbtext, &num_esc, &(input.lines[input.anz_lines].text)); input.lines[input.anz_lines].invis = invis; /* u32_strwidth() does not count control characters, i.e. 
ESC characters, for which we must correct */ @@ -1526,8 +1542,8 @@ static int read_all_input(const int use_stdin) /* recalculate input statistics for redrawing the mended box */ for (i = 0; i < input.anz_lines; ++i) { size_t num_esc = 0; - char *dummy; - size_t invis = count_invisible_chars(input.lines[i].mbtext, strlen(input.lines[i].text), &num_esc, &dummy); + char *dummy; // TODO extract into function + size_t invis = count_invisible_chars(input.lines[i].mbtext, &num_esc, &dummy); BFREE(dummy); input.lines[i].len = u32_strwidth(input.lines[i].mbtext, encoding) - invis + num_esc; input.lines[i].num_chars = u32_strlen(input.lines[i].mbtext); @@ -1576,7 +1592,7 @@ static int read_all_input(const int use_stdin) * Apply regular expression substitutions */ if (opt.r == 0) { - if (apply_substitutions(0) != 0) { // TODO + if (apply_substitutions(0) != 0) { return 1; } } diff --git a/src/boxes.h.in b/src/boxes.h.in index 3156517..0ddc002 100644 --- a/src/boxes.h.in +++ b/src/boxes.h.in @@ -25,13 +25,14 @@ #ifndef BOXES_H #define BOXES_H -/* #define DEBUG */ -/* #define REGEXP_DEBUG */ -/* #define PARSER_DEBUG */ -/* #define LEXER_DEBUG */ +/* #define DEBUG 1 */ +#define REGEXP_DEBUG 1 +/* #define PARSER_DEBUG 1 */ +/* #define LEXER_DEBUG 1 */ #include -#include "regexp/regexp.h" +#include "regulex.h" +#include "shape.h" @@ -80,11 +81,11 @@ typedef struct { - char *search; - char *repstr; - regexp *prog; /* compiled search pattern */ - int line; /* line of definition in config file */ - char mode; /* 'g' or 'o' */ + char *search; + char *repstr; + pcre2_code *prog; /* compiled search pattern */ + int line; /* line of definition in config file */ + char mode; /* 'g' or 'o' */ } reprule_t; @@ -147,11 +148,12 @@ extern opt_t opt; typedef struct { size_t len; /* length of visible text in columns (visible character positions in a text terminal), which is the same as the length of the 'text' field */ - char *text; /* ASCII line content, tabs expanded, multi-byte chars replaced 
with one or more 'x' */ + char *text; /* ASCII line content, tabs expanded, ansi escapes removed, multi-byte chars replaced with one or more 'x' */ size_t invis; /* number of invisble columns/characters (part of an ansi sequence) */ uint32_t *mbtext; /* multi-byte (original) line content, tabs expanded. We use UTF-32 in order to enable pointer arithmetic. */ size_t num_chars; /* total number of characters in mbtext, visible + invisible */ + uint32_t *mbtext_org; /* mbtext as originally allocated, so that we can free it again */ size_t *tabpos; /* tab positions in expanded work strings, or NULL if not needed */ size_t tabpos_len; /* number of tabs in a line */ diff --git a/src/regexp/Makefile b/src/regexp/Makefile deleted file mode 100644 index 2c242cc..0000000 --- a/src/regexp/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -# -# boxes - Command line filter to draw/remove ASCII boxes around text -# Copyright (C) 1999 Thomas Jensen and the boxes contributors -# -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License, version 2, as published -# by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -# for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -#____________________________________________________________________________ -#============================================================================ - - -CFLAGS = -O -I. 
$(CFLAGS_ADDTL) - -ALL_CL = regexp/regexp.c regexp/regsub.c -C_SRC = $(notdir $(ALL_CL)) -ALLFILES = Makefile $(C_SRC) regexp.h regmagic.h -ALLOBJ = $(C_SRC:.c=.o) - - -.PHONY: clean build debug - - -build: libregexp.a -debug: libregexp.a - -libregexp.a: $(ALLOBJ) - ar cr libregexp.a $(ALLOBJ) - -regexp.o: regexp.c regmagic.h regexp.h ../config.h -regsub.o: regsub.c regmagic.h regexp.h ../config.h - -.c.o: - $(CC) $(CFLAGS) -c $< - - -clean: - rm -f $(ALLOBJ) libregexp.a core - - -#EOF diff --git a/src/regexp/regexp.c b/src/regexp/regexp.c deleted file mode 100644 index 4c18f91..0000000 --- a/src/regexp/regexp.c +++ /dev/null @@ -1,1212 +0,0 @@ -/* - * regcomp and regexec -- regsub and regerror are elsewhere - * - * Copyright (c) 1986 by University of Toronto. - * Written by Henry Spencer. Not derived from licensed software. - * - * Permission is granted to anyone to use this software for any - * purpose on any computer system, and to redistribute it freely, - * subject to the following restrictions: - * - * 1. The author is not responsible for the consequences of use of - * this software, no matter how awful, even if they arise - * from defects in it. - * - * 2. The origin of this software must not be misrepresented, either - * by explicit claim or by omission. - * - * 3. Altered versions must be plainly marked as such, and must not - * be misrepresented as being the original software. - * - * Beware that some of this code is subtly aware of the way operator - * precedence is structured in regular expressions. Serious changes in - * regular-expression syntax might require a total rethink. - */ -#include -#include -#include -#include "regexp.h" -#include "regmagic.h" - - -/* - * The "internal use only" fields in regexp.h are present to pass info from - * compile to execute that permits the execute phase to run lots faster on - * simple cases. 
They are: - * - * regstart char that must begin a match; '\0' if none obvious - * reganch is the match anchored (at beginning-of-line only)? - * regmust string (pointer into program) that match must include, or NULL - * regmlen length of regmust string - * - * Regstart and reganch permit very fast decisions on suitable starting points - * for a match, cutting down the work a lot. Regmust permits fast rejection - * of lines that cannot possibly match. The regmust tests are costly enough - * that regcomp() supplies a regmust only if the r.e. contains something - * potentially expensive (at present, the only such thing detected is * or + - * at the start of the r.e., which can involve a lot of backup). Regmlen is - * supplied because the test in regexec() needs it and regcomp() is computing - * it anyway. - */ - -/* - * Structure for regexp "program". This is essentially a linear encoding - * of a nondeterministic finite-state machine (aka syntax charts or - * "railroad normal form" in parsing technology). Each node is an opcode - * plus a "next" pointer, possibly plus an operand. "Next" pointers of - * all nodes except BRANCH implement concatenation; a "next" pointer with - * a BRANCH on both ends of it is connecting two alternatives. (Here we - * have one of the subtle syntax dependencies: an individual BRANCH (as - * opposed to a collection of them) is never concatenated with anything - * because of operator precedence.) The operand of some types of node is - * a literal string; for others, it is a node leading into a sub-FSM. In - * particular, the operand of a BRANCH node is the first node of the branch. - * (NB this is *not* a tree structure: the tail of the branch connects - * to the thing following the set of BRANCHes.) The opcodes are: - */ - -/* definition number opnd? meaning */ -#define END 0 /* no End of program. */ -#define BOL 1 /* no Match "" at beginning of line. */ -#define EOL 2 /* no Match "" at end of line. 
*/ -#define ANY 3 /* no Match any one character. */ -#define ANYOF 4 /* str Match any character in this string. */ -#define ANYBUT 5 /* str Match any character not in this string. */ -#define BRANCH 6 /* node Match this alternative, or the next... */ -#define BACK 7 /* no Match "", "next" ptr points backward. */ -#define EXACTLY 8 /* str Match this string. */ -#define NOTHING 9 /* no Match empty string. */ -#define STAR 10 /* node Match this (simple) thing 0 or more times. */ -#define PLUS 11 /* node Match this (simple) thing 1 or more times. */ -#define OPEN 20 /* no Mark this point in input as start of #n. */ - /* OPEN+1 is number 1, etc. */ -#define CLOSE 30 /* no Analogous to OPEN. */ - -/* - * Opcode notes: - * - * BRANCH The set of branches constituting a single choice are hooked - * together with their "next" pointers, since precedence prevents - * anything being concatenated to any individual branch. The - * "next" pointer of the last BRANCH in a choice points to the - * thing following the whole choice. This is also where the - * final "next" pointer of each individual branch points; each - * branch starts with the operand node of a BRANCH node. - * - * BACK Normal "next" pointers all implicitly point forward; BACK - * exists to make loop structures possible. - * - * STAR,PLUS '?', and complex '*' and '+', are implemented as circular - * BRANCH structures using BACK. Simple cases (one character - * per match) are implemented with STAR and PLUS for speed - * and to minimize recursive plunges. - * - * OPEN,CLOSE ...are numbered at compile time. - */ - -/* - * A node is one char of opcode followed by two chars of "next" pointer. - * "Next" pointers are stored as two 8-bit pieces, high order first. The - * value is a positive offset from the opcode of the node containing it. - * An operand, if any, simply follows the node. (Note that much of the - * code generation knows about this implicit relationship.) 
- * - * Using two bytes for the "next" pointer is vast overkill for most things, - * but allows patterns to get big without disasters. - */ -#define OP(p) (*(p)) -#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) -#define OPERAND(p) ((p) + 3) - -/* - * See regmagic.h for one further detail of program structure. - */ - - -/* - * Utility definitions. - */ -#ifndef CHARBITS -#define UCHARAT(p) ((int)*(unsigned char *)(p)) -#else -#define UCHARAT(p) ((int)*(p)&CHARBITS) -#endif - -#define FAIL(m) { regerror(m); return(NULL); } -#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') -#define META "^$.[()|?+*\\" - -/* - * Flags to be passed up and down. - */ -#define HASWIDTH 01 /* Known never to match null string. */ -#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 04 /* Starts with * or +. */ -#define WORST 0 /* Worst case. */ - -/* - * Global work variables for regcomp(). - */ -static char *regparse; /* Input-scan pointer. */ -static int regnpar; /* () count. */ -static char regdummy; -static char *regcode; /* Code-emit pointer; ®dummy = don't. */ -static long regsize; /* Code size. */ - -/* - * Forward declarations for regcomp()'s friends. - */ -#ifndef STATIC -#define STATIC static -#endif -STATIC char *reg(); -STATIC char *regbranch(); -STATIC char *regpiece(); -STATIC char *regatom(); -STATIC char *regnode(); -STATIC char *regnext(); -STATIC void regc(); -STATIC void reginsert(); -STATIC void regtail(); -STATIC void regoptail(); -#ifdef STRCSPN -STATIC int strcspn(); -#endif - -/* - - regcomp - compile a regular expression into internal code - * - * We can't allocate space until we know how big the compiled form will be, - * but we can't compile it (and thus know how big it is) until we've got a - * place to put the code. So we cheat: we compile it twice, once with code - * generation turned off and size counting turned on, and once "for real". 
- * This also means that we don't allocate space until we are sure that the - * thing really will compile successfully, and we never have to move the - * code and thus invalidate pointers into it. (Note that it has to be in - * one piece because free() must be able to free it all.) - * - * Beware that the optimization-preparation code in here knows about some - * of the structure of the compiled regexp. - */ -regexp * -regcomp(exp) -char *exp; -{ - register regexp *r; - register char *scan; - register char *longest; - register int len; - int flags; - - if (exp == NULL) - FAIL("NULL argument"); - - /* First pass: determine size, legality. */ - regparse = exp; - regnpar = 1; - regsize = 0L; - regcode = ®dummy; - regc(MAGIC); - if (reg(0, &flags) == NULL) - return(NULL); - - /* Small enough for pointer-storage convention? */ - if (regsize >= 32767L) /* Probably could be 65535L. */ - FAIL("regexp too big"); - - /* Allocate space. */ - r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); - if (r == NULL) - FAIL("out of space"); - - /* Second pass: emit code. */ - regparse = exp; - regnpar = 1; - regcode = r->program; - regc(MAGIC); - if (reg(0, &flags) == NULL) - return(NULL); - - /* Dig out information for optimizations. */ - r->regstart = '\0'; /* Worst-case defaults. */ - r->reganch = 0; - r->regmust = NULL; - r->regmlen = 0; - scan = r->program+1; /* First BRANCH. */ - if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ - scan = OPERAND(scan); - - /* Starting-point info. */ - if (OP(scan) == EXACTLY) - r->regstart = *OPERAND(scan); - else if (OP(scan) == BOL) - r->reganch++; - - /* - * If there's something expensive in the r.e., find the - * longest literal string that must appear and make it the - * regmust. Resolve ties in favor of later strings, since - * the regstart check works with the beginning of the r.e. - * and avoiding duplication strengthens checking. Not a - * strong reason, but sufficient in the absence of others. 
- */ - if (flags&SPSTART) { - longest = NULL; - len = 0; - for (; scan != NULL; scan = regnext(scan)) - if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { - longest = OPERAND(scan); - len = strlen(OPERAND(scan)); - } - r->regmust = longest; - r->regmlen = len; - } - } - - return(r); -} - -/* - - reg - regular expression, i.e. main body or parenthesized thing - * - * Caller must absorb opening parenthesis. - * - * Combining parenthesis handling with the base level of regular expression - * is a trifle forced, but the need to tie the tails of the branches to what - * follows makes it hard to avoid. - */ -static char * -reg(paren, flagp) -int paren; /* Parenthesized? */ -int *flagp; -{ - register char *ret; - register char *br; - register char *ender; - register int parno; - int flags; - - *flagp = HASWIDTH; /* Tentatively. */ - - /* Make an OPEN node, if parenthesized. */ - if (paren) { - if (regnpar >= NSUBEXP) - FAIL("too many ()"); - parno = regnpar; - regnpar++; - ret = regnode(OPEN+parno); - } else - ret = NULL; - - /* Pick up the branches, linking them together. */ - br = regbranch(&flags); - if (br == NULL) - return(NULL); - if (ret != NULL) - regtail(ret, br); /* OPEN -> first. */ - else - ret = br; - if (!(flags&HASWIDTH)) - *flagp &= ~HASWIDTH; - *flagp |= flags&SPSTART; - while (*regparse == '|') { - regparse++; - br = regbranch(&flags); - if (br == NULL) - return(NULL); - regtail(ret, br); /* BRANCH -> BRANCH. */ - if (!(flags&HASWIDTH)) - *flagp &= ~HASWIDTH; - *flagp |= flags&SPSTART; - } - - /* Make a closing node, and hook it on the end. */ - ender = regnode((paren) ? CLOSE+parno : END); - regtail(ret, ender); - - /* Hook the tails of the branches to the closing node. */ - for (br = ret; br != NULL; br = regnext(br)) - regoptail(br, ender); - - /* Check for proper termination. 
*/ - if (paren && *regparse++ != ')') { - FAIL("unmatched ()"); - } else if (!paren && *regparse != '\0') { - if (*regparse == ')') { - FAIL("unmatched ()"); - } else - FAIL("junk on end"); /* "Can't happen". */ - /* NOTREACHED */ - } - - return(ret); -} - -/* - - regbranch - one alternative of an | operator - * - * Implements the concatenation operator. - */ -static char * -regbranch(flagp) -int *flagp; -{ - register char *ret; - register char *chain; - register char *latest; - int flags; - - *flagp = WORST; /* Tentatively. */ - - ret = regnode(BRANCH); - chain = NULL; - while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { - latest = regpiece(&flags); - if (latest == NULL) - return(NULL); - *flagp |= flags&HASWIDTH; - if (chain == NULL) /* First piece. */ - *flagp |= flags&SPSTART; - else - regtail(chain, latest); - chain = latest; - } - if (chain == NULL) /* Loop ran zero times. */ - (void) regnode(NOTHING); - - return(ret); -} - -/* - - regpiece - something followed by possible [*+?] - * - * Note that the branching code sequences used for ? and the general cases - * of * and + are somewhat optimized: they use the same NOTHING node as - * both the endmarker for their branch list and the body of the last branch. - * It might seem that this node could be dispensed with entirely, but the - * endmarker role is not redundant. - */ -static char * -regpiece(flagp) -int *flagp; -{ - register char *ret; - register char op; - register char *next; - int flags; - - ret = regatom(&flags); - if (ret == NULL) - return(NULL); - - op = *regparse; - if (!ISMULT(op)) { - *flagp = flags; - return(ret); - } - - if (!(flags&HASWIDTH) && op != '?') - FAIL("*+ operand could be empty"); - *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); - - if (op == '*' && (flags&SIMPLE)) - reginsert(STAR, ret); - else if (op == '*') { - /* Emit x* as (x&|), where & means "self". 
*/ - reginsert(BRANCH, ret); /* Either x */ - regoptail(ret, regnode(BACK)); /* and loop */ - regoptail(ret, ret); /* back */ - regtail(ret, regnode(BRANCH)); /* or */ - regtail(ret, regnode(NOTHING)); /* null. */ - } else if (op == '+' && (flags&SIMPLE)) - reginsert(PLUS, ret); - else if (op == '+') { - /* Emit x+ as x(&|), where & means "self". */ - next = regnode(BRANCH); /* Either */ - regtail(ret, next); - regtail(regnode(BACK), ret); /* loop back */ - regtail(next, regnode(BRANCH)); /* or */ - regtail(ret, regnode(NOTHING)); /* null. */ - } else if (op == '?') { - /* Emit x? as (x|) */ - reginsert(BRANCH, ret); /* Either x */ - regtail(ret, regnode(BRANCH)); /* or */ - next = regnode(NOTHING); /* null. */ - regtail(ret, next); - regoptail(ret, next); - } - regparse++; - if (ISMULT(*regparse)) - FAIL("nested *?+"); - - return(ret); -} - -/* - - regatom - the lowest level - * - * Optimization: gobbles an entire sequence of ordinary characters so that - * it can turn them into a single node, which is smaller to store and - * faster to run. Backslashed characters are exceptions, each becoming a - * separate node; the code is simpler that way and it's not worth fixing. - */ -static char * -regatom(flagp) -int *flagp; -{ - register char *ret; - int flags; - - *flagp = WORST; /* Tentatively. */ - - switch (*regparse++) { - case '^': - ret = regnode(BOL); - break; - case '$': - ret = regnode(EOL); - break; - case '.': - ret = regnode(ANY); - *flagp |= HASWIDTH|SIMPLE; - break; - case '[': { - register int class; - register int classend; - - if (*regparse == '^') { /* Complement of range. 
*/ - ret = regnode(ANYBUT); - regparse++; - } else - ret = regnode(ANYOF); - if (*regparse == ']' || *regparse == '-') - regc(*regparse++); - while (*regparse != '\0' && *regparse != ']') { - if (*regparse == '-') { - regparse++; - if (*regparse == ']' || *regparse == '\0') - regc('-'); - else { - class = UCHARAT(regparse-2)+1; - classend = UCHARAT(regparse); - if (class > classend+1) - FAIL("invalid [] range"); - for (; class <= classend; class++) - regc(class); - regparse++; - } - } else - regc(*regparse++); - } - regc('\0'); - if (*regparse != ']') - FAIL("unmatched []"); - regparse++; - *flagp |= HASWIDTH|SIMPLE; - } - break; - case '(': - ret = reg(1, &flags); - if (ret == NULL) - return(NULL); - *flagp |= flags&(HASWIDTH|SPSTART); - break; - case '\0': - case '|': - case ')': - FAIL("internal urp"); /* Supposed to be caught earlier. */ - break; - case '?': - case '+': - case '*': - FAIL("?+* follows nothing"); - break; - case '\\': - if (*regparse == '\0') - FAIL("trailing \\"); - ret = regnode(EXACTLY); - regc(*regparse++); - regc('\0'); - *flagp |= HASWIDTH|SIMPLE; - break; - default: { - register int len; - register char ender; - - regparse--; - len = strcspn(regparse, META); - if (len <= 0) - FAIL("internal disaster"); - ender = *(regparse+len); - if (len > 1 && ISMULT(ender)) - len--; /* Back off clear of ?+* operand. */ - *flagp |= HASWIDTH; - if (len == 1) - *flagp |= SIMPLE; - ret = regnode(EXACTLY); - while (len > 0) { - regc(*regparse++); - len--; - } - regc('\0'); - } - break; - } - - return(ret); -} - -/* - - regnode - emit a node - */ -static char * /* Location. */ -regnode(op) -char op; -{ - register char *ret; - register char *ptr; - - ret = regcode; - if (ret == ®dummy) { - regsize += 3; - return(ret); - } - - ptr = ret; - *ptr++ = op; - *ptr++ = '\0'; /* Null "next" pointer. 
*/ - *ptr++ = '\0'; - regcode = ptr; - - return(ret); -} - -/* - - regc - emit (if appropriate) a byte of code - */ -static void -regc(b) -char b; -{ - if (regcode != &regdummy) - *regcode++ = b; - else - regsize++; -} - -/* - - reginsert - insert an operator in front of already-emitted operand - * - * Means relocating the operand. - */ -static void -reginsert(op, opnd) -char op; -char *opnd; -{ - register char *src; - register char *dst; - register char *place; - - if (regcode == &regdummy) { - regsize += 3; - return; - } - - src = regcode; - regcode += 3; - dst = regcode; - while (src > opnd) - *--dst = *--src; - - place = opnd; /* Op node, where operand used to be. */ - *place++ = op; - *place++ = '\0'; - *place++ = '\0'; -} - -/* - - regtail - set the next-pointer at the end of a node chain - */ -static void -regtail(p, val) -char *p; -char *val; -{ - register char *scan; - register char *temp; - register int offset; - - if (p == &regdummy) - return; - - /* Find last node. */ - scan = p; - for (;;) { - temp = regnext(scan); - if (temp == NULL) - break; - scan = temp; - } - - if (OP(scan) == BACK) - offset = scan - val; - else - offset = val - scan; - *(scan+1) = (offset>>8)&0377; - *(scan+2) = offset&0377; -} - -/* - - regoptail - regtail on operand of first argument; nop if operandless - */ -static void -regoptail(p, val) -char *p; -char *val; -{ - /* "Operandless" and "op != BRANCH" are synonymous in practice. */ - if (p == NULL || p == &regdummy || OP(p) != BRANCH) - return; - regtail(OPERAND(p), val); -} - -/* - * regexec and friends - */ - -/* - * Global work variables for regexec(). - */ -static char *reginput; /* String-input pointer. */ -static char *regbol; /* Beginning of input, for ^ check. */ -static char **regstartp; /* Pointer to startp array. */ -static char **regendp; /* Ditto for endp. */ - -/* - * Forwards. 
- */ -STATIC int regtry(); -STATIC int regmatch(); -STATIC int regrepeat(); - -#ifdef DEBUG -int regnarrate = 0; -void regdump(); -STATIC char *regprop(); -#endif - -/* - - regexec - match a regexp against a string - */ -int -regexec(prog, string) -register regexp *prog; -register char *string; -{ - register char *s; - - /* Be paranoid... */ - if (prog == NULL || string == NULL) { - regerror("NULL parameter"); - return(0); - } - - /* Check validity of program. */ - if (UCHARAT(prog->program) != MAGIC) { - regerror("corrupted program"); - return(0); - } - - /* If there is a "must appear" string, look for it. */ - if (prog->regmust != NULL) { - s = string; - while ((s = strchr(s, prog->regmust[0])) != NULL) { - if (strncmp(s, prog->regmust, prog->regmlen) == 0) - break; /* Found it. */ - s++; - } - if (s == NULL) /* Not present. */ - return(0); - } - - /* Mark beginning of line for ^ . */ - regbol = string; - - /* Simplest case: anchored match need be tried only once. */ - if (prog->reganch) - return(regtry(prog, string)); - - /* Messy cases: unanchored match. */ - s = string; - if (prog->regstart != '\0') - /* We know what char it must start with. */ - while ((s = strchr(s, prog->regstart)) != NULL) { - if (regtry(prog, s)) - return(1); - s++; - } - else - /* We don't -- general case. */ - do { - if (regtry(prog, s)) - return(1); - } while (*s++ != '\0'); - - /* Failure. 
*/ - return(0); -} - -/* - - regtry - try match at specific point - */ -static int /* 0 failure, 1 success */ -regtry(prog, string) -regexp *prog; -char *string; -{ - register int i; - register char **sp; - register char **ep; - - reginput = string; - regstartp = prog->startp; - regendp = prog->endp; - - sp = prog->startp; - ep = prog->endp; - for (i = NSUBEXP; i > 0; i--) { - *sp++ = NULL; - *ep++ = NULL; - } - if (regmatch(prog->program + 1)) { - prog->startp[0] = string; - prog->endp[0] = reginput; - return(1); - } else - return(0); -} - -/* - - regmatch - main matching routine - * - * Conceptually the strategy is simple: check to see whether the current - * node matches, call self recursively to see whether the rest matches, - * and then act accordingly. In practice we make some effort to avoid - * recursion, in particular by going through "ordinary" nodes (that don't - * need to know whether the rest of the match failed) by a loop instead of - * by recursion. - */ -static int /* 0 failure, 1 success */ -regmatch(prog) -char *prog; -{ - register char *scan; /* Current node. */ - char *next; /* Next node. */ - - scan = prog; -#ifdef DEBUG - if (scan != NULL && regnarrate) - fprintf(stderr, "%s(\n", regprop(scan)); -#endif - while (scan != NULL) { -#ifdef DEBUG - if (regnarrate) - fprintf(stderr, "%s...\n", regprop(scan)); -#endif - next = regnext(scan); - - switch (OP(scan)) { - case BOL: - if (reginput != regbol) - return(0); - break; - case EOL: - if (*reginput != '\0') - return(0); - break; - case ANY: - if (*reginput == '\0') - return(0); - reginput++; - break; - case EXACTLY: { - register int len; - register char *opnd; - - opnd = OPERAND(scan); - /* Inline the first character, for speed. 
*/ - if (*opnd != *reginput) - return(0); - len = strlen(opnd); - if (len > 1 && strncmp(opnd, reginput, len) != 0) - return(0); - reginput += len; - } - break; - case ANYOF: - if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) - return(0); - reginput++; - break; - case ANYBUT: - if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) - return(0); - reginput++; - break; - case NOTHING: - break; - case BACK: - break; - case OPEN+1: - case OPEN+2: - case OPEN+3: - case OPEN+4: - case OPEN+5: - case OPEN+6: - case OPEN+7: - case OPEN+8: - case OPEN+9: { - register int no; - register char *save; - - no = OP(scan) - OPEN; - save = reginput; - - if (regmatch(next)) { - /* - * Don't set startp if some later - * invocation of the same parentheses - * already has. - */ - if (regstartp[no] == NULL) - regstartp[no] = save; - return(1); - } else - return(0); - } - break; - case CLOSE+1: - case CLOSE+2: - case CLOSE+3: - case CLOSE+4: - case CLOSE+5: - case CLOSE+6: - case CLOSE+7: - case CLOSE+8: - case CLOSE+9: { - register int no; - register char *save; - - no = OP(scan) - CLOSE; - save = reginput; - - if (regmatch(next)) { - /* - * Don't set endp if some later - * invocation of the same parentheses - * already has. - */ - if (regendp[no] == NULL) - regendp[no] = save; - return(1); - } else - return(0); - } - break; - case BRANCH: { - register char *save; - - if (OP(next) != BRANCH) /* No choice. */ - next = OPERAND(scan); /* Avoid recursion. */ - else { - do { - save = reginput; - if (regmatch(OPERAND(scan))) - return(1); - reginput = save; - scan = regnext(scan); - } while (scan != NULL && OP(scan) == BRANCH); - return(0); - /* NOTREACHED */ - } - } - break; - case STAR: - case PLUS: { - register char nextch; - register int no; - register char *save; - register int min; - - /* - * Lookahead to avoid useless match attempts - * when we know what character comes next. 
- */ - nextch = '\0'; - if (OP(next) == EXACTLY) - nextch = *OPERAND(next); - min = (OP(scan) == STAR) ? 0 : 1; - save = reginput; - no = regrepeat(OPERAND(scan)); - while (no >= min) { - /* If it could work, try it. */ - if (nextch == '\0' || *reginput == nextch) - if (regmatch(next)) - return(1); - /* Couldn't or didn't -- back up. */ - no--; - reginput = save + no; - } - return(0); - } - break; - case END: - return(1); /* Success! */ - break; - default: - regerror("memory corruption"); - return(0); - break; - } - - scan = next; - } - - /* - * We get here only if there's trouble -- normally "case END" is - * the terminating point. - */ - regerror("corrupted pointers"); - return(0); -} - -/* - - regrepeat - repeatedly match something simple, report how many - */ -static int -regrepeat(p) -char *p; -{ - register int count = 0; - register char *scan; - register char *opnd; - - scan = reginput; - opnd = OPERAND(p); - switch (OP(p)) { - case ANY: - count = strlen(scan); - scan += count; - break; - case EXACTLY: - while (*opnd == *scan) { - count++; - scan++; - } - break; - case ANYOF: - while (*scan != '\0' && strchr(opnd, *scan) != NULL) { - count++; - scan++; - } - break; - case ANYBUT: - while (*scan != '\0' && strchr(opnd, *scan) == NULL) { - count++; - scan++; - } - break; - default: /* Oh dear. Called inappropriately. */ - regerror("internal foulup"); - count = 0; /* Best compromise. 
*/ - break; - } - reginput = scan; - - return(count); -} - -/* - - regnext - dig the "next" pointer out of a node - */ -static char * -regnext(p) -register char *p; -{ - register int offset; - - if (p == &regdummy) - return(NULL); - - offset = NEXT(p); - if (offset == 0) - return(NULL); - - if (OP(p) == BACK) - return(p-offset); - else - return(p+offset); -} - -#ifdef DEBUG - -STATIC char *regprop(); - -/* - - regdump - dump a regexp onto stdout in vaguely comprehensible form - */ -void -regdump(r) -regexp *r; -{ - register char *s; - register char op = EXACTLY; /* Arbitrary non-END op. */ - register char *next; - - - s = r->program + 1; - while (op != END) { /* While that wasn't END last time... */ - op = OP(s); - printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ - next = regnext(s); - if (next == NULL) /* Next ptr. */ - printf("(0)"); - else - printf("(%d)", (s-r->program)+(next-s)); - s += 3; - if (op == ANYOF || op == ANYBUT || op == EXACTLY) { - /* Literal string, where present. */ - while (*s != '\0') { - putchar(*s); - s++; - } - s++; - } - putchar('\n'); - } - - /* Header fields of interest. 
*/ - if (r->regstart != '\0') - printf("start `%c' ", r->regstart); - if (r->reganch) - printf("anchored "); - if (r->regmust != NULL) - printf("must have \"%s\"", r->regmust); - printf("\n"); -} - -/* - - regprop - printable representation of opcode - */ -static char * -regprop(op) -char *op; -{ - register char *p; - static char buf[50]; - - (void) strcpy(buf, ":"); - - switch (OP(op)) { - case BOL: - p = "BOL"; - break; - case EOL: - p = "EOL"; - break; - case ANY: - p = "ANY"; - break; - case ANYOF: - p = "ANYOF"; - break; - case ANYBUT: - p = "ANYBUT"; - break; - case BRANCH: - p = "BRANCH"; - break; - case EXACTLY: - p = "EXACTLY"; - break; - case NOTHING: - p = "NOTHING"; - break; - case BACK: - p = "BACK"; - break; - case END: - p = "END"; - break; - case OPEN+1: - case OPEN+2: - case OPEN+3: - case OPEN+4: - case OPEN+5: - case OPEN+6: - case OPEN+7: - case OPEN+8: - case OPEN+9: - sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); - p = NULL; - break; - case CLOSE+1: - case CLOSE+2: - case CLOSE+3: - case CLOSE+4: - case CLOSE+5: - case CLOSE+6: - case CLOSE+7: - case CLOSE+8: - case CLOSE+9: - sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); - p = NULL; - break; - case STAR: - p = "STAR"; - break; - case PLUS: - p = "PLUS"; - break; - default: - regerror("corrupted opcode"); - break; - } - if (p != NULL) - (void) strcat(buf, p); - return(buf); -} -#endif - -/* - * The following is provided for those people who do not have strcspn() in - * their C libraries. They should get off their butts and do something - * about it; at least one public-domain implementation of those (highly - * useful) string routines has been published on Usenet. 
- */ -#ifdef STRCSPN -/* - * strcspn - find length of initial segment of s1 consisting entirely - * of characters not from s2 - */ - -static int -strcspn(s1, s2) -char *s1; -char *s2; -{ - register char *scan1; - register char *scan2; - register int count; - - count = 0; - for (scan1 = s1; *scan1 != '\0'; scan1++) { - for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ - if (*scan1 == *scan2++) - return(count); - count++; - } - return(count); -} -#endif diff --git a/src/regexp/regexp.h b/src/regexp/regexp.h deleted file mode 100644 index 7efb7c4..0000000 --- a/src/regexp/regexp.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Definitions etc. for regexp(3) routines. - * - * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], - * not the System V one. - */ - -#ifndef REGEXP_H -#define REGEXP_H - - -#define NSUBEXP 10 -typedef struct regexp { - char *startp[NSUBEXP]; - char *endp[NSUBEXP]; - char regstart; /* Internal use only. */ - char reganch; /* Internal use only. */ - char *regmust; /* Internal use only. */ - int regmlen; /* Internal use only. */ - char program[1]; /* Unwarranted chumminess with compiler. */ -} regexp; - -extern regexp *regcomp(); -extern int regexec(); -/* extern size_t regsub(); */ -extern size_t myregsub(); -/* extern void regerror(); */ - - -#endif /* REGEXP_H */ diff --git a/src/regexp/regmagic.h b/src/regexp/regmagic.h deleted file mode 100644 index 5acf447..0000000 --- a/src/regexp/regmagic.h +++ /dev/null @@ -1,5 +0,0 @@ -/* - * The first byte of the regexp internal "program" is actually this magic - * number; the start node begins in the second byte. - */ -#define MAGIC 0234 diff --git a/src/regexp/regsub.c b/src/regexp/regsub.c deleted file mode 100644 index 7c2463a..0000000 --- a/src/regexp/regsub.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * File: regsub.c - * Date created: Copyright (c) 1986 by University of Toronto. - * Author: Henry Spencer. 
- * Extensions and modifications by Thomas Jensen - * Language: K&R C (traditional) - * Purpose: Perform substitutions after a regexp match - * License: - Not derived from licensed software. - * - Permission is granted to anyone to use this - * software for any purpose on any computer system, - * and to redistribute it freely, subject to the - * following restrictions: - * 1. The author is not responsible for the - * consequences of use of this software, no matter - * how awful, even if they arise from defects in it. - * 2. The origin of this software must not be - * misrepresented, either by explicit claim or by - * omission. - * 3. Altered versions must be plainly marked as such, - * and must not be misrepresented as being the - * original software. - * -* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - */ - -#include -#include -#include "regexp.h" -#include "regmagic.h" - - -#ifndef CHARBITS -#define UCHARAT(p) ((int)*(unsigned char *)(p)) -#else -#define UCHARAT(p) ((int)*(p)&CHARBITS) -#endif - - - -/* - - regsub - perform substitutions after a regexp match - */ -size_t /* RETURNS length of dest str */ -regsub (prog, source, dest, dest_size) - regexp *prog; - char *source; - char *dest; - size_t dest_size; /* size of destination buffer */ -{ - register char *src; - register char *dst; - register char c; - register int no; - register int len; - size_t fill; /* current number of chars in dest */ - - if (prog == NULL || source == NULL || dest == NULL) { - regerror("NULL parm to regsub"); - return 0; - } - if (UCHARAT(prog->program) != MAGIC) { - regerror("damaged regexp fed to regsub"); - return 0; - } - - src = source; - dst = dest; - fill = 0; - - while ((c = *src++) != '\0') { - if (c == '&') - no = 0; - else if (c == '\\' && '0' <= *src && *src <= '9') - no = *src++ - '0'; - else - no = -1; - - if (no < 0) { /* Ordinary character. 
*/ - if (c == '\\' && (*src == '\\' || *src == '&')) - c = *src++; - *dst++ = c; - ++fill; - } else if (prog->startp[no] != NULL && prog->endp[no] != NULL) { - len = prog->endp[no] - prog->startp[no]; - if (len < dest_size-fill) { - (void) strncpy(dst, prog->startp[no], len); - dst += len; - fill += len; - if (len != 0 && *(dst-1) == '\0') { /* strncpy hit NUL. */ - regerror("damaged match string"); - return fill; - } - } - else { - (void) strncpy (dst, prog->startp[no], dest_size-fill); - dest[dest_size-1] = '\0'; - return dest_size-1; - } - } - if (fill >= dest_size) { - dest[dest_size-1] = '\0'; - return dest_size-1; - } - } - *dst++ = '\0'; - - return fill; -} - - - -size_t /* RETURNS length of str in destination buffer */ -myregsub (prog, orig, orig_len, repstr, dest, dest_size, mode) - regexp *prog; /* pointers for matched regexp to original text */ - char *orig; /* original input line */ - size_t orig_len; /* length of original input line */ - char *repstr; /* source buffer for replaced parts */ - char *dest; /* destination buffer */ - size_t dest_size; /* size of destination buffer */ - char mode; /* 'g' or 'o' */ -{ - size_t fill; /* current number of chars in dest */ - char *sp, *dp; /* source rover, destination rover */ - int rc; /* received return codes */ - size_t rest_size; /* remaining space in dest */ - size_t partlen; /* temp length of a piece handled */ - - fill = 0; - sp = orig; - dp = dest; - rest_size = dest_size; - - do { - rc = regexec (prog, sp); - if (!rc) break; - - partlen = prog->startp[0] - sp; - if (partlen < rest_size) { - strncpy (dp, sp, partlen); - fill += partlen; - sp = prog->startp[0]; - dp += partlen; - rest_size -= partlen; - } - else { - strncpy (dp, sp, rest_size); - dest[dest_size-1] = '\0'; - return dest_size - 1; - } - - /* fprintf (stderr, "regsub (%p, \"%s\", \"%s\", %d);\n", */ - /* prog, repstr, dp, rest_size); */ - fill += regsub (prog, repstr, dp, rest_size); - dp = dest + fill; - sp = prog->endp[0]; - rest_size = 
dest_size - fill; - - if (fill >= dest_size) { - dest[dest_size-1] = '\0'; - return dest_size - 1; - } - - /* fprintf (stderr, "dest = \"%s\";\n", dest); */ - if (prog->startp[0] == prog->endp[0]) - break; /* match "^" or "$" only once */ - - } while (mode == 'g'); - - partlen = orig + orig_len - sp; - if (partlen < rest_size) { - strncpy (dp, sp, partlen); - fill += partlen; - dp[partlen] = '\0'; - } - else { - strncpy (dp, sp, rest_size); - dest[dest_size-1] = '\0'; - fill = dest_size - 1; - } - - return fill; -} - - - - -/*EOF*/ /* vim: set sw=4: */ diff --git a/src/regulex.c b/src/regulex.c new file mode 100644 index 0000000..c2ebb16 --- /dev/null +++ b/src/regulex.c @@ -0,0 +1,114 @@ +/* + * boxes - Command line filter to draw/remove ASCII boxes around text + * Copyright (C) 1999 Thomas Jensen and the boxes contributors + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License, version 2, as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + */ + +/* + * Convenience functions for PCRE2 regular expression processing + */ + +#include "config.h" +#include +#include +#include +#include + +#include "tools.h" +#include "regulex.h" + + + +pcre2_code *compile_pattern(char *pattern) +{ + int errornumber; + PCRE2_SIZE erroroffset; + PCRE2_SPTR pattern32 = u32_strconv_from_locale(pattern); + + pcre2_code *re = pcre2_compile( + pattern32, /* the pattern */ + PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */ + 0, /* default options */ + &errornumber, + &erroroffset, + NULL); /* use default compile context */ + + if (re == NULL) { + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); + fprintf(stderr, "Regular expression pattern \"%s\" failed to compile at offset %d: %s\n", + pattern, (int) erroroffset, u32_strconv_to_locale(buffer)); + } + return re; +} + + + +uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global) +{ + PCRE2_SPTR replacement = u32_strconv_from_locale(replace); + uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED + | (global ? PCRE2_SUBSTITUTE_GLOBAL : 0); + PCRE2_SIZE outlen = input_len * 2; /* estimated length of output buffer in characters, fine if too small */ + + PCRE2_SIZE bufsize = (input_len == 0) ? 
16 : outlen; + uint32_t *output = (uint32_t *) malloc(sizeof(uint32_t) * bufsize); /* output buffer */ + int pcre2_rc; + + int done = 0; + while (!done) { + if (output == NULL) { + fprintf(stderr, "out of memory"); + return NULL; + } + PCRE2_SIZE outlen = bufsize; + + pcre2_rc = pcre2_substitute(search, (PCRE2_SPTR) input, input_len, + 0, /* start offset */ + options, + NULL, /* ptr to a match data block */ + NULL, /* match context */ + replacement, PCRE2_ZERO_TERMINATED, + output, &outlen); + + if (pcre2_rc != PCRE2_ERROR_NOMEMORY || bufsize >= outlen) { + done = 1; + } else { + #ifdef REGEXP_DEBUG + fprintf(stderr, "Reallocating output buffer from %ld to %ld UTF-32 chars\n", bufsize, outlen); + #endif + bufsize = outlen; + output = (uint32_t *) realloc(output, sizeof(uint32_t) * bufsize); + } + } + BFREE(replacement); + + if (pcre2_rc < 0) { + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(pcre2_rc, buffer, sizeof(buffer)); + /* buffer will normally contain "invalid replacement string" */ + fprintf(stderr, "Error substituting \"%s\": %s\n", replace, u32_strconv_to_locale(buffer)); + BFREE(output); + return NULL; + } + + return output; +} + + + +/*EOF*/ /* vim: set sw=4: */ diff --git a/src/regulex.h b/src/regulex.h new file mode 100644 index 0000000..a4b2e42 --- /dev/null +++ b/src/regulex.h @@ -0,0 +1,67 @@ +/* + * boxes - Command line filter to draw/remove ASCII boxes around text + * Copyright (C) 1999 Thomas Jensen and the boxes contributors + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License, version 2, as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * +* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + */ + +/* + * Convenience functions for PCRE2 regular expression processing + */ + +#ifndef REGULEX_H +#define REGULEX_H + +/* Building under Windows: If you want to statically link this program against a non-dll .a file, you must define + * PCRE2_STATIC before including pcre2.h. */ +#ifdef __MINGW32__ + #define PCRE2_STATIC +#endif + +/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h. For a program that uses only one code unit + * width, setting it to 8, 16, or 32 makes it possible to use generic function names such as pcre2_compile(). Note that + * just changing 8 to 16 (for example) is not sufficient to convert this program to process 16-bit characters. Even in + * a fully 16-bit environment, where string-handling functions such as strcmp() and printf() work with 16-bit + * characters, the code for handling the table of named substrings will still need to be modified. */ +#define PCRE2_CODE_UNIT_WIDTH 32 + + +#include +#include + + + +/* + * Compile the given pattern into a PCRE2 regular expression. + */ +pcre2_code *compile_pattern(char *pattern); + +/* + * Perform a regex replacement on the given string. 
+ * + * @param search the compiled pattern to search for + * @param replace the replacement string + * @param input the string to which the replacements shall be applied + * @param input_len the length of input in characters, not bytes + * @param global flag indicating whether all occurrences shall be replaced (true) or just the first (false) + * @return a new string which is a copy of input with the replacements applied, or NULL on error + */ +uint32_t *regex_replace(pcre2_code *search, char *replace, uint32_t *input, const size_t input_len, const int global); + + +#endif + +/*EOF*/ /* vim: set cindent sw=4: */ diff --git a/src/shape.h b/src/shape.h index 9ca9167..25d482b 100644 --- a/src/shape.h +++ b/src/shape.h @@ -5,12 +5,12 @@ * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License, version 2, as published * by the Free Software Foundation. - * + * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. - * + * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -50,7 +50,7 @@ typedef struct { char **chars; size_t height; size_t width; - int elastic; /* elastic is used only in orginial definition */ + int elastic; /* elastic is used only in original definition */ } sentry_t; #define SENTRY_INITIALIZER (sentry_t) {NULL, 0, 0, 0} diff --git a/src/tools.h b/src/tools.h index 6b8bfef..8b2cd6f 100644 --- a/src/tools.h +++ b/src/tools.h @@ -27,14 +27,16 @@ #include +#include "boxes.h" + #define BMAX(a, b) ((a)>(b)? 
(a):(b)) /* return the larger value */ #define BFREE(p) { /* free memory and clear pointer */ \ - if (p) { \ - free (p); \ - (p) = NULL; \ - } \ + if (p) { \ + free((void *) p); \ + (p) = NULL; \ + } \ } diff --git a/src/unicode.c b/src/unicode.c index bed9f58..dafdfbd 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -28,7 +28,6 @@ #include #include -#include #include "unicode.h" diff --git a/src/unicode.h b/src/unicode.h index 55fb8c8..02d090a 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -25,6 +25,9 @@ #ifndef UNICODE_H #define UNICODE_H +#include + + extern const char *encoding; /* the character encoding that we use */ extern const ucs4_t char_tab; /* ucs4_t character '\t' (tab) */