boxes/src/lexer.l

530 lines
14 KiB
Plaintext

%top{
/*
* boxes - Command line filter to draw/remove ASCII boxes around text
* Copyright (c) 1999-2023 Thomas Jensen and the boxes contributors
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
* License, version 3, as published by the Free Software Foundation.
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
* You should have received a copy of the GNU General Public License along with this program.
* If not, see <https://www.gnu.org/licenses/>.
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
/*
* flex lexical analyzer for boxes configuration files
*/
#include "config.h"
#include "bxstring.h"
/**
* Scanner state that the lexer rules reach through flex's "extra" data (see `%option extra-type`).
*/
typedef struct {
/**
* Number of lexer errors counted so far. It is reset to 0 when a `Box` keyword is scanned, and the rules stop
* printing error messages once it has reached 5.
*/
int yyerrcnt;
/** the currently active string delimiter character */
char sdel;
/** the currently active string escape character */
char sesc;
} pass_to_flex;
/*
* Valid characters to be used as string delimiters.
* The following list must correspond to the SDELIM definition below.
* change_string_delimiters() rejects any requested delimiter that is not contained in this list.
*/
#define LEX_SDELIM "\"~'`!@%&*=:;<>?/|.\\"
/*
* Subset of LEX_SDELIM that is suggested to the user in the error message which is printed when an invalid string
* delimiter was requested.
*/
#define LEX_SDELIM_RECOMMENDED "\"~'!|"
/**
* User-defined initializations for the lexer.
*
* Since this scanner must use REJECT in order to be able to process the string delimiter commands, it cannot
* dynamically enlarge its input buffer to accommodate larger tokens. Thus, we simply set the buffer size to the
* input file size plus 10 bytes margin-of-error.
*
* The scanner's current buffer is replaced by the newly sized one, and the start condition is reset to INITIAL.
* If the config file cannot be examined with stat(), an error is printed and the program exits.
*
* @param yyscanner pointer to the scanner data block
* @param configfile the path to the config file we are reading
*/
void inflate_inbuf(void *yyscanner, const bxstr_t *configfile);
}
%{
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <unitypes.h>
#include "boxes.h"
#include "shape.h"
#include "tools.h"
#include "parsing.h"
#include "parser.h"
#include "unicode.h"
/*
* NOTE(review): LEX_MAX_WARN is not referenced by any rule or function visible in this file. The rules which limit
* the number of error messages compare yyerrcnt against a literal 5 instead -- confirm which value is intended.
*/
#define LEX_MAX_WARN 3 /* number of lex errors per design */
/* Debug helper: renders a single character as quoted text and hands it to report_state(). */
static void report_state_char(char *symbol, char c, char *expected_state_str);
/* Debug helper: prints a token name, its text and a state name to stderr when compiled with LEXER_DEBUG. */
static void report_state(char *symbol, char *text, char *expected_state_str);
/*
* Validates a two-character specification (escape character followed by delimiter character) and stores both
* characters in *extra. Returns 0 on success and 1 if the specification was rejected.
*/
static int change_string_delimiters(pass_to_flex *extra, char *delim_expr);
%}
%option 8bit
%option bison-bridge
%option case-insensitive
%option ecs
%option extra-type="pass_to_flex *"
%option never-interactive
%option nodefault
%option noinput
%option nounput
%option noyywrap
%option reentrant
%option warn
%option yylineno
/*
* Start conditions used by the rules below (in addition to the implicit INITIAL):
* BOX       - entered by the `Box` keyword, left by `End`; the other conditions below (except PARENT) return here
* SAMPLE    - entered by `Sample`, left when the closing `ends` has been scanned
* SHAPES    - entered by `Shapes`, left by a closing brace
* ELASTIC   - entered by `Elastic`, left by a closing parenthesis
* DELIMSPEC - entered by `Delimiter` or `Delim`, left after the next non-whitespace token
* PARENT    - entered by `parent` in INITIAL, left (back to INITIAL) after the file name or at the end of the line
*/
%x BOX
%x SAMPLE
%x SHAPES
%x ELASTIC
%x DELIMSPEC
%x PARENT
/*
* The following paragraph contains patterns to recognize UTF-8 characters from a byte stream, based on
* - https://stackoverflow.com/a/10253320/1005481 by Zack Weinberg (under CC-BY-SA 3.0 license)
* - https://www.w3.org/2005/03/23-lex-U by Eric Prud'hommeaux, W3C (under the W3C Document License)
*/
/* the byte order mark in its UTF-8 encoded form */
PBOM \xEF\xBB\xBF
/* two-byte sequences */
U2A [\xC2-\xDF][\x80-\xBF]
/* three-byte sequences (U2B, U3A, U3B), split by lead byte because E0 and ED narrow the range of the second byte */
U2B \xE0[\xA0-\xBF][\x80-\xBF]
U3A [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
U3B \xED[\x80-\x9F][\x80-\xBF]
/* four-byte sequences, split by lead byte because F0 and F4 narrow the range of the second byte */
U4A \xF0[\x90-\xBF][\x80-\xBF]{2}
U4B [\xF1-\xF3][\x80-\xBF]{3}
U4C \xF4[\x80-\x8F][\x80-\xBF]{2}
/* any single multi-byte (i.e. non-ASCII) UTF-8 character */
UTF_8 {U2A}|{U2B}|{U3A}|{U3B}|{U4A}|{U4B}|{U4C}
/* a word: starts with a letter or a multi-byte character, continues with letters, digits, '_', '-' or multi-byte characters */
PWORD (?:[a-zA-Z]|{UTF_8})(?:[a-zA-Z0-9_-]|{UTF_8})*
/* like PWORD, but restricted to ASCII characters */
PASCII_ID [a-zA-Z][a-zA-Z0-9_-]*
/* a single whitespace character */
PWHITE [ \t\r\n]
/* every character that may serve as a string delimiter; must correspond to the LEX_SDELIM macro */
SDELIM [\"~\'`!@\%\&\*=:;<>\?/|\.\\]
/* the keyword which introduces the name of a parent config file */
PPARENT parent
/* everything up to (but not including) the end of the current line */
PFILENAME [^\r\n]+
%%
/*
* Precedence of rules:
* - The rule that matches the most text wins.
* - If two rules match the same amount of text, the one defined first (further up) wins.
*/
<INITIAL,BOX,DELIMSPEC,ELASTIC,SHAPES>{PWHITE}|{PBOM} /* ignore whitespace and a byte order mark (not active in SAMPLE and PARENT, whose rules consume whitespace as part of the token text) */
<DELIMSPEC>[^ \t\r\n]+ {
    /*
     * String delimiter spec - like WORD, but allow any character.
     *
     * The matched text is expected to consist of exactly two characters: the escape character followed by the
     * delimiter character. It is validated and applied by change_string_delimiters(). The semantic value handed
     * to the parser is only a placeholder.
     *
     * yytext is passed on directly. It remains valid for the duration of this action and is only read by the
     * callees, so no heap copy is made and nothing needs to be freed here.
     */
    yylval->s = bxs_from_ascii("IGNORED");
    BEGIN(BOX);
    /* report the state we actually switched to */
    report_state("YDELIMS", yytext, "BOX");
    if (change_string_delimiters(yyextra, yytext) != 0) {
        return YUNREC;
    }
    return YDELIMSPEC;
}
<BOX,SHAPES>{SDELIM}.*$ {
/*
* Strings -- first match everything starting from a potential string delimiter until the end of the line. We
* will give back what we don't need and also detect unterminated strings. Strings always end on the same line.
*/
int rest_len = yyleng - 1; /* length of string pointed to by p */
int qcnt = 0; /* esc char count in current string */
/* SDELIM matches every possible delimiter character; only the currently active one starts a string */
if (yytext[0] != yyextra->sdel) {
REJECT; /* that was not our delimiter */
}
/* work on a private copy of the text which follows the opening delimiter */
char *str = (char *) strdup(yytext + 1);
if (str == NULL) {
perror (PROJECT);
exit (EXIT_FAILURE);
}
char *p = str;
while (*p) {
if (*p == yyextra->sesc) {
/* remove the escape character by shifting the remainder one byte to the left */
memmove (p, p+1, rest_len); /* incl. '\0' */
++qcnt;
--rest_len;
if (*p == '\0') {
/* the escape character was the last character on the line */
break;
}
/* the escaped character is now at *p; it is taken literally because the ++p below steps over it */
}
else if (*p == yyextra->sdel) {
/* unescaped delimiter: the string ends here */
*p = '\0';
/* keep content, both delimiters and the removed escape characters; push the rest of the line back */
yyless ((p - str) + 2 + qcnt); /* string plus quotes */
#ifdef LEXER_DEBUG
fprintf (stderr, " STRING: \"%s\"\n", str);
#endif
uint32_t *utf8 = u32_strconv_from_arg(str, CONFIG_FILE_ENCODING);
yylval->s = bxs_from_unicode(utf8);
BFREE(utf8);
BFREE(str);
return STRING;
}
--rest_len;
++p;
}
/* the end of the line was reached without a closing delimiter; the message is suppressed once yyerrcnt has reached 5 */
if ((yyextra->yyerrcnt)++ < 5) {
yyerror(NULL, "Unterminated String -- %s", yytext);
}
BFREE(str);
return YUNREC;
}
<INITIAL>{PPARENT} {
/* `parent` keyword outside of a box definition: the remainder of the line is scanned in the PARENT state */
BEGIN(PARENT);
report_state("YPARENT", yytext, "PARENT");
return YPARENT;
}
<PARENT>{PFILENAME} {
/*
* Everything up to the end of the line is taken as the parent file name. Whitespace is not skipped in the
* PARENT state, so the text is passed through bxs_trim() (presumably strips surrounding whitespace -- see
* bxstring.h) before it is handed to the parser.
*/
uint32_t *utf8 = u32_strconv_from_arg(yytext, CONFIG_FILE_ENCODING);
bxstr_t *bxstr = bxs_from_unicode(utf8);
yylval->s = bxs_trim(bxstr);
BFREE(utf8);
bxs_free(bxstr);
BEGIN(INITIAL);
/*
* NOTE(review): bxs_to_output() is evaluated even when LEXER_DEBUG is off, and its result is not released
* here. If it returns newly allocated memory, this leaks once per parent line -- confirm against bxstring.h.
*/
report_state("FILENAM", bxs_to_output(yylval->s), "INITIAL");
return FILENAME;
}
<PARENT>\r?\n {
/* This is triggered only when no parent filename was specified. */
/* No token is returned; the scanner simply goes back to INITIAL. */
BEGIN(INITIAL);
report_state(" NL", "", "INITIAL");
}
<BOX>Sample {
/* start of a sample block; the following text is collected by the SAMPLE state rules */
BEGIN(SAMPLE);
report_state("YSAMPLE", yytext, "SAMPLE");
return YSAMPLE;
}
<SAMPLE>\n {
/*
* A newline is added to the text collected so far by means of yymore(). When yyleng is 1, the newline is the
* only text present (nothing has been collected yet), and it is dropped instead.
*/
if (yyleng > 1)
yymore();
}
<SAMPLE>^[ \t]*ends[ \t\r]*$ {
/*
* A line which consists only of "ends" terminates the sample. Because the other SAMPLE rules accumulate their
* matches with yymore(), yytext holds the collected sample text followed by this "ends" line. The sample text is
* returned as a STRING token, and "ends" is pushed back so that a separate rule can return YENDSAMPLE for it.
*/
char *p = yytext + yyleng -1;
size_t len; /* length of sample */
while (*p == ' ' || *p == '\t' || *p == '\r')
--p; /* skip trailing whitespace */
p -= 2; /* almost skip "ends" statement */
*p = '\0'; /* p now points to 'n' */
/* the copy contains the sample text plus the 'e' of "ends", which is turned into a newline further down */
char *sample = (char *) strdup(yytext);
if (sample == NULL) {
perror (PROJECT);
exit (EXIT_FAILURE);
}
/* restore yytext; afterwards p points to the 'e' of "ends" */
*p-- = 'n';
len = p - yytext; /* yyless(n): push back all but the first n */
yyless (len); /* allow the lexer to return YENDSAMPLE */
sample[len] = '\n'; /* replace 'e' with newline */
/* presumably trims the sample and updates len accordingly -- btrim() is declared outside of this file */
btrim(sample, &len);
if (len > 0) {
/* convert, right-trim, and terminate the sample with exactly one newline */
uint32_t *utf8 = u32_strconv_from_arg(sample, CONFIG_FILE_ENCODING);
uint32_t *nl = u32_strconv_from_arg("\n", CONFIG_FILE_ENCODING);
bxstr_t *bxstr = bxs_from_unicode(utf8);
bxstr_t *bxstr2 = bxs_rtrim(bxstr);
bxs_free(bxstr);
bxstr = bxs_strcat(bxstr2, nl);
BFREE(nl);
BFREE(utf8);
BFREE(sample);
bxs_free(bxstr2);
yylval->s = bxstr;
return STRING;
}
else {
/* nothing is left after trimming; the message is suppressed once yyerrcnt has reached 5 */
if ((yyextra->yyerrcnt)++ < 5) {
yyerror(NULL, "SAMPLE block must not be empty");
}
BFREE(sample);
return YUNREC;
}
}
<SAMPLE>. yymore(); /* any other character belongs to the sample text: keep accumulating it in yytext */
<SAMPLE>ends[ \t\r]*$ {
/* reached because the other rule pushes it back so a proper end token can be returned */
/*
* NOTE(review): this pattern is not anchored with ^, so it also appears to match when a line of sample text
* merely ends with the letters "ends" -- confirm whether such sample lines are handled as intended.
*/
BEGIN(BOX);
report_state("YENDSAM", yytext, "BOX");
return YENDSAMPLE;
}
<BOX>Tags {
/* plain keyword: the start condition stays BOX */
#ifdef LEXER_DEBUG
fprintf (stderr, " YTAGS: %s\n", yytext);
#endif
return YTAGS;
}
<BOX>Elastic {
/* the following tokens are scanned in the ELASTIC state until a closing parenthesis is found */
BEGIN(ELASTIC);
report_state("YELASTC", yytext, "ELASTIC");
return YELASTIC;
}
<BOX>Shapes {
/* the following tokens are scanned in the SHAPES state until a closing brace is found */
BEGIN(SHAPES);
report_state("YSHAPES", yytext, "SHAPES");
return YSHAPES;
}
<INITIAL>Box {
/* start of a box definition */
BEGIN(BOX);
report_state(" YBOX", yytext, "BOX");
/* every box definition starts with a fresh error count ... */
yyextra->yyerrcnt = 0;
/* ... and with the default string syntax: backslash as escape character, double quote as delimiter */
change_string_delimiters(yyextra, "\\\"");
return YBOX;
}
<BOX>Replace { return YREPLACE; /* this and the next four rules: plain BOX keywords, each with a token of its own */ }
<BOX>Reverse { return YREVERSE; }
<BOX>Padding { return YPADDING; }
<BOX>To { return YTO; }
<BOX>With { return YWITH; }
<BOX>Global { yylval->c = 'g'; return YRXPFLAG; /* both flag keywords share the token YRXPFLAG; yylval->c tells them apart */ }
<BOX>Once { yylval->c = 'o'; return YRXPFLAG; }
<BOX>End {
/* end of a box definition: back to the top level */
BEGIN(INITIAL);
report_state(" YEND", yytext, "INITIAL");
/* restore the default string syntax: backslash as escape character, double quote as delimiter */
change_string_delimiters(yyextra, "\\\"");
return YEND;
}
<SHAPES,ELASTIC>nw { yylval->shape = NW; return SHAPE; /* the 16 shape names all yield the token SHAPE; yylval->shape identifies the shape. The longest match wins, so e.g. "nnw" is not scanned as "n". */ }
<SHAPES,ELASTIC>nnw { yylval->shape = NNW; return SHAPE; }
<SHAPES,ELASTIC>n { yylval->shape = N; return SHAPE; }
<SHAPES,ELASTIC>nne { yylval->shape = NNE; return SHAPE; }
<SHAPES,ELASTIC>ne { yylval->shape = NE; return SHAPE; }
<SHAPES,ELASTIC>ene { yylval->shape = ENE; return SHAPE; }
<SHAPES,ELASTIC>e { yylval->shape = E; return SHAPE; }
<SHAPES,ELASTIC>ese { yylval->shape = ESE; return SHAPE; }
<SHAPES,ELASTIC>se { yylval->shape = SE; return SHAPE; }
<SHAPES,ELASTIC>sse { yylval->shape = SSE; return SHAPE; }
<SHAPES,ELASTIC>s { yylval->shape = S; return SHAPE; }
<SHAPES,ELASTIC>ssw { yylval->shape = SSW; return SHAPE; }
<SHAPES,ELASTIC>sw { yylval->shape = SW; return SHAPE; }
<SHAPES,ELASTIC>wsw { yylval->shape = WSW; return SHAPE; }
<SHAPES,ELASTIC>w { yylval->shape = W; return SHAPE; }
<SHAPES,ELASTIC>wnw { yylval->shape = WNW; return SHAPE; }
<ELASTIC>\) {
/* the closing parenthesis ends the elastic list: return to BOX and pass the character itself as the token */
BEGIN(BOX);
report_state_char("SYMBOL", yytext[0], "BOX");
return yytext[0];
}
<SHAPES>\} {
/* the closing brace ends the shapes block: return to BOX and pass the character itself as the token */
BEGIN(BOX);
report_state_char("SYMBOL", yytext[0], "BOX");
return yytext[0];
}
<BOX>author|designer|created|revision|revdate|indent {
    /*
     * General key words. The parser receives a heap-allocated copy of the matched text in yylval->ascii.
     * Running out of memory terminates the program.
     */
    #ifdef LEXER_DEBUG
        fprintf (stderr, "KEYWORD: %s\n", yytext);
    #endif
    char *keyword_copy = strdup(yytext);
    if (!keyword_copy) {
        perror (PROJECT);
        exit (EXIT_FAILURE);
    }
    yylval->ascii = keyword_copy;
    return KEYWORD;
}
<BOX>Delimiter|Delim {
/*
* Change string delimiting characters
*/
/* the next non-whitespace token is scanned in the DELIMSPEC state, where any character is allowed */
BEGIN(DELIMSPEC);
report_state("YCHGDEL", yytext, "DELIMSPEC");
return YCHGDEL;
}
<INITIAL,BOX>{PASCII_ID} {
    /*
     * A free-floating word which is not a string, i.e. it does not have delimiting characters (ASCII version).
     * The parser receives a heap-allocated copy of the matched text in yylval->ascii. Running out of memory
     * terminates the program.
     */
    char *id_copy = strdup(yytext);
    if (!id_copy) {
        perror (PROJECT);
        exit (EXIT_FAILURE);
    }
    yylval->ascii = id_copy;
    #ifdef LEXER_DEBUG
        fprintf (stderr, "ASCIIID: %s\n", yylval->ascii);
    #endif
    return ASCII_ID;
}
<INITIAL,BOX>{PWORD} {
/*
* a free-floating word which is not a string, i.e. it does not have delimiting characters
*/
/*
* A purely ASCII word is matched by the PASCII_ID rule, which is defined further up and therefore wins when both
* rules match the same text. This rule takes over when the word is longer because of multi-byte characters.
*/
uint32_t *utf8 = u32_strconv_from_arg(yytext, CONFIG_FILE_ENCODING);
yylval->s = bxs_from_unicode(utf8);
if (yylval->s == NULL) {
perror (PROJECT);
exit (EXIT_FAILURE);
}
#ifdef LEXER_DEBUG
/* NOTE(review): the result of u32_strconv_to_output() is not released here -- possibly a leak in debug builds; confirm */
fprintf (stderr, " WORD: %s\n", u32_strconv_to_output(utf8));
#endif
BFREE(utf8);
return WORD;
}
<BOX>[\+-]?[0-9]+ {
    /*
     * An optionally signed decimal integer. Its value is handed to the parser in yylval->num.
     *
     * The text is converted with strtol() and clamped to the range of int. A literal with too many digits thus
     * yields INT_MAX or INT_MIN, whereas atoi() has undefined behavior for values it cannot represent.
     */
    #ifdef LEXER_DEBUG
        fprintf (stderr, "YNUMBER: %s\n", yytext);
    #endif
    long parsed = strtol(yytext, NULL, 10);   /* saturates at LONG_MIN / LONG_MAX on overflow */
    if (parsed > INT_MAX) {
        parsed = INT_MAX;
    }
    else if (parsed < INT_MIN) {
        parsed = INT_MIN;
    }
    yylval->num = (int) parsed;
    return YNUMBER;
}
<BOX,SHAPES,ELASTIC>[,(){}] {
/*
* Punctuation is passed to the parser as the character itself. The dedicated rules for ')' in ELASTIC and
* '}' in SHAPES are defined further up and therefore take precedence over this rule in those two cases.
*/
#ifdef LEXER_DEBUG
fprintf (stderr, " SYMBOL: \'%c\'\n", yytext[0]);
#endif
return yytext[0];
}
<INITIAL,BOX,SHAPES,ELASTIC>#.*$ {
/* ignore comments */
/* a comment extends from '#' to the end of the line; no token is returned for it */
#ifdef LEXER_DEBUG
fprintf (stderr, "COMMENT: %s\n", yytext+1);
#endif
}
<INITIAL,BOX,SHAPES,ELASTIC>. {
/* a character that made no sense where it was encountered. Let the parser handle it. */
/* being the last rule and matching only one character, this applies only when no other rule matches */
#ifdef LEXER_DEBUG
fprintf (stderr, " YUNREC: \'%c\'\n", yytext[0]);
#endif
return YUNREC;
}
%%
/*
 * Replaces the scanner's input buffer with one that is large enough for the whole config file (file size plus
 * 10 bytes), and resets the start condition to INITIAL. Terminates the program if stat() fails on the file.
 */
void inflate_inbuf(void *yyscanner, const bxstr_t *configfile)
{
    struct stat file_info;
    char *path = u32_strconv_to_arg(configfile->memory, "UTF-8");
    int stat_rc = stat(path, &file_info);
    if (stat_rc != 0) {
        /* report first, so that freeing the path cannot disturb errno */
        perror (PROJECT);
        BFREE(path);
        exit (EXIT_FAILURE);
    }
    BFREE(path);

    /* not referenced by name below; presumably required by the flex macros used here (YY_CURRENT_BUFFER, yyin, BEGIN) */
    struct yyguts_t *yyg = (struct yyguts_t *) yyscanner;

    /* discard the current buffer, then install a new one which is sized after the file */
    yy_delete_buffer(YY_CURRENT_BUFFER, yyscanner);
    yy_switch_to_buffer(
        yy_create_buffer(yyin, file_info.st_size+10, yyscanner),
        yyscanner);
    BEGIN(INITIAL);
}
/*
 * Debug helper: reports a single-character token.
 *
 * The character is rendered in single quotes and handed to report_state(). Characters outside of the printable
 * ASCII range (' ' to 126) are shown as '?'.
 *
 * @param symbol the name of the token being reported
 * @param c the character that was scanned
 * @param expected_state_str the name of the start condition which the scanner is expected to be in
 */
static void report_state_char(char *symbol, char c, char *expected_state_str)
{
    /* opening quote + character + closing quote + '\0'; a stack buffer cannot fail to allocate */
    char quoted[4];
    snprintf(quoted, sizeof quoted, "'%c'", (c >= ' ' && c <= 126) ? c : '?');
    report_state(symbol, quoted, expected_state_str);
}
/*
 * Debug helper: prints a token name, the token text, and the name of the expected start condition to stderr.
 * Nothing is printed unless the scanner was compiled with LEXER_DEBUG defined.
 *
 * @param symbol the name of the token being reported (right-aligned to 7 columns in the output)
 * @param text the text of the token
 * @param expected_state_str the name of the start condition which the scanner is expected to be in
 */
static void report_state(char *symbol, char *text, char *expected_state_str)
{
#ifdef LEXER_DEBUG
    const int debug_enabled = 1;
#else
    const int debug_enabled = 0;
#endif

    /* a runtime check (instead of compiling the call out) keeps all parameters in use in both build variants */
    if (!debug_enabled) {
        return;
    }
    fprintf(stderr, "%7s: %s -- STATE %s\n", symbol, text, expected_state_str);
}
/*
 * Sets the string escape character and the string delimiter character of the scanner.
 *
 * The specification must consist of exactly two different characters: first the escape character, then the
 * delimiter character. The delimiter character must be one of those listed in LEX_SDELIM. If the specification is
 * rejected, an error is reported via yyerror() and *extra is left untouched.
 *
 * @param extra the scanner state to update
 * @param delim_expr the two-character specification
 * @return 0 if the new characters were stored, 1 if the specification was invalid
 */
static int change_string_delimiters(pass_to_flex *extra, char *delim_expr)
{
    if (strlen(delim_expr) != 2) {
        yyerror(NULL, "invalid string delimiter specification -- %s", delim_expr);
        return 1;
    }

    const char esc_char = delim_expr[0];
    const char delim_char = delim_expr[1];

    if (esc_char == delim_char) {
        yyerror(NULL, "string delimiter and escape char may not be the same");
        return 1;
    }
    if (!strchr(LEX_SDELIM, delim_char)) {
        yyerror(NULL, "invalid string delimiter -- %c (try one of %s)", delim_char, LEX_SDELIM_RECOMMENDED);
        return 1;
    }

#ifdef LEXER_DEBUG
    fprintf(stderr, "YDELIMS: change_string_delimiters('%c', '%c')\n", esc_char, delim_char);
#endif

    extra->sesc = esc_char;
    extra->sdel = delim_char;
    return 0;
}
/*EOF*/ /* vim: set cindent sw=4: */