ctpv/src/lexer.c
2023-03-11 00:13:55 +01:00

458 lines
9.0 KiB
C

#include <ctype.h>
#include <assert.h>
#include "error.h"
#include "lexer.h"
#include "ulist.h"
/*
 * Try to read the literal `s` from lexer `c` as a token of type `t`.
 * The length handed to read_punct() is LEN(s) - 1. LEN is defined elsewhere;
 * presumably it yields the element count of an array, which would make
 * LEN(s) - 1 the string length without the terminating NUL and would require
 * `s` to be an array rather than a pointer -- confirm against its definition.
 */
#define READ_PUNCT(c, t, s) read_punct((c), (t), (s), LEN(s) - 1)
/* Value peekn_char() returns when there is no more input to look at. */
#define EOF_CHAR (-1)
/*
 * Character test with the same shape as the <ctype.h> functions: takes a
 * character code and returns non-zero when the character matches.
 * read_while() keeps reading for as long as the predicate returns non-zero.
 */
typedef int (*Predicate)(int);
/* Read buffer that sits between the lexer and a stdio stream. */
typedef struct {
    unsigned int pos;  /* index in buf of the next unread character */
    unsigned int len;  /* number of valid characters currently in buf */
    unsigned int eof;  /* set to 1 once a short read hit end of file */
    FILE *f;           /* stream that refills are read from */
    char buf[1024];    /* buffered input */
} InputBuffer;
/*
 * Fixed-size ring buffer of tokens that were produced ahead of time.
 * The queue is empty when `front` equals `back`.
 */
typedef struct {
    unsigned int back;   /* slot the next added token is stored in */
    unsigned int front;  /* slot the next removed token is taken from */
    Token toks[16];      /* ring storage */
} TokenQueue;
/* Complete lexer state. */
struct Lexer {
    /* Option flags, replaced as a whole by lexer_set_opts(). */
    enum LexerOpts opts;

    /* Position of the next character to be consumed; both start at 1. */
    unsigned int line;
    unsigned int col;

    /* Position recorded when the current token started. */
    struct {
        unsigned int line;
        unsigned int col;
    } tok_pos;

    InputBuffer input_buf;  /* buffered input stream */
    TokenQueue tok_queue;   /* tokens produced ahead of time */
    UList *text_buf;        /* storage for the text of string tokens */
};
/*
 * Spellings of the punctuation tokens. They are used both for matching
 * (block_open, block_close) and as the printable names returned by
 * lexer_token_type_str().
 */
static char block_open[] = "{{";
static char block_close[] = "}}";
static char slash[] = "/";
static char star[] = "*";
static char dot[] = ".";
/*
 * Append `tok` to the lexer's token queue.
 * There is no fullness check: the write index simply wraps around.
 */
static void add_token_queue(Lexer *ctx, Token tok)
{
    TokenQueue *q = &ctx->tok_queue;

    q->toks[q->back] = tok;
    q->back = (q->back + 1) % LEN(q->toks);
}
/*
 * Take the oldest token out of the lexer's token queue and return it.
 * There is no emptiness check; callers test is_empty_token_queue() first.
 */
static Token remove_token_queue(Lexer *ctx)
{
    TokenQueue *q = &ctx->tok_queue;
    Token oldest = q->toks[q->front];

    q->front = (q->front + 1) % LEN(q->toks);
    return oldest;
}
/* Return non-zero when the token queue holds no tokens. */
static inline int is_empty_token_queue(Lexer *ctx)
{
    const TokenQueue *q = &ctx->tok_queue;

    return q->front == q->back;
}
/*
 * Reset `b` to an empty buffer that reads from stream `f`.
 * The contents of b->buf are left untouched; with len == 0 none of it is
 * considered valid.
 */
static void init_input_buf(InputBuffer *b, FILE *f)
{
    b->f = f;
    b->eof = 0;
    b->pos = b->len = 0;
}
/*
 * Return the character `i` positions ahead of the read position without
 * consuming it, or EOF_CHAR when the input ends before that position.
 *
 * Bytes are returned as unsigned char values (0..255), so a 0xFF byte in the
 * input is not confused with EOF_CHAR by callers that keep the int result.
 *
 * When the requested position lies past the buffered data, the buffer is
 * refilled: the last `i` bytes are moved to the front and the remainder is
 * read from the stream. That is only correct when exactly `i` bytes are still
 * unread, which holds when offsets are probed in increasing order starting at
 * 0 (as cmp_nextn() does).
 */
static int peekn_char(Lexer *ctx, unsigned int i)
{
    InputBuffer *b = &ctx->input_buf;

    if (b->pos + i < b->len)
        goto exit;

    if (b->eof || (i > 0 && i >= b->len))
        return EOF_CHAR;

    if (i > 0) {
        assert(i < LEN(b->buf));
        /* Keep the unread tail so that it stays reachable after the refill */
        memmove(b->buf, b->buf + (b->len - i), i * sizeof(*b->buf));
    }

    b->len = i + fread(b->buf + i, sizeof(*b->buf), LEN(b->buf) - i, b->f);
    b->pos = 0;

    if (b->len != LEN(b->buf)) {
        if (feof(b->f))
            b->eof = 1;
        else if (ferror(b->f))
            PRINTINTERR("fread() failed");

        /*
         * The refill may have delivered fewer bytes than needed to reach
         * offset `i` (e.g. the input ended exactly at a buffer boundary).
         * Reading buf[pos + i] would then return stale data.
         */
        if (b->pos + i >= b->len)
            return EOF_CHAR;
    }

exit:
    return (unsigned char)b->buf[b->pos + i];
}
/*
 * Return the next unread character without consuming it.
 * The int result of peekn_char() is narrowed to char, so EOF_CHAR comes back
 * as (char)EOF_CHAR.
 */
static inline char peek_char(Lexer *ctx)
{
    int upcoming = peekn_char(ctx, 0);

    return upcoming;
}
/*
 * Consume one character and return the character found at offset `i`.
 *
 * The line/column counters are updated from the returned character: a newline
 * starts a new line at column 1, anything else moves one column to the right.
 * The read position always advances by exactly one, whatever `i` is; the only
 * caller visible in this file passes 0.
 */
static char nextn_char(Lexer *ctx, unsigned int i)
{
    char ch = peekn_char(ctx, i);

    if (ch == '\n') {
        ctx->line++;
        ctx->col = 1;
    } else {
        ctx->col++;
    }

    ctx->input_buf.pos++;

    return ch;
}
/* Consume the next unread character and return it. */
static inline char next_char(Lexer *ctx)
{
    char consumed = nextn_char(ctx, 0);

    return consumed;
}
/* Consume `n` characters and discard them; does nothing when n <= 0. */
static void skipn_char(Lexer *ctx, int n)
{
    while (n-- > 0)
        next_char(ctx);
}
/* Append the single character `c` to the lexer's text buffer. */
static inline void add_text_buf(Lexer *ctx, char c)
{
    UList *text = ctx->text_buf;

    ulist_append(text, &c);
}
/*
 * Start collecting the text of a token: calls ulist_lock() on the text
 * buffer. ulist_lock() is defined elsewhere; presumably it marks where the
 * text about to be appended begins -- confirm against ulist.h.
 */
static inline void record_text(Lexer *ctx)
{
    UList *text = ctx->text_buf;

    ulist_lock(text);
}
/*
 * Finish collecting the text of a token and return it: the result of
 * ulist_unlock() on the text buffer. ulist_unlock() is defined elsewhere;
 * presumably it returns a pointer to what was appended since the matching
 * record_text() -- confirm against ulist.h.
 */
static inline char *get_text(Lexer *ctx)
{
    char *text = ulist_unlock(ctx->text_buf);

    return text;
}
/*
 * Allocate a lexer that reads from stream `f`.
 *
 * The lexer starts at line 1, column 1, with no options set and an empty
 * token queue. If the allocation fails the failure is reported through
 * FUNCFAILED and the process is aborted, so the function does not return
 * NULL on that path. Release the result with lexer_free().
 */
Lexer *lexer_init(FILE *f)
{
    Lexer *ctx = malloc(sizeof(*ctx));

    if (ctx == NULL) {
        FUNCFAILED("malloc", strerror(errno));
        abort();
    }

    init_input_buf(&ctx->input_buf, f);
    lexer_set_opts(ctx, LEX_OPT_NONE);
    ctx->text_buf = ulist_new(sizeof(char), 1024);

    ctx->line = 1;
    ctx->col = 1;

    ctx->tok_queue.front = 0;
    ctx->tok_queue.back = 0;

    return ctx;
}
/*
 * Replace the lexer's option flags with `flags`.
 * This is a plain overwrite: options that were set before and are not part
 * of `flags` are cleared.
 */
void lexer_set_opts(Lexer *ctx, enum LexerOpts flags)
{
    enum LexerOpts *current = &ctx->opts;

    *current = flags;
}
/*
 * Release a lexer created by lexer_init(): frees its text buffer with
 * ulist_free() and then the lexer itself. The input stream is not closed
 * here. `ctx` must not be NULL.
 */
void lexer_free(Lexer *ctx)
{
    UList *text = ctx->text_buf;

    ulist_free(text);
    free(ctx);
}
/*
 * Compare the next `n` unread characters with the string `s` without
 * consuming anything.
 *
 * Returns 0 when all `n` characters match. Otherwise returns the difference
 * between the first input character that did not match and the corresponding
 * byte of `s`, both taken as unsigned char. The comparison also stops early
 * when `s` ends before `n` characters were compared.
 */
static int cmp_nextn(Lexer *ctx, int n, char *s)
{
    int matched = 0;
    char cur;

    for (;;) {
        /* The peek happens before the bounds check, as callers rely on
         * offsets being probed one after another */
        cur = peekn_char(ctx, matched);

        if (matched >= n)
            break;
        if (s[matched] == '\0')
            break;
        if (cur != s[matched])
            break;

        matched++;
    }

    if (matched == n)
        return 0;

    return (unsigned char)cur - *(unsigned char *)(s + matched);
}
/*
 * Skip consecutive comment lines.
 *
 * While the next character is '#', everything up to and including the
 * terminating newline is consumed. A comment that runs into the end of the
 * input (no trailing newline) ends there; the function returns instead of
 * waiting for a newline that will never come.
 */
static void ignore_comments(Lexer *ctx)
{
    while (peekn_char(ctx, 0) == '#') {
        int c;

        do {
            /* Use the int result so that end of input is always detectable,
             * independent of whether plain char is signed */
            c = peekn_char(ctx, 0);
            if (c == EOF_CHAR)
                return;

            next_char(ctx);
        } while (c != '\n');
    }
}
/*
 * Consume characters for as long as predicate `p` accepts them.
 *
 * ctx: lexer to read from
 * p:   character test; reading stops at the first character it rejects
 * add: when non-zero, every consumed character is appended to the text
 *      buffer, followed by a terminating '\0'
 *
 * Reading also stops at the end of the input. The end is detected on the int
 * result of peekn_char(), so it does not depend on the signedness of plain
 * char, and bytes >= 0x80 are ordinary characters that are handed to `p` as
 * unsigned char values (the range the <ctype.h> functions are defined for).
 */
static void read_while(Lexer *ctx, Predicate p, int add)
{
    int c;

    while ((c = peekn_char(ctx, 0)) != EOF_CHAR && p((unsigned char)c)) {
        if (add)
            add_text_buf(ctx, (char)c);

        next_char(ctx);
    }

    if (add)
        add_text_buf(ctx, '\0');
}
/*
 * Build a token of the given type, stamped with the position recorded for
 * the token currently being read (ctx->tok_pos). Members that are not listed
 * in the initializer are zero-initialized.
 */
static inline Token get_tok(Lexer *ctx, enum TokenType type)
{
    Token tok = {
        .type = type,
        .line = ctx->tok_pos.line,
        .col = ctx->tok_pos.col,
    };

    return tok;
}
/*
 * Read a run of newline characters as a single TOK_NEW_LN token.
 * Returns a TOK_NULL token, consuming nothing, when the next character is
 * not a newline.
 */
static inline Token read_new_line(Lexer *ctx)
{
    Token tok = get_tok(ctx, TOK_NULL);

    if (peek_char(ctx) != '\n')
        return tok;

    /* Collapse consecutive newlines into one token */
    do {
        next_char(ctx);
    } while (peek_char(ctx) == '\n');

    tok.type = TOK_NEW_LN;
    return tok;
}
/*
 * Predicate for characters allowed inside a symbol: letters, digits,
 * underscore and hyphen. Returns 1 on match and 0 otherwise.
 */
static inline int issymbol(int c)
{
    if (c == '_' || c == '-')
        return 1;

    return isalnum(c) != 0;
}
/* Predicate that accepts every character except the double quote. */
static inline int isnotquote(int c)
{
    return c == '"' ? 0 : 1;
}
/*
 * Read a symbol: a letter followed by any number of symbol characters (see
 * issymbol()). The result is a TOK_STR token whose val.s is the symbol text.
 * Returns a TOK_NULL token, consuming nothing, when the next character is
 * not a letter or the input has ended.
 */
static inline Token read_symbol(Lexer *ctx)
{
    int c = peekn_char(ctx, 0);

    /* <ctype.h> functions are only defined for unsigned char values and EOF,
     * so never hand them a possibly negative plain char */
    if (c == EOF_CHAR || !isalpha((unsigned char)c))
        return get_tok(ctx, TOK_NULL);

    record_text(ctx);
    read_while(ctx, issymbol, 1);

    Token tok = get_tok(ctx, TOK_STR);
    tok.val.s = get_text(ctx);

    return tok;
}
/*
 * Read a double-quoted string. The result is a TOK_STR token whose val.s is
 * the text between the quotes.
 *
 * Returns a TOK_NULL token when the next character is not a double quote.
 * In that case nothing is consumed, so the caller still sees the offending
 * character (and can name it in an error message).
 *
 * NOTE(review): a string that is still open at the end of the input is
 * returned as if it had been closed; no error is reported here.
 */
static inline Token read_string(Lexer *ctx)
{
    /* Look before consuming: a failed attempt must leave the input alone */
    if (isnotquote(peek_char(ctx)))
        return get_tok(ctx, TOK_NULL);

    /* Skip opening quote */
    next_char(ctx);

    record_text(ctx);
    read_while(ctx, isnotquote, 1);

    Token tok = get_tok(ctx, TOK_STR);
    tok.val.s = get_text(ctx);

    /* Skip ending quote */
    next_char(ctx);

    return tok;
}
/*
 * Read a decimal integer with an optional leading '-'.
 *
 * Returns a TOK_INT token with the value in val.i. When the
 * LEX_OPT_NUMISTEXT option is set, the digits are returned as a TOK_STR token
 * in val.s instead; that text holds the digits only, a leading '-' is not
 * part of it.
 *
 * Returns a TOK_NULL token, consuming nothing, when the input does not start
 * with a digit or with '-' followed by a digit.
 *
 * NOTE(review): the conversion uses atoi(), which cannot report values that
 * do not fit into an int.
 */
static inline Token read_int(Lexer *ctx)
{
    int positive = 1;

    if (peek_char(ctx) == '-') {
        /* Only take the sign when a digit follows; otherwise a lone '-'
         * would be swallowed by a failed attempt */
        int after = peekn_char(ctx, 1);

        if (after == EOF_CHAR || !isdigit((unsigned char)after))
            return get_tok(ctx, TOK_NULL);

        positive = 0;
        next_char(ctx);
    } else if (!isdigit((unsigned char)peek_char(ctx))) {
        return get_tok(ctx, TOK_NULL);
    }

    record_text(ctx);
    read_while(ctx, isdigit, 1);

    Token tok;
    char *text = get_text(ctx);

    /* If NUMISTEXT option is set, do not convert string to integer */
    if (ctx->opts & LEX_OPT_NUMISTEXT) {
        tok = get_tok(ctx, TOK_STR);
        tok.val.s = text;
        return tok;
    }

    int i = atoi(text);

    if (!positive)
        i = -i;

    tok = get_tok(ctx, TOK_INT);
    tok.val.i = i;

    return tok;
}
/*
 * Try to read the `n`-character literal `s` as a token of type `type`.
 *
 * Returns
 *   - a TOK_EOF token when the input has ended,
 *   - a TOK_NULL token, consuming nothing, when the next `n` characters are
 *     not `s`,
 *   - a token of type `type` after consuming the `n` characters otherwise.
 *
 * The end of the input is checked on the int result of peekn_char(), so that
 * the check works whether plain char is signed or unsigned.
 */
static Token read_punct(Lexer *ctx, int type, char *s, int n)
{
    if (peekn_char(ctx, 0) == EOF_CHAR)
        return get_tok(ctx, TOK_EOF);

    if (cmp_nextn(ctx, n, s) != 0)
        return get_tok(ctx, TOK_NULL);

    Token tok = get_tok(ctx, type);

    skipn_char(ctx, n);

    return tok;
}
/*
 * Try to read the block opening sequence (block_open) as a TOK_BLK_OPEN
 * token; see read_punct() for the possible results.
 */
static inline Token read_block_open(Lexer *ctx)
{
    Token opener = READ_PUNCT(ctx, TOK_BLK_OPEN, block_open);

    return opener;
}
/*
 * Try to read the block closing sequence (block_close) as a TOK_BLK_CLS
 * token; see read_punct() for the possible results.
 */
static inline Token read_block_close(Lexer *ctx)
{
    Token closer = READ_PUNCT(ctx, TOK_BLK_CLS, block_close);

    return closer;
}
/*
 * Read a whole block: opening sequence, raw body, closing sequence.
 *
 * On success the TOK_BLK_OPEN token is returned and two more tokens are put
 * on the token queue for the following lexer_get_token() calls: a TOK_STR
 * token holding the body text, then the TOK_BLK_CLS token.
 *
 * Returns a TOK_NULL token when the input does not start with the opening
 * sequence (this includes the input having ended), and a TOK_ERR token,
 * after reporting "unclosed block", when the input ends inside the block.
 *
 * NOTE(review): on the "unclosed block" path record_text() has been called
 * without a matching get_text(); whether that matters depends on the UList
 * implementation -- confirm.
 */
static Token read_block(Lexer *ctx)
{
    Token open_tok = read_block_open(ctx);

    /* read_block_open() may also answer TOK_EOF; that is not an opened block
     * and must not end up as an "unclosed block" error */
    if (open_tok.type != TOK_BLK_OPEN)
        return get_tok(ctx, TOK_NULL);

    record_text(ctx);

    Token close_tok;

    for (;;) {
        close_tok = read_block_close(ctx);

        if (close_tok.type == TOK_EOF) {
            PARSEERROR(*ctx, "unclosed block");
            return get_tok(ctx, TOK_ERR);
        }

        if (close_tok.type != TOK_NULL)
            break;

        /* Not at the closing sequence yet: the character belongs to the body */
        add_text_buf(ctx, next_char(ctx));
    }

    add_text_buf(ctx, '\0');

    Token body_tok = get_tok(ctx, TOK_STR);
    body_tok.val.s = get_text(ctx);

    add_token_queue(ctx, body_tok);
    add_token_queue(ctx, close_tok);

    return open_tok;
}
/*
 * Call reader `func` on lexer `c` and return its token from the enclosing
 * function unless the token is of type TOK_NULL.
 * Only usable inside a function that returns Token.
 */
#define ATTEMPT_READ(c, func)                \
    do {                                     \
        Token attempt_ = (func)(c);          \
        if (attempt_.type != TOK_NULL)       \
            return attempt_;                 \
    } while (0)
/*
 * If the next character equals `ch`, set the type of `tok` to `type_`,
 * consume the character and return `tok` from the enclosing function.
 * Only usable inside a function that returns Token.
 *
 * The peeked value is kept as int: `ch` may be EOF_CHAR, which cannot be
 * compared reliably once it has been narrowed to a plain char (the comparison
 * never matches where char is unsigned).
 */
#define ATTEMPT_READ_CHAR(ctx, tok, ch, type_)   \
    do {                                         \
        int peeked_ = peekn_char((ctx), 0);      \
        if (peeked_ == (ch)) {                   \
            (tok).type = (type_);                \
            next_char(ctx);                      \
            return (tok);                        \
        }                                        \
    } while (0)
/*
 * Return the next token of the input.
 *
 * Tokens that were queued by an earlier call (block body and block close) are
 * handed out first. Otherwise blanks and comment lines are skipped and the
 * readers are tried in a fixed order; the first one that recognizes the input
 * decides the token. The order matters: symbols are tried before integers,
 * blocks before strings.
 *
 * Returns a TOK_EOF token at the end of the input and a TOK_ERR token, after
 * reporting the character through PARSEERROR, when no reader accepts the
 * input.
 */
Token lexer_get_token(Lexer *ctx)
{
    if (!is_empty_token_queue(ctx))
        return remove_token_queue(ctx);

    /*
     * ignore_comments() stops at the start of the line that follows the
     * comment, so blanks have to be skipped again afterwards; otherwise an
     * indented line after a comment line could not be read.
     */
    for (;;) {
        read_while(ctx, isblank, 0);

        if (peek_char(ctx) != '#')
            break;

        ignore_comments(ctx);
    }

    /* Everything from here on belongs to the token that is about to be read */
    ctx->tok_pos.line = ctx->line;
    ctx->tok_pos.col = ctx->col;

    Token tok = get_tok(ctx, TOK_NULL);

    ATTEMPT_READ_CHAR(ctx, tok, EOF_CHAR, TOK_EOF);
    ATTEMPT_READ_CHAR(ctx, tok, '/', TOK_SLASH);
    ATTEMPT_READ_CHAR(ctx, tok, '*', TOK_STAR);
    ATTEMPT_READ_CHAR(ctx, tok, '.', TOK_DOT);

    ATTEMPT_READ(ctx, read_new_line);
    ATTEMPT_READ(ctx, read_symbol);
    ATTEMPT_READ(ctx, read_int);
    ATTEMPT_READ(ctx, read_block);
    ATTEMPT_READ(ctx, read_string);

    PARSEERROR((*ctx), "cannot handle character: %c", peek_char(ctx));
    return get_tok(ctx, TOK_ERR);
}
/*
 * Return a printable name for a token type.
 *
 * Punctuation tokens are named by their spelling, all other tokens by a
 * description in angle brackets. The returned string is not allocated by this
 * function and must not be freed. A value that is not a known token type is
 * reported through PRINTINTERR and aborts the process.
 */
char *lexer_token_type_str(enum TokenType type)
{
    char *name = NULL;

    switch (type) {
    case TOK_NULL:
        name = "<null>";
        break;
    case TOK_EOF:
        name = "<end of file>";
        break;
    case TOK_ERR:
        name = "<TOKEN ERROR>";
        break;
    case TOK_NEW_LN:
        name = "<newline>";
        break;
    case TOK_BLK_OPEN:
        name = block_open;
        break;
    case TOK_BLK_CLS:
        name = block_close;
        break;
    case TOK_SLASH:
        name = slash;
        break;
    case TOK_STAR:
        name = star;
        break;
    case TOK_DOT:
        name = dot;
        break;
    case TOK_INT:
        name = "<integer>";
        break;
    case TOK_STR:
        name = "<string>";
        break;
    }

    if (name != NULL)
        return name;

    PRINTINTERR("unknown type: %d", type);
    abort();
}