Better UTF-8 support, including combined characters. Unicode data is now stored

as UTF-8 in a separate array; the code does a lookup into this every time it gets to a UTF-8 cell. Zero-width characters are just appended onto the UTF-8 data for the previous cell. This also means that almost no extra bytes are wasted on non-Unicode data (yay). There are still some oddities, such as copy mode skipping over wide characters in a strange way, and the code could do with some tidying.
2025-08-16 00:28:10 +02:00 · 2009-03-28 20:17:29 +00:00
parent 34dd72f008
commit cf7b384c43
12 changed files with 364 additions and 226 deletions
--- a/input.c
+++ b/input.c
@ -1,4 +1,4 @@
-/* $Id: input.c,v 1.75 2009-03-28 16:30:05 nicm Exp $ */
+/* $Id: input.c,v 1.76 2009-03-28 20:17:29 nicm Exp $ */

 /*
 * Copyright (c) 2007 Nicholas Marriott <nicm@users.sourceforge.net>
@ -528,8 +528,6 @@ input_state_string_escape(u_char ch, struct input_ctx *ictx)
 void
 input_state_utf8(u_char ch, struct input_ctx *ictx)
 {
-	u_int	value;
-
 	log_debug2("-- un %zu: %hhu (%c)", ictx->off, ch, ch);

 	ictx->utf8_buf[ictx->utf8_off++] = ch;
@ -537,14 +535,9 @@ input_state_utf8(u_char ch, struct input_ctx *ictx)
 		return;
 	input_state(ictx, input_state_first);

-	value = utf8_combine(ictx->utf8_buf);
-	if (value > 0xffff)	/* non-BMP not supported */
-		value = '_';
-
-	ictx->text = value;
- 	ictx->cell.flags |= GRID_FLAG_UTF8;
-	screen_write_cell(&ictx->ctx, &ictx->cell, ictx->text);
- 	ictx->cell.flags &= ~GRID_FLAG_UTF8;
+	ictx->cell.flags |= GRID_FLAG_UTF8;
+	screen_write_cell(&ictx->ctx, &ictx->cell, ictx->utf8_buf);
+	ictx->cell.flags &= ~GRID_FLAG_UTF8;
 }

 void
@ -585,8 +578,8 @@ input_handle_character(u_char ch, struct input_ctx *ictx)
 	}
 	log_debug2("-- ch %zu: %hhu (%c)", ictx->off, ch, ch);

-	ictx->text = ch;
-	screen_write_cell(&ictx->ctx, &ictx->cell, ictx->text);
+	ictx->cell.data = ch;
+	screen_write_cell(&ictx->ctx, &ictx->cell, ictx->utf8_buf);
 }

 void