diff --git a/src/Makefile b/src/Makefile
index c768193..3d580e8 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -26,9 +26,9 @@ GEN_HDR    = parser.h boxes.h
 GEN_SRC    = parser.c lex.yy.c
 GEN_FILES  = $(GEN_SRC) $(GEN_HDR)
 ORIG_HDRCL = boxes.h.in config.h
-ORIG_HDR   = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h
+ORIG_HDR   = $(ORIG_HDRCL) lexer.h tools.h shape.h generate.h remove.h unicode.h
 ORIG_GEN   = lexer.l parser.y
-ORIG_NORM  = boxes.c tools.c shape.c generate.c remove.c
+ORIG_NORM  = boxes.c tools.c shape.c generate.c remove.c unicode.c
 ORIG_SRC   = $(ORIG_GEN) $(ORIG_NORM)
 ORIG_FILES = $(ORIG_SRC) $(ORIG_HDR)
 OTH_FILES  = Makefile
@@ -47,7 +47,7 @@ debug: flags_$(BOXES_PLATFORM)
 
 boxes: $(ALL_OBJ)
 	$(MAKE) -C regexp CC=$(CC) libregexp.a
-	$(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lregexp
+	$(CC) $(LDFLAGS) $(ALL_OBJ) -o $(BOXES_EXECUTABLE_NAME) -lunistring -lpcre2-32 -lregexp
 	if [ "$(STRIP)" = "true" ] ; then strip $(BOXES_EXECUTABLE_NAME) ; fi
 
 boxes.exe: $(ALL_OBJ)
@@ -81,8 +81,9 @@ lex.yy.c: lexer.l boxes.h
 	rm lexer.tmp.c
 
 
-boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h generate.h remove.h config.h
-tools.o: tools.c tools.h boxes.h shape.h config.h
+boxes.o: boxes.c boxes.h regexp/regexp.h shape.h tools.h unicode.h generate.h remove.h config.h
+tools.o: tools.c tools.h boxes.h shape.h unicode.h config.h
+unicode.o: unicode.c unicode.h config.h
 shape.o: shape.c shape.h boxes.h config.h tools.h
 generate.o: generate.c generate.h boxes.h shape.h tools.h config.h
 remove.o: remove.c remove.h boxes.h shape.h tools.h config.h
diff --git a/src/boxes.c b/src/boxes.c
index a679f3b..4e856ea 100644
--- a/src/boxes.c
+++ b/src/boxes.c
@@ -21,6 +21,7 @@
 #include "config.h"
 #include <errno.h>
 #include <limits.h>
+#include <locale.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -29,6 +30,7 @@
 #include <sys/stat.h>
 
 #include <uniconv.h>
+#include <unictype.h>
 #include <unistdio.h>
 #include <unistr.h>
 #include <unitypes.h>
@@ -42,6 +44,7 @@
 #include "regexp.h"
 #include "generate.h"
 #include "remove.h"
+#include "unicode.h"
 
 #ifdef __MINGW32__
     #include <windows.h>
@@ -92,14 +95,6 @@ int anz_designs = 0;                     /* no of designs after parsing */
 
 opt_t opt;                               /* command line options */
 
-char *encoding;                          /* the character encoding that we use */
-
-ucs4_t char_tab     = 0x00000009;        /* ucs4_t character '\t' (tab)  */
-ucs4_t char_space   = 0x00000020;        /* ucs4_t character ' '  (space) */
-ucs4_t char_cr      = 0x0000000d;        /* ucs4_t character '\r' (carriage return) */
-ucs4_t char_newline = 0x0000000a;        /* ucs4_t character '\n' (newline) */
-ucs4_t char_nul     = 0x00000000;        /* ucs4_t character '\0' (zero) */
-
 input_t input = INPUT_INITIALIZER;       /* input lines */
 
 
@@ -1175,12 +1170,12 @@ static int list_styles()
 
 
 
-static int get_indent (const line_t *lines, const size_t lanz)
+static int get_indent (const line_t *lines, const size_t lines_size)
 /*
  *  Determine indentation of given lines in spaces.
  *
- *      lines   the lines to examine
- *      lanz    number of lines to examine
+ *      lines      the lines to examine
+ *      lines_size number of lines to examine
  *
  *  Lines are assumed to be free of trailing whitespace.
  *
@@ -1190,31 +1185,32 @@ static int get_indent (const line_t *lines, const size_t lanz)
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  */
 {
-    size_t j;
-    int    res = LINE_MAX_BYTES;               /* result */
-    int    nonblank = 0;                 /* true if one non-blank line found */
+    int res = LINE_MAX_BYTES;  /* result */
+    int nonblank = 0;          /* true if one non-blank line found */
 
     if (lines == NULL) {
-        fprintf (stderr, "%s: internal error\n", PROJECT);
+        fprintf(stderr, "%s: internal error\n", PROJECT);
         return -1;
     }
-    if (lanz == 0)
+    if (lines_size == 0) {
         return 0;
+    }
 
-    for (j=0; j<lanz; ++j) {
+    for (size_t j = 0; j < lines_size; ++j) {
         if (lines[j].len > 0) {
-            size_t ispc;
             nonblank = 1;
-            ispc = strspn (lines[j].text, " ");
-            if ((int) ispc < res)
+            size_t ispc = strspn(lines[j].text, " ");
+            if ((int) ispc < res) {
                 res = ispc;
+            }
         }
     }
 
-    if (nonblank)
-        return res;                      /* success */
-    else
-        return 0;                        /* success, but only blank lines */
+    if (nonblank) {
+        return res;            /* success */
+    } else {
+        return 0;              /* success, but only blank lines */
+    }
 }
 
 
@@ -1327,12 +1323,12 @@ static int apply_substitutions (const int mode)
 
 
 
-static int has_linebreak (const char *s, const int len)
+static int has_linebreak (const uint32_t *s, const int len)
 /*
  *  Determine if the given line of raw text is ended by a line break.
  *
  *  s: the string to check
- *  len: length of s
+ *  len: length of s in characters
  *
  *  RETURNS:  != 0   line break found
  *            == 0   line break not found
@@ -1342,10 +1338,10 @@ static int has_linebreak (const char *s, const int len)
 {
     int result = 0;
     if (s != NULL && len > 0) {
-        char the_last = s[len - 1];
-        result = the_last == '\r' || the_last == '\n';
+        ucs4_t the_last = s[len - 1];
+        result = u32_cmp(&char_cr, &the_last, 1) == 0 || u32_cmp(&char_newline, &the_last, 1) == 0;
         #if defined(DEBUG)
-            fprintf(stderr, "has_linebreak: (%d) %d\n", the_last, result);
+            fprintf(stderr, "has_linebreak: (%#010x) %d\n", (int) the_last, result);
         #endif
     }
     return result;
@@ -1353,6 +1349,86 @@ static int has_linebreak (const char *s, const int len)
 
 
 
+static size_t count_invisible_chars(const uint32_t *s, const size_t buflen, size_t *num_esc, char **ascii)
+/*
+ *  Count the characters of s that belong to ANSI escape sequences and build an ASCII rendition of s.
+ *
+ *      s        the zero-terminated UTF-32 string to examine, may be NULL or empty
+ *      buflen   minimum size of the ASCII buffer in bytes; a larger buffer is allocated if s needs it
+ *      num_esc  result: number of ESC characters (0x1b) found outside of an escape sequence, i.e. the
+ *               number of escape sequences started
+ *      ascii    result: newly allocated, zero-terminated string which the caller must free. Printable
+ *               ASCII characters are copied, other characters are replaced by one 'x' per column
+ *               reported by uc_width(), and escape sequences are left out. NULL if out of memory.
+ *
+ *  RETURNS:  number of characters that are part of an escape sequence ("invisible" characters)
+ *
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ */
+{
+    size_t invis = 0;  /* counts invisible characters */
+    int ansipos = 0;   /* progression of ansi sequence: 0 = outside, 1 = after ESC, 2 = inside CSI */
+    *num_esc = 0;      /* counts the number of escape sequences found */
+
+    if (is_empty(s)) {
+        (*ascii) = (char *) strdup("");
+        return 0;
+    }
+
+    /* Every byte written below stands for one column counted by u32_strwidth(), so that width plus the
+     * terminator always suffices. buflen alone can be too small, e.g. when tabs in s have been expanded. */
+    size_t capacity = (size_t) u32_strwidth(s, encoding) + 1;
+    if (capacity < buflen) {
+        capacity = buflen;
+    }
+    (*ascii) = (char *) calloc(capacity, sizeof(char));
+    if (*ascii == NULL) {
+        perror(PROJECT);
+        return 0;
+    }
+    char *p = *ascii;
+
+    ucs4_t c;
+    const uint32_t *rest = s;
+    while ((rest = u32_next(&c, rest))) {
+        if (ansipos == 0 && c == 0x0000001b) {
+            /* Found an ESC char, count it as invisible and move 1 forward in the detection of CSI sequences */
+            ansipos++;
+            invis++;
+            (*num_esc)++;
+        } else if (ansipos == 1 && c == '[') {
+            /* Found '[' char after ESC. A CSI sequence has started. */
+            ansipos++;
+            invis++;
+        } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
+            /* Found a byte designating the end of a two-byte escape sequence */
+            invis++;
+            ansipos = 0;
+        } else if (ansipos == 2) {
+            /* Inside CSI sequence - Keep counting bytes as invisible */
+            invis++;
+
+            /* A char between 0x40 and 0x7e signals the end of a CSI or escape sequence */
+            if (c >= 0x40 && c <= 0x7e) {
+                ansipos = 0;
+            }
+        } else if (is_ascii_printable(c)) {
+            *p = c & 0xff;
+            ++p;
+        } else {
+            int cols = uc_width(c, encoding);
+            if (cols > 0) {
+                memset(p, (int) 'x', cols);
+                p += cols;
+            }
+        }
+    }
+    *p = '\0';
+    return invis;
+}
+
+
+
 static int read_all_input (const int use_stdin)
 /*
  *  Read entire input (possibly from stdin) and store it in 'input' array.
@@ -1369,18 +1419,12 @@ static int read_all_input (const int use_stdin)
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  */
 {
-    char     buf[LINE_MAX_BYTES + 2];    /* input buffer */
-    size_t   len_bytes;
-    char     c;
-    size_t   invis;                      /* counts invisible characters */
-    int      ansipos;                    /* progression of ansi sequence */
-    size_t   input_size = 0;             /* number of elements allocated */
-    line_t  *tmp = NULL;
-    char    *temp = NULL;                /* string resulting from tab exp. */
-    uint8_t *mbtemp = NULL;              /* temp string for preparing the multi-byte input */
-    size_t   newlen;                     /* line length after tab expansion */
-    size_t   i;
-    int      rc;
+    char buf[LINE_MAX_BYTES + 2];    /* input buffer */
+    size_t len_chars;
+    size_t input_size = 0;           /* number of elements allocated */
+    uint32_t *mbtemp = NULL;         /* temp string for preparing the multi-byte input */
+    size_t i;
+    int rc;
 
     input.indent = LINE_MAX_BYTES;
     input.maxline = 0;
@@ -1391,96 +1435,72 @@ static int read_all_input (const int use_stdin)
         /*
          *  Start reading
          */
-        while (fgets (buf, LINE_MAX_BYTES+1, opt.infile))
-        {
+        while (fgets(buf, LINE_MAX_BYTES + 1, opt.infile)) {
             if (input_size % 100 == 0) {
                 input_size += 100;
-                tmp = (line_t *) realloc (input.lines, input_size*sizeof(line_t));
+                line_t *tmp = (line_t *) realloc(input.lines, input_size * sizeof(line_t));
                 if (tmp == NULL) {
-                    perror (PROJECT);
+                    perror(PROJECT);
                     BFREE (input.lines);
                     return 1;
                 }
                 input.lines = tmp;
             }
 
-            len_bytes = strlen(buf);
-            mbtemp = u8_strconv_from_locale(buf);
-            input.lines[input.anz_lines].len = u8_strwidth(mbtemp, encoding);
-            input.lines[input.anz_lines].num_leading_blanks = 0;
-            input.final_newline = has_linebreak(buf, len_bytes);
+            mbtemp = u32_strconv_from_locale(buf);
+            len_chars = u32_strlen(mbtemp);
+            input.final_newline = has_linebreak(mbtemp, len_chars);
 
             if (opt.r) {
-                input.lines[input.anz_lines].len -= 1;    /* TODO HERE */
-                if (buf[input.lines[input.anz_lines].len] == '\n')
-                    buf[input.lines[input.anz_lines].len] = '\0';
+                if (is_char_at(mbtemp, len_chars - 1, char_newline)) {
+                    set_char_at(mbtemp, len_chars - 1, char_nul);
+                    --len_chars;
+                }
             }
             else {
-                btrim (buf, &(input.lines[input.anz_lines].len));
+                btrim32(mbtemp, &len_chars);
             }
 
-            if (input.lines[input.anz_lines].len > 0) {
-                newlen = expand_tabs_into (buf,
-                        input.lines[input.anz_lines].len, opt.tabstop, &temp,
-                        &(input.lines[input.anz_lines].tabpos),
-                        &(input.lines[input.anz_lines].tabpos_len));
-                if (newlen == 0) {
-                    perror (PROJECT);
+            /*
+             * Expand tabs
+             */
+            if (len_chars > 0) {
+                uint32_t *temp = NULL;
+                len_chars = expand_tabs_into(mbtemp, opt.tabstop, &temp,
+                                             &(input.lines[input.anz_lines].tabpos),
+                                             &(input.lines[input.anz_lines].tabpos_len));
+                if (len_chars == 0) {
+                    perror(PROJECT);
                     BFREE (input.lines);
                     return 1;
                 }
-                input.lines[input.anz_lines].text = temp;
-                input.lines[input.anz_lines].len = newlen;
+                input.lines[input.anz_lines].mbtext = temp;
                 temp = NULL;
             }
             else {
-                input.lines[input.anz_lines].text = (char *) strdup (buf);
+                input.lines[input.anz_lines].mbtext = mbtemp;
             }
+            input.lines[input.anz_lines].num_chars = len_chars;
 
             /*
              * Find ANSI CSI/ESC sequences
              */
-            invis = 0;
-            ansipos = 0;
-            for (i=0; i<input.lines[input.anz_lines].len; ++i) {
-                c = input.lines[input.anz_lines].text[i];
-                if (ansipos == 0 && c == 0x1b){
-                    /* Found an ESC char, count it as invisible and move 1 forward in the
-                     * detection of CSI sequences */
-                    ansipos++;
-                    invis++;
-                } else if (ansipos == 1 && c == '[') {
-                    /* Found '[' char after ESC. A CSI sequence has started. */
-                    ansipos++;
-                    invis++;
-                } else if (ansipos == 1 && c >= 0x40 && c <= 0x5f) {
-                    /* Found a byte designating the end of a two-byte
-                     * escape sequence */
-                    invis++;
-                    ansipos = 0;
-                } else if (ansipos == 2) {
-                    /* Inside CSI sequence - Keep counting bytes as invisible */
-                    invis++;
-
-                    /* A char between 0x40 and 0x7e signals the end of an CSI or escape sequence */
-                    if (c >= 0x40 && c <= 0x7e)
-                        ansipos = 0;
-                }
-            }
-
-            /* Save the count of invisible chars and visible chars.
-             * I'm happy about suggestions for a more elegant handling
-             * of this and the use of .invis and .vischar (and .len)
-             * in the other functions.
-             */
+            size_t num_esc = 0;
+            size_t invis = count_invisible_chars(input.lines[input.anz_lines].mbtext, strlen(buf), &num_esc,
+                                                 &(input.lines[input.anz_lines].text));
             input.lines[input.anz_lines].invis = invis;
-            input.lines[input.anz_lines].vischar = input.lines[input.anz_lines].len - invis;
+            input.lines[input.anz_lines].vischar = len_chars - invis;
+
+            /* u32_strwidth() does not count control characters, i.e. ESC characters, for which we must correct */
+            input.lines[input.anz_lines].len =
+                    u32_strwidth(input.lines[input.anz_lines].mbtext, encoding) - invis + num_esc;
+            input.lines[input.anz_lines].num_leading_blanks = 0;
 
             /*
              *  Update length of longest line
              */
-            if (input.lines[input.anz_lines].vischar > input.maxline) {
-                input.maxline = input.lines[input.anz_lines].vischar;
+            if (input.lines[input.anz_lines].len > input.maxline) {
+                input.maxline = input.lines[input.anz_lines].len;
             }
 
             /*
@@ -1489,8 +1509,8 @@ static int read_all_input (const int use_stdin)
             ++input.anz_lines;
         }
 
-        if (ferror (stdin)) {
-            perror (PROJECT);
+        if (ferror(stdin)) {
+            perror(PROJECT);
             BFREE (input.lines);
             return 1;
         }
@@ -1498,39 +1518,51 @@ static int read_all_input (const int use_stdin)
 
     else {
         /* recalculate input statistics for redrawing the mended box */
-        for (i=0; i<input.anz_lines; ++i) {
-            input.lines[i].len = strlen (input.lines[i].text);
-            if (input.lines[i].len > input.maxline)
+        for (i = 0; i < input.anz_lines; ++i) {
+            size_t num_esc = 0;
+            char *dummy;
+            size_t invis = count_invisible_chars(input.lines[i].mbtext, strlen(input.lines[i].text), &num_esc, &dummy);
+            BFREE(dummy);
+            input.lines[i].len = u32_strwidth(input.lines[i].mbtext, encoding) - invis + num_esc;
+            input.lines[i].num_chars = u32_strlen(input.lines[i].mbtext);
+            if (input.lines[i].len > input.maxline) {
                 input.maxline = input.lines[i].len;
+            }
         }
     }
 
     /*
      *  Exit if there was no input at all
      */
-    if (input.lines == NULL || input.lines[0].text == NULL)
+    if (input.lines == NULL || input.lines[0].text == NULL) {
         return 0;
+    }
 
     /*
      *  Compute indentation
      */
-    rc = get_indent (input.lines, input.anz_lines);
-    if (rc >= 0)
+    rc = get_indent(input.lines, input.anz_lines);
+    if (rc >= 0) {
         input.indent = (size_t) rc;
-    else
+    } else {
         return 1;
+    }
 
     /*
      *  Remove indentation, unless we want to preserve it (when removing
      *  a box or if the user wants to retain it inside the box)
      */
     if (opt.design->indentmode != 't' && opt.r == 0) {
-        for (i=0; i<input.anz_lines; ++i) {
-            if (input.lines[i].len >= input.indent) {
-                memmove (input.lines[i].text, input.lines[i].text+input.indent,
-                        input.lines[i].len-input.indent+1);
+        for (i = 0; i < input.anz_lines; ++i) {
+            if (input.lines[i].num_chars >= input.indent) {
+                memmove(input.lines[i].text, input.lines[i].text + input.indent,
+                        input.lines[i].len - input.indent + 1);
                 input.lines[i].len -= input.indent;
                 input.lines[i].vischar -= input.indent;
+
+                u32_move(input.lines[i].mbtext, input.lines[i].mbtext + input.indent,
+                         input.lines[i].num_chars - input.indent + 1);
+                input.lines[i].num_chars -= input.indent;
             }
         }
         input.maxline -= input.indent;
@@ -1540,32 +1572,38 @@ static int read_all_input (const int use_stdin)
      *  Apply regular expression substitutions
      */
     if (opt.r == 0) {
-        if (apply_substitutions(0) != 0)
+        if (apply_substitutions(0) != 0) { // TODO HERE
             return 1;
+        }
     }
 
-#if 0
+#if defined(DEBUG)
     /*
      *  Debugging Code: Display contents of input structure
      */
+    fprintf (stderr, "Encoding: %s\n", encoding);
+    fprintf (stderr, "Input Lines:\n");
+    fprintf (stderr, "     [num_chars] \"real text\" [num_cols] \"ascii_text\"\n");
     for (i=0; i<input.anz_lines; ++i) {
-        fprintf (stderr, "%3d [%02d] \"%s\"", i, input.lines[i].len,
-                input.lines[i].text);
+        fprintf (stderr, "%4d [%02d] \"%s\"  [%02d] \"%s\"", (int) i,
+                 (int) input.lines[i].num_chars, u32_strconv_to_locale(input.lines[i].mbtext),
+                 (int) input.lines[i].len, input.lines[i].text);
         fprintf (stderr, "\tTabs: [");
         if (input.lines[i].tabpos != NULL) {
             size_t j;
             for (j=0; j<input.lines[i].tabpos_len; ++j) {
-                fprintf (stderr, "%d", input.lines[i].tabpos[j]);
+                fprintf (stderr, "%d", (int) input.lines[i].tabpos[j]);
                 if (j < input.lines[i].tabpos_len - 1) {
                     fprintf (stderr, ", ");
                 }
             }
         }
-        fprintf (stderr, "] (%d)\n", input.lines[i].tabpos_len);
+        fprintf (stderr, "] (%d)", (int) input.lines[i].tabpos_len);
+        fprintf (stderr, "\tvis=%d, invis=%d\n", (int) input.lines[i].vischar, (int) input.lines[i].invis);
     }
-    fprintf (stderr, "\n Longest line: %d characters.\n", input.maxline);
-    fprintf (stderr, "  Indentation: %2d spaces.\n", input.indent);
-    fprintf (stderr, "Final newline:  %d.\n", input.final_newline);
+    fprintf (stderr, " Longest line: %d columns\n", (int) input.maxline);
+    fprintf (stderr, "  Indentation: %2d spaces\n", (int) input.indent);
+    fprintf (stderr, "Final newline: %s\n", input.final_newline ? "yes" : "no");
 #endif
 
     return 0;
@@ -1607,6 +1645,9 @@ int main (int argc, char *argv[])
      */
     setlocale(LC_ALL, "");    /* switch from default "C" encoding to system encoding */
     encoding = locale_charset();
+    #ifdef DEBUG
+        fprintf (stderr, "Character Encoding = %s\n", encoding);
+    #endif
 
     /*
      *  Parse config file, then reset design pointer
diff --git a/src/boxes.h.in b/src/boxes.h.in
index 522a0ee..3a72217 100644
--- a/src/boxes.h.in
+++ b/src/boxes.h.in
@@ -145,26 +145,19 @@ typedef struct {                         /* Command line options: */
 extern opt_t opt;
 
 
-extern char *encoding;                   /* the character encoding that we use */
-
-extern ucs4_t char_tab;                  /* ucs4_t character '\t' (tab)  */
-extern ucs4_t char_space;                /* ucs4_t character ' '  (space) */
-extern ucs4_t char_cr;                   /* ucs4_t character '\r' (carriage return) */
-extern ucs4_t char_newline;              /* ucs4_t character '\n' (newline) */
-extern ucs4_t char_nul;                  /* ucs4_t character '\0' (zero) */
-
-
 typedef struct {
-    size_t   len;                        /* length of text in columns (character positions in a text terminal) */
-    char    *text;                       /* ASCII line content, tabs expanded, multi-byte chars replaced with 'x' */
-    uint8_t *mbtext;                     /* multi-byte (original) line content, tabs expanded. We use UTF-8 so that our old regex code can find ASCII characters in it. */
-    size_t   invis;                      /* number of characters part of an ansi sequence */
-    size_t   vischar;                    /* number of normal printable characters */
-    size_t  *tabpos;                     /* tab positions in expanded work strings */
-    size_t   tabpos_len;                 /* number of tabs in a line */
-    size_t   num_leading_blanks;         /* number of spaces at the start of the line after justification */
+    size_t    len;                       /* length of text in columns (character positions in a text terminal) */
+    char     *text;                      /* ASCII rendition of mbtext, tabs expanded, each multi-byte char replaced with one 'x' per column it occupies; read_all_input() leaves ANSI sequences out of it */
+    uint32_t *mbtext;                    /* multi-byte (original) line content, tabs expanded. We use UTF-32 in order to enable pointer arithmetic. */
+    size_t    num_chars;                 /* number of characters in mbtext, visible + invisible */
+    size_t    invis;                     /* number of characters part of an ansi sequence (aka "invisible") */
+    size_t    vischar;                   /* number of normal printable characters (aka "visible") */
+    size_t   *tabpos;                    /* tab positions in expanded work strings, or NULL if not needed */
+    size_t    tabpos_len;                /* number of tabs in a line */
+    size_t    num_leading_blanks;        /* number of spaces at the start of the line after justification */
 } line_t;
 
+
 #ifndef FILE_LEXER_L
 typedef struct {
     line_t *lines;
diff --git a/src/tools.c b/src/tools.c
index 9f28d9f..21a28e7 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -30,11 +30,13 @@
 #include <string.h>
 #include <strings.h>
 
+#include <unictype.h>
 #include <unistr.h>
 #include <unitypes.h>
 
 #include "shape.h"
 #include "boxes.h"
+#include "unicode.h"
 #include "tools.h"
 
 
@@ -239,13 +241,12 @@ int empty_line(const line_t *line)
 
 
 
-size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
-                        const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len)
+size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text,
+                        size_t **tabpos, size_t *tabpos_len)
 /*
  *  Expand tab chars in input_buffer and store result in text.
  *
  *  input_buffer   Line of text with tab chars
- *  in_len         length of the string in input_buffer in characters
  *  tabstop        tab stop distance
  *  text           address of the pointer that will take the result
  *  tabpos         array of ints giving the positions of the first
@@ -262,18 +263,21 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
  */
 {
     static uint32_t temp[LINE_MAX_BYTES * MAX_TABSTOP + 1];  /* work string */
-    size_t io;       /* character position in work string */
-    size_t tabnum;   /* index of the current tab */
+    size_t io;         /* character position in work string */
+    size_t tabnum = 0; /* index of the current tab */
 
     *text = NULL;
 
     if (opt.tabexp != 'k') {
+        /* We need to know the exact tab positions only if expansion type 'k' is requested (keep tabs as much as they
+         * were as possible). Else we'll just convert spaces and tabs without having to know where exactly the tabs
+         * were in the first place. */
         *tabpos_len = 0;
     } else {
         ucs4_t puc;
         const uint32_t *rest = input_buffer;
-        while (rest = u32_next(&puc, rest)) {
-            if (u32_cmp(&char_tab, &puc, 1) == 0) {
+        while ((rest = u32_next(&puc, rest))) {
+            if (puc == char_tab) {
                 (*tabpos_len)++;
             }
         }
@@ -289,8 +293,8 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
     ucs4_t puc;
     const uint32_t *rest = input_buffer;
     io = 0;
-    while (rest = u32_next(&puc, rest)) {
-        if (u32_cmp(&char_tab, &puc, 1) == 0) { /* Is it a tab char? */
+    while ((rest = u32_next(&puc, rest))) {
+        if (puc == char_tab) {
             if (*tabpos_len > 0) {
                 (*tabpos)[tabnum++] = io;
             }
@@ -299,7 +303,7 @@ size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
             io += num_spc;
         }
         else {
-            u32_set(temp + io, puc, 1);
+            set_char_at(temp, io, puc);
             ++io;
         }
     }
@@ -335,6 +339,34 @@ void btrim(char *text, size_t *len)
 
 
 
+void btrim32(uint32_t *text, size_t *len)
+/*
+ *  Strip trailing whitespace from a UTF-32 line by overwriting it with NUL characters.
+ *
+ *      text     the string to trim (modified in place)
+ *      len      in: number of characters in text; out: number of characters left after trimming
+ *
+ *  A character is trimmed if one of uc_is_c_whitespace(), uc_is_property_white_space() or
+ *  uc_is_property_bidi_whitespace() accepts it. Trimming stops at the first character that none accepts.
+ *
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ */
+{
+    int last = (int) (*len - 1);         /* index of the last character still kept, -1 if none */
+
+    while (last >= 0) {
+        const ucs4_t c = text[last];
+        if (!uc_is_c_whitespace(c) && !uc_is_property_white_space(c) && !uc_is_property_bidi_whitespace(c)) {
+            break;
+        }
+        set_char_at(text, last, char_nul);
+        --last;
+    }
+    *len = last + 1;
+}
+
+
+
 char *my_strnrstr(const char *s1, const char *s2, const size_t s2_len, int skip)
 /*
  *  Return pointer to last occurrence of string s2 in string s1.
diff --git a/src/tools.h b/src/tools.h
index b3ab5af..35e0516 100644
--- a/src/tools.h
+++ b/src/tools.h
@@ -43,11 +43,13 @@ void regerror(char *msg);
 
 int empty_line(const line_t *line);
 
-size_t expand_tabs_into(const uint32_t *input_buffer, const size_t in_len,
-                        const int tabstop, uint32_t **text, size_t **tabpos, size_t *tabpos_len);
+size_t expand_tabs_into(const uint32_t *input_buffer, const int tabstop, uint32_t **text,
+                        size_t **tabpos, size_t *tabpos_len);
 
 void btrim(char *text, size_t *len);
 
+void btrim32(uint32_t *text, size_t *len);
+
 char *my_strnrstr(const char *s1, const char *s2, const size_t s2_len,
                   int skip);
 
diff --git a/src/unicode.c b/src/unicode.c
new file mode 100644
index 0000000..38d9ad7
--- /dev/null
+++ b/src/unicode.c
@@ -0,0 +1,96 @@
+/*
+ * boxes - Command line filter to draw/remove ASCII boxes around text
+ * Copyright (C) 1999  Thomas Jensen and the boxes contributors
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License, version 2, as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ */
+
+/*
+ * Functions and constants for handling unicode strings with libunistring.
+ */
+
+#include "config.h"
+#include <errno.h>
+#include <stdlib.h>
+
+#include <unictype.h>
+#include <unistr.h>
+#include <unitypes.h>
+
+#include "unicode.h"
+
+
+const char *encoding;                          /* name of the character encoding that we use; main() assigns it from locale_charset() */
+
+const ucs4_t char_tab     = 0x00000009;        /* ucs4_t character '\t' (tab)  */
+const ucs4_t char_space   = 0x00000020;        /* ucs4_t character ' '  (space) */
+const ucs4_t char_cr      = 0x0000000d;        /* ucs4_t character '\r' (carriage return) */
+const ucs4_t char_newline = 0x0000000a;        /* ucs4_t character '\n' (newline) */
+const ucs4_t char_nul     = 0x00000000;        /* ucs4_t character '\0' (zero) */
+
+
+
+/**
+ * Compare a single character of a UTF-32 string against an expected value.
+ *
+ * @param <text> the string to look into; NULL never matches
+ * @param <idx> position of the character to compare (not range-checked here)
+ * @param <expected_char> the character value to compare against
+ * @return 1 if text is not NULL and holds expected_char at position idx, 0 otherwise
+ */
+int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char)
+{
+    return (text == NULL) ? 0 : (u32_cmp(&expected_char, text + idx, 1) == 0);
+}
+
+
+
+/**
+ * Overwrite one character of a UTF-32 string by delegating to u32_set().
+ *
+ * @param <text> the string to modify (no NULL check is made)
+ * @param <idx> position of the character to overwrite (not range-checked here)
+ * @param <char_to_set> the character value to store
+ */
+void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set)
+{
+    u32_set(&text[idx], char_to_set, 1);
+}
+
+
+
+/**
+ *  Tell whether a UTF-32 string holds no characters at all.
+ *
+ *  @param <text> the zero-terminated string to check, may be NULL
+ *  @return 1: text is NULL or starts with the terminating zero
+ *          0: text contains at least 1 character
+ */
+int is_empty(const uint32_t *text)
+{
+    return (text != NULL && !is_char_at(text, 0, char_nul)) ? 0 : 1;
+}
+
+
+
+int is_ascii_printable(const ucs4_t c)    /* 1 if c lies in the printable ASCII range 0x20 (space) .. 0x7e ('~'), else 0 */
+{
+    return (c > 0x1f && c <= 0x7e) ? 1 : 0;
+}
+
+
+
+/*EOF*/                                                  /* vim: set sw=4: */
diff --git a/src/unicode.h b/src/unicode.h
new file mode 100644
index 0000000..8c8d165
--- /dev/null
+++ b/src/unicode.h
@@ -0,0 +1,58 @@
+/*
+ * boxes - Command line filter to draw/remove ASCII boxes around text
+ * Copyright (C) 1999  Thomas Jensen and the boxes contributors
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License, version 2, as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ */
+
+/*
+ * Functions and constants for handling unicode strings with libunistring.
+ */
+
+#ifndef UNICODE_H
+#define UNICODE_H
+
+/* Keep this header self-contained: size_t comes from <stddef.h>, ucs4_t and uint32_t from <unitypes.h>. */
+#include <stddef.h>
+
+#include <unitypes.h>
+
+extern const char *encoding;                   /* the character encoding that we use */
+
+extern const ucs4_t char_tab;                  /* ucs4_t character '\t' (tab)  */
+extern const ucs4_t char_space;                /* ucs4_t character ' '  (space) */
+extern const ucs4_t char_cr;                   /* ucs4_t character '\r' (carriage return) */
+extern const ucs4_t char_newline;              /* ucs4_t character '\n' (newline) */
+extern const ucs4_t char_nul;                  /* ucs4_t character '\0' (zero) */
+
+
+
+/* Returns != 0 if text is not NULL and the character at index idx equals expected_char. */
+int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);
+
+/* Stores char_to_set at index idx of text. */
+void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set);
+
+/* Returns != 0 if text is NULL or an empty string. */
+int is_empty(const uint32_t *text);
+
+/* Returns != 0 if c is a printable ASCII character (0x20 to 0x7e). */
+int is_ascii_printable(const ucs4_t c);
+
+
+#endif
+
+/*EOF*/                                          /* vim: set cindent sw=4: */