Extend bxstring functionality

This commit is contained in:
Thomas Jensen 2023-03-31 21:00:00 +02:00
parent 5026357fc2
commit 4ff37eb5ba
No known key found for this signature in database
GPG Key ID: A4ACEE270D0FB7DB
9 changed files with 1607 additions and 125 deletions

View File

@ -36,10 +36,6 @@ bxstr_t *bxs_from_ascii(char *pAscii)
bx_fprintf(stderr, "%s: internal error: from_ascii() called with NULL\n", PROJECT);
return NULL;
}
if (strchr(pAscii, '\t') != NULL) {
bx_fprintf(stderr, "%s: internal error: from_ascii() called with tabs: \"%s\"\n", PROJECT, pAscii);
return NULL;
}
bxstr_t *result = (bxstr_t *) calloc(1, sizeof(bxstr_t));
result->memory = u32_strconv_from_arg(pAscii, "ASCII");
@ -49,6 +45,14 @@ bxstr_t *bxs_from_ascii(char *pAscii)
}
result->ascii = strdup(pAscii);
size_t error_pos = 0;
if (!bxs_valid_anywhere(result, &error_pos)) {
ucs4_t c = result->memory[error_pos];
bx_fprintf(stderr, "%s: illegal character '%lc' (%#010x) encountered in string\n", PROJECT, c, (int) c);
bxs_free(result);
return NULL;
}
size_t num_esc = 0;
char *ascii_copy;
size_t *map;
@ -62,9 +66,9 @@ bxstr_t *bxs_from_ascii(char *pAscii)
result->indent = strspn(pAscii, " ");
result->trailing = my_strrspn(pAscii, " ");
result->first_char = malloc(result->num_chars_visible * sizeof(size_t));
result->visible_char = malloc(result->num_chars_visible * sizeof(size_t));
for (size_t i = 0; i < result->num_chars_visible; i++) {
result->first_char = calloc(result->num_chars_visible + 1, sizeof(size_t));
result->visible_char = calloc(result->num_chars_visible + 1, sizeof(size_t));
for (size_t i = 0; i <= result->num_chars_visible; i++) {
result->first_char[i] = i;
result->visible_char[i] = i;
}
@ -77,14 +81,14 @@ bxstr_t *bxs_from_ascii(char *pAscii)
bxstr_t *bxs_from_unicode(uint32_t *pInput)
{
if (pInput == NULL) {
bx_fprintf(stderr, "%s: internal error: from_unicode() called with NULL\n", PROJECT);
bx_fprintf(stderr, "%s: internal error: bxs_from_unicode() called with NULL\n", PROJECT);
return NULL;
}
bxstr_t *result = (bxstr_t *) calloc(1, sizeof(bxstr_t));
result->memory = u32_strdup(pInput);
result->num_chars = u32_strlen(pInput);
size_t ascii_len = ((size_t) u32_strwidth(pInput, encoding)) + 1; /* often generous, but always enough */
size_t ascii_len = ((size_t) u32_strwidth(pInput, encoding)) + 1;
result->ascii = (char *) calloc(ascii_len, sizeof(char));
size_t map_size = 5;
result->first_char = (size_t *) calloc(map_size, sizeof(size_t));
@ -96,6 +100,7 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput)
int indent_active = 1;
size_t blank_streak = 0;
int first_candidate = -1;
int non_blank_encountered = 0;
size_t idx = 0;
for (ucs4_t c = pInput[0]; c != char_nul; c = rest[0]) {
@ -105,8 +110,8 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput)
result->visible_char = (size_t *) realloc(result->visible_char, map_size * sizeof(size_t));
}
if (c == char_tab) {
bx_fprintf(stderr, "%s: internal error: tab encountered in from_unicode()\n", PROJECT);
if (!is_allowed_anywhere(c)) { /* CHECK currently used for config only, reconsider when using on input data */
bx_fprintf(stderr, "%s: illegal character '%lc' (%#010x) encountered in string\n", PROJECT, c, (int) c);
bxs_free(result);
return NULL;
}
@ -124,6 +129,10 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput)
*ascii_ptr = c & 0xff;
++ascii_ptr;
}
else if (c == char_tab) {
*ascii_ptr = ' ';
++ascii_ptr;
}
else {
cols = BMAX(0, uc_width(c, encoding));
if (cols > 0) {
@ -131,7 +140,7 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput)
ascii_ptr += cols;
}
}
if (uc_is_blank(c)) {
if (is_blank(c)) {
if (indent_active) {
result->indent += cols;
}
@ -143,9 +152,10 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput)
first_candidate = -1;
}
if (!uc_is_blank(c) && c != char_esc) {
if (!is_blank(c) && c != char_esc) {
indent_active = 0;
blank_streak = 0;
non_blank_encountered = 1;
}
rest = advance_next32(rest, &step_invis);
@ -163,12 +173,261 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput)
*ascii_ptr = '\0';
result->visible_char[result->num_chars_visible] = idx; // both point to the terminator
result->first_char[result->num_chars_visible] = idx;
result->trailing = blank_streak;
result->trailing = non_blank_encountered ? blank_streak : 0;
return result;
}
/**
 * Create an exact, deep copy of a string.
 * @param pString the string to copy, may be NULL
 * @return the copied string, for which new memory was allocated; NULL if `pString` was NULL or if any of the
 *      required allocations failed (in which case nothing is leaked)
 */
bxstr_t *bxs_strdup(bxstr_t *pString)
{
    if (pString == NULL) {
        return NULL;
    }

    bxstr_t *result = (bxstr_t *) calloc(1, sizeof(bxstr_t));
    if (result == NULL) {
        return NULL;
    }

    /* Both index maps hold one entry per visible character, plus one entry for the NUL terminator. */
    size_t map_bytes = (pString->num_chars_visible + 1) * sizeof(size_t);

    result->memory = u32_strdup(pString->memory);
    result->ascii = strdup(pString->ascii);
    result->first_char = malloc(map_bytes);
    result->visible_char = malloc(map_bytes);
    if (result->memory == NULL || result->ascii == NULL
            || result->first_char == NULL || result->visible_char == NULL)
    {
        /* free(NULL) is a no-op, so every member can be released unconditionally */
        free(result->memory);
        free(result->ascii);
        free(result->first_char);
        free(result->visible_char);
        free(result);
        return NULL;
    }

    memcpy(result->first_char, pString->first_char, map_bytes);
    memcpy(result->visible_char, pString->visible_char, map_bytes);

    result->indent = pString->indent;
    result->num_columns = pString->num_columns;
    result->num_chars = pString->num_chars;
    result->num_chars_visible = pString->num_chars_visible;
    result->num_chars_invisible = pString->num_chars_invisible;
    result->trailing = pString->trailing;
    return result;
}
/**
 * Duplicate the substring `[start_idx, end_idx)` (indexes of visible characters) after stripping blanks from both of
 * its ends. The source string is temporarily NUL-terminated at the cut position while the copy is made, and restored
 * before returning.
 * @param pString the source string, may be NULL
 * @param start_idx index of the first visible character of the substring (may point at the terminator)
 * @param end_idx index of the first visible character following the substring
 * @return the new string as built by `bxs_from_unicode()`, or NULL if `pString` was NULL or the indexes were invalid
 */
bxstr_t *bxs_trimdup(bxstr_t *pString, size_t start_idx, size_t end_idx)
{
    if (pString == NULL) {
        return NULL;
    }

    const size_t num_visible = pString->num_chars_visible;
    if (start_idx > num_visible) {
        /* a start_idx on the terminating NUL is a valid input */
        bx_fprintf(stderr, "%s: internal error: start_idx out of bounds in bxs_trimdup()\n", PROJECT);
        return NULL;
    }
    if (end_idx > num_visible) {
        bx_fprintf(stderr, "%s: internal error: end_idx out of bounds in bxs_trimdup()\n", PROJECT);
        return NULL;
    }
    if (end_idx < start_idx) {
        bx_fprintf(stderr, "%s: internal error: end_idx before start_idx in bxs_trimdup()\n", PROJECT);
        return NULL;
    }

    /* shrink the window from the left, then from the right, as long as its edges are blank */
    size_t from = start_idx;
    size_t to = end_idx;
    for (; from < to; from++) {
        if (!uc_is_blank(pString->memory[pString->visible_char[from]])) {
            break;
        }
    }
    for (; from < to; to--) {
        if (!uc_is_blank(pString->memory[pString->visible_char[to - 1]])) {
            break;
        }
    }

    /* cut the source at `to` unless the window already reaches the terminator */
    const int cut_needed = to < num_visible;
    size_t cut_pos = 0;
    ucs4_t saved = char_nul;
    if (cut_needed) {
        cut_pos = pString->first_char[to];
        saved = pString->memory[cut_pos];
        set_char_at(pString->memory, cut_pos, char_nul);
    }

    bxstr_t *result = bxs_from_unicode(pString->memory + pString->first_char[from]);

    if (cut_needed) {
        /* undo the temporary termination so that the source string is unchanged for the caller */
        set_char_at(pString->memory, cut_pos, saved);
    }
    return result;
}
/**
 * Combine `pString + pToAppend` into a new string. Neither input is modified or freed.
 * @param pString the string to append to; if NULL or empty, the result is built from `pToAppend` alone
 * @param pToAppend the NUL-terminated characters to append; if NULL or empty, the result is a copy of `pString`
 * @return the concatenated string, for which new memory was allocated, or NULL if the temporary buffer could not be
 *      allocated or the underlying constructor returned NULL
 */
bxstr_t *bxs_strcat(bxstr_t *pString, uint32_t *pToAppend)
{
    if (pToAppend == NULL) {
        return bxs_strdup(pString);
    }
    size_t appended_num_chars = u32_strlen(pToAppend);
    if (appended_num_chars == 0) {
        return bxs_strdup(pString);
    }
    if (pString == NULL || pString->num_chars == 0) {
        return bxs_from_unicode(pToAppend);
    }

    size_t combined_num_chars = pString->num_chars + appended_num_chars;
    uint32_t *s = (uint32_t *) malloc((combined_num_chars + 1) * sizeof(uint32_t));
    if (s == NULL) {
        /* out of memory: report failure instead of copying into a NULL buffer */
        return NULL;
    }
    memcpy(s, pString->memory, pString->num_chars * sizeof(uint32_t));
    memcpy(s + pString->num_chars, pToAppend, appended_num_chars * sizeof(uint32_t));
    set_char_at(s, combined_num_chars, char_nul);

    /* bxs_from_unicode() recomputes all derived fields for the joined text */
    bxstr_t *result = bxs_from_unicode(s);
    BFREE(s);
    return result;
}
/**
 * Find the next visible occurrence of `c` in `pString`. Only the characters referenced by the `visible_char` map are
 * inspected.
 * @param pString the string to search, may be NULL
 * @param c the character to find
 * @param cursor optional; when given, the search starts at `*cursor + 1` and `*cursor` is updated to the index of the
 *      match (it is left untouched when nothing is found)
 * @return pointer into the memory of `pString` at the match, or NULL if there is none
 */
uint32_t *bxs_strchr(bxstr_t *pString, ucs4_t c, int *cursor)
{
    if (pString == NULL || pString->num_chars_visible == 0) {
        return NULL;
    }

    size_t pos = 0;
    if (cursor != NULL) {
        pos = *cursor + 1;
    }

    while (pos < pString->num_chars_visible) {
        uint32_t *candidate = pString->memory + pString->visible_char[pos];
        if (*candidate == c) {
            if (cursor != NULL) {
                *cursor = (int) pos;
            }
            return candidate;
        }
        pos++;
    }
    return NULL;
}
/**
 * Create a new string from which the leading and trailing blanks recorded in `indent` and `trailing` were removed.
 * @param pString the string to trim, which will not be modified; may be NULL
 * @return a new, trimmed string, or NULL if `pString` was NULL
 */
bxstr_t *bxs_trim(bxstr_t *pString)
{
    if (pString == NULL) {
        return NULL;
    }

    const size_t lead = pString->indent;
    const size_t trail = pString->trailing;
    if (lead == 0 && trail == 0) {
        return bxs_strdup(pString);     /* nothing to trim */
    }
    if (lead + trail == pString->num_chars_visible) {
        return bxs_from_ascii("");      /* nothing but blanks */
    }

    /*
     * Work on a private copy of the text: terminate it where the trailing blanks begin, then start reading after
     * the indentation.
     * NOTE(review): `indent` is used as a visible-character index here - confirm that it can never exceed the
     * number of visible characters (e.g. for blanks which occupy more than one column).
     */
    uint32_t *copy = u32_strdup(pString->memory);
    const size_t cut_pos = pString->first_char[pString->num_chars_visible - trail];
    set_char_at(copy, cut_pos, char_nul);

    bxstr_t *result = bxs_from_unicode(copy + pString->first_char[lead]);
    BFREE(copy);
    return result;
}
/**
 * Create a new string from which the trailing blanks recorded in `trailing` were removed.
 * @param pString the string to trim, which will not be modified; may be NULL
 * @return a new, right-trimmed string, or NULL if `pString` was NULL
 */
bxstr_t *bxs_rtrim(bxstr_t *pString)
{
    if (pString == NULL) {
        return NULL;
    }

    const size_t trail = pString->trailing;
    if (trail == 0) {
        return bxs_strdup(pString);     /* nothing to trim */
    }

    /* terminate a private copy of the text at the first trailing blank and rebuild from that */
    uint32_t *copy = u32_strdup(pString->memory);
    const size_t cut_pos = pString->first_char[pString->num_chars_visible - trail];
    set_char_at(copy, cut_pos, char_nul);

    bxstr_t *result = bxs_from_unicode(copy);
    BFREE(copy);
    return result;
}
/**
 * Convert the string via `u32_strconv_to_output()` for printing.
 * @param pString the string to convert; for NULL, a newly allocated copy of the text "NULL" is returned
 * @return the converted string
 */
char *bxs_to_output(bxstr_t *pString)
{
    return pString != NULL ? u32_strconv_to_output(pString->memory) : strdup("NULL");
}
/**
 * Determine if the given string is empty, i.e. contains no characters at all (visible or invisible).
 * @param pString the string to check (may be NULL, which counts as empty)
 * @return 1 for empty, 0 for not empty
 */
int bxs_is_empty(bxstr_t *pString)
{
    return pString == NULL || pString->num_chars == 0;
}
/**
 * Compare two strings by their character data using `u32_strcmp()`. A NULL string sorts after any non-NULL string,
 * and two NULL strings compare equal.
 * @param s1 the first string (may be NULL)
 * @param s2 the second string (may be NULL)
 * @return 0 if both are NULL, 1 if only `s1` is NULL, -1 if only `s2` is NULL, else the result of `u32_strcmp()`
 */
int bxs_strcmp(bxstr_t *s1, bxstr_t *s2)
{
    if (s1 != NULL && s2 != NULL) {
        return u32_strcmp(s1->memory, s2->memory);
    }
    if (s1 == NULL && s2 == NULL) {
        return 0;
    }
    /* exactly one of them is NULL, and NULL counts as the greater value */
    return s1 == NULL ? 1 : -1;
}
/**
 * Check every character of the string (up to its NUL terminator) against the given predicate.
 * @param pString the string to check (may be NULL, which counts as invalid at position 0)
 * @param error_pos if invalid, receives the index of the first rejected character; may be NULL; left untouched when
 *      the string is valid
 * @param predicate function which returns 0 for characters that are not allowed
 * @return 0 if invalid, 1 if valid
 */
static int bxs_valid_in_context(bxstr_t *pString, size_t *error_pos, int (*predicate)(const ucs4_t))
{
    size_t bad_idx = 0;
    int valid = 0;

    if (pString != NULL) {
        valid = 1;
        for (const uint32_t *p = pString->memory; *p != char_nul; p++) {
            if (!predicate(*p)) {
                bad_idx = (size_t) (p - pString->memory);
                valid = 0;
                break;
            }
        }
    }

    if (!valid && error_pos != NULL) {
        *error_pos = bad_idx;
    }
    return valid;
}
/**
 * Check that every character of the string passes `is_allowed_anywhere()`.
 * @param pString the string to check (may be NULL, which counts as invalid)
 * @param error_pos if invalid, receives the position of the first invalid character; may be NULL
 * @return == 0: invalid; otherwise: valid
 */
int bxs_valid_anywhere(bxstr_t *pString, size_t *error_pos)
{
    int (*const allowed)(const ucs4_t) = &is_allowed_anywhere;
    return bxs_valid_in_context(pString, error_pos, allowed);
}
/**
 * Check that the string is usable as a shape line: it must have at least one visible character, and every character
 * must pass `is_allowed_in_shape()`.
 * @param pString the string to check (may be NULL, which counts as invalid at position 0)
 * @param error_pos if a character is rejected (or `pString` is NULL), receives the offending position; may be NULL.
 *      Not written when the string is rejected only for having no visible characters.
 * @return == 0: invalid; otherwise: valid
 */
int bxs_valid_in_shape(bxstr_t *pString, size_t *error_pos)
{
    if (pString == NULL) {
        /* documented as "counts as invalid" - must not be dereferenced below */
        if (error_pos != NULL) {
            *error_pos = 0;
        }
        return 0;
    }
    return pString->num_chars_visible > 0 && bxs_valid_in_context(pString, error_pos, &is_allowed_in_shape);
}
/**
 * Check that the string is usable as a sample block: it must have at least one visible character, and every
 * character must pass `is_allowed_in_sample()`.
 * @param pString the string to check (may be NULL, which counts as invalid at position 0)
 * @param error_pos if a character is rejected (or `pString` is NULL), receives the offending position; may be NULL.
 *      Not written when the string is rejected only for having no visible characters.
 * @return == 0: invalid; otherwise: valid
 */
int bxs_valid_in_sample(bxstr_t *pString, size_t *error_pos)
{
    if (pString == NULL) {
        /* documented as "counts as invalid" - must not be dereferenced below */
        if (error_pos != NULL) {
            *error_pos = 0;
        }
        return 0;
    }
    return pString->num_chars_visible > 0 && bxs_valid_in_context(pString, error_pos, &is_allowed_in_sample);
}
/**
 * Check that the string is usable as a filename: it must have at least one visible character, no invisible
 * characters, and every character must pass `is_allowed_in_filename()`.
 * @param pString the string to check (may be NULL, which counts as invalid at position 0)
 * @param error_pos if a character is rejected (or `pString` is NULL), receives the offending position; may be NULL.
 *      Not written when the string is rejected only because of its visible/invisible character counts.
 * @return == 0: invalid; otherwise: valid
 */
int bxs_valid_in_filename(bxstr_t *pString, size_t *error_pos)
{
    if (pString == NULL) {
        /* documented as "counts as invalid" - must not be dereferenced below */
        if (error_pos != NULL) {
            *error_pos = 0;
        }
        return 0;
    }
    return pString->num_chars_visible > 0 && pString->num_chars_invisible == 0
        && bxs_valid_in_context(pString, error_pos, &is_allowed_in_filename);
}
/**
 * Check that every character of the string passes `is_allowed_in_kv_string()`. Unlike the shape, sample and filename
 * checks, no minimum number of visible characters is required here.
 * @param pString the string to check (may be NULL, which counts as invalid)
 * @param error_pos if invalid, receives the position of the first invalid character; may be NULL
 * @return == 0: invalid; otherwise: valid
 */
int bxs_valid_in_kv_string(bxstr_t *pString, size_t *error_pos)
{
    int (*const allowed)(const ucs4_t) = &is_allowed_in_kv_string;
    return bxs_valid_in_context(pString, error_pos, allowed);
}
void bxs_free(bxstr_t *pString)
{
if (pString != NULL) {

View File

@ -23,22 +23,48 @@
#include <unitypes.h>
/**
* A boxes-internal string. Should be treated as immutable.
*/
typedef struct {
uint32_t *memory; /* Immutable. Pointer to the original memory area for the string, NUL terminated */
char *ascii; /* Immutable. ASCII version of the original string, tabs expanded, invisible characters removed, multi-byte chars replaced with one or more 'x'. NUL terminated. CHECK remove this eventually */
size_t indent; /* Immutable. Number of spaces at the beginning of the original string */
size_t num_columns; /* Immutable. Total number of screen columns required to display the string (important for double-wide characters such as Chinese) */
size_t num_chars; /* Immutable. Number of characters in the original string (visible + invisible) */
size_t num_chars_visible; /* Immutable. Number of visible characters in the original string */
size_t num_chars_invisible; /* Immutable. Number of invisible characters in the original string */
size_t trailing; /* Immutable. Number of trailing spaces in the original string */
size_t *first_char; /* Immutable. Array of index values into `memory` of the first actual character (possibly invisible) of each visible character */
size_t *visible_char; /* Immutable. Array of index values into `memory` of the visible characters themselves */
int offset_start; /* Number of visible characters to cut from the beginning of the string. Must be smaller than or equal to `indent`. Can be negative, in which case spaces will be prepended. */
int offset_end; /* Number of visible characters to cut from the end of the string. Must be smaller than or equal to `trailing`. Can be negative, in which case spaces will be appended. */
/** Pointer to the original memory area for the string, NUL terminated */
uint32_t *memory;
/** ASCII version of the original string, tabs expanded, invisible characters removed, multi-byte chars replaced
* with one or more 'x'. NUL terminated. */
char *ascii;
/** Number of spaces at the beginning of the original string */
size_t indent;
/** Total number of screen columns required to display the string
* (important for double-wide characters such as Chinese) */
size_t num_columns;
/** Number of characters in the original string (visible + invisible) */
size_t num_chars;
/** Number of visible characters in the original string */
size_t num_chars_visible;
/** Number of invisible characters in the original string */
size_t num_chars_invisible;
/** Number of trailing spaces in the original string */
size_t trailing;
/** Array of index values into `memory` of the first actual character (possibly invisible) of each visible
* character. Length of this array is `num_chars_visible` + 1, as the last value is the index of the NUL
* terminator. */
size_t *first_char;
/** Array of index values into `memory` of the visible characters themselves.
* Length of this array is `num_chars_visible` + 1, as the last value is the index of the NUL terminator. */
size_t *visible_char;
} bxstr_t;
/**
* Construct a `bxstr_t` from an ASCII string.
* @param pAscii the ASCII string, which does not contain tabs or ANSI escapes
@ -48,7 +74,8 @@ bxstr_t *bxs_from_ascii(char *pAscii);
/**
* Construct a `bxstr_t` from a Unicode string.
* Construct a `bxstr_t` from a Unicode string. When the string consists only of blanks, they count as indentation,
* not as trailing blanks.
* @param pInput the UTF-32 encoded string, which may include ANSI escapes and tabs
* @return a pointer to a new `bxstr_t` for which memory has been allocated,
* or NULL if `pInput` was NULL or an error occurred (such as an illegal character encountered)
@ -56,6 +83,143 @@ bxstr_t *bxs_from_ascii(char *pAscii);
bxstr_t *bxs_from_unicode(uint32_t *pInput);
/**
* Create an exact copy of a string.
* @param pString the string to copy
* @return the copied string, for which new memory was allocated
*/
bxstr_t *bxs_strdup(bxstr_t *pString);
/**
* Take a substring from the given string, trim leading and trailing space from it, and duplicate the result in a new
* string. If invisible characters are included in the string, they are also duplicated.
* @param pString the source string
* @param start_idx the index of the first visible character of the substring
* @param end_idx the index of the first visible character following the substring
* @return the trimmed and duplicated substring, for which new memory was allocated
*/
bxstr_t *bxs_trimdup(bxstr_t *pString, size_t start_idx, size_t end_idx);
/**
* Combine `pString + pToAppend` into a new string. Memory for the input strings is NOT modified or freed.
* @param pString the string to append to
* @param pToAppend the string to append
* @return the concatenated string, for which new memory was allocated
*/
bxstr_t *bxs_strcat(bxstr_t *pString, uint32_t *pToAppend);
/**
* Return a pointer to the first visible occurrence of the character `c` in the string `pString`.
* Invisible characters are ignored.
* @param pString the string to search
* @param c the character to find
* @param cursor If specified, indicates the start position for the search (search will start on the next character,
* so the caller does not need to manage this value), and will be updated with the next position found (if found).
* On the first call, this value should point to -1. May be NULL, in which case the cursor feature is not used.
* @return a pointer into the memory of `pString`, or NULL if the character was not found
*/
uint32_t *bxs_strchr(bxstr_t *pString, ucs4_t c, int *cursor);
/**
* Create a new string from which all leading and trailing whitespace have been removed.
* @param pString the string to trim, which will not be modified
* @return a new, trimmed string
*/
bxstr_t *bxs_trim(bxstr_t *pString);
/**
* Create a new string from which all trailing whitespace have been removed.
* @param pString the string to trim, which will not be modified
* @return a new, trimmed string
*/
bxstr_t *bxs_rtrim(bxstr_t *pString);
/**
* Convert the string into boxes' output encoding for proper printing on stdout.
* @param pString the string to convert
* @return the same string in the target (output) encoding
*/
char *bxs_to_output(bxstr_t *pString);
/**
* Determine if the given string is empty.
* @param pString the string to check (may be NULL, which counts as empty)
* @return 1 for empty, 0 for not empty
*/
int bxs_is_empty(bxstr_t *pString);
/**
* Determine whether the given `pString` is a valid string under at least one condition. This will return `false` for
* strings which should really never occur anywhere.
* @param pString the string to check (may be NULL, which counts as invalid)
* @param error_pos if invalid, this address is used to store the first invalid character position in the string;
* may be NULL, in which case the information is not provided
* @return == 0: invalid; otherwise: valid
*/
int bxs_valid_anywhere(bxstr_t *pString, size_t *error_pos);
/**
* Determine whether the given `pString` is a valid string representing a line of a shape in the config file.
* @param pString the string to check (may be NULL, which counts as invalid)
* @param error_pos if invalid, this address is used to store the first invalid character position in the string;
* may be NULL, in which case the information is not provided
* @return == 0: invalid; otherwise: valid
*/
int bxs_valid_in_shape(bxstr_t *pString, size_t *error_pos);
/**
* Determine whether the given `pString` is a valid string representing a complete SAMPLE block in the config file.
* @param pString the string to check (may be NULL, which counts as invalid)
* @param error_pos if invalid, this address is used to store the first invalid character position in the string;
* may be NULL, in which case the information is not provided
* @return == 0: invalid; otherwise: valid
*/
int bxs_valid_in_sample(bxstr_t *pString, size_t *error_pos);
/**
* Determine whether the given `pString` is a valid string representing a parent filename in the config file.
* @param pString the string to check (may be NULL, which counts as invalid)
* @param error_pos if invalid, this address is used to store the first invalid character position in the string;
* may be NULL, in which case the information is not provided
* @return == 0: invalid; otherwise: valid
*/
int bxs_valid_in_filename(bxstr_t *pString, size_t *error_pos);
/**
* Determine whether the given `pString` is a valid string representing a value from a key/value pair in the config
* file.
* @param pString the string to check (may be NULL, which counts as invalid)
* @param error_pos if invalid, this address is used to store the first invalid character position in the string;
* may be NULL, in which case the information is not provided
* @return == 0: invalid; otherwise: valid
*/
int bxs_valid_in_kv_string(bxstr_t *pString, size_t *error_pos);
/**
* Compares `s1` and `s2` lexicographically. Returns a negative value if `s1` compares smaller than `s2`, a positive
* value if `s1` compares larger than `s2`, or 0 if they compare equal. NULL values are considered "greater than"
* non-NULL values.
* This function is similar to `strcmp()`, except that it operates on `bxstr_t *` strings and supports NULL values.
* @param s1 the first string (may be NULL)
* @param s2 the second string (may be NULL)
* @return comparison result as described above
*/
int bxs_strcmp(bxstr_t *s1, bxstr_t *s2);
/**
* Free the memory allocated by the given `bxstr_t`.
* @param pString the `bxstr_t` to free
@ -63,8 +227,6 @@ bxstr_t *bxs_from_unicode(uint32_t *pInput);
void bxs_free(bxstr_t *pString);
/* TODO */
#endif
/*EOF*/ /* vim: set cindent sw=4: */
/* vim: set cindent sw=4: */

View File

@ -33,8 +33,6 @@
const char *config_encoding = "ISO_8859-15";
/* effective character encoding of input and output text */
const char *encoding;
@ -58,14 +56,6 @@ const ucs4_t char_nul = 0x00000000;
/**
* Check whether the character at the given index has the given value.
*
* @param <text> the string to check
* @param <idx> the index position of the character to check
* @param <expected_char> the expected character value
* @return flag indicating whether the character has the expected value
*/
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char)
{
return text != NULL && u32_cmp(text + idx, &expected_char, 1) == 0;
@ -73,13 +63,6 @@ int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_cha
/**
* Set the character at the given index to the given value.
*
* @param <text> the string to modify
* @param <idx> the index position of the character to modify
* @param <char_to_set> the new character value
*/
void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set)
{
u32_set(text + idx, char_to_set, 1);
@ -87,13 +70,6 @@ void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set)
/**
* Determine if a string is NULL/empty or not.
*
* @param <text> the string to check
* @return > 0: the string is empty or NULL
* 0: the string contains at least 1 character
*/
int is_empty(const uint32_t *text)
{
return text == NULL || is_char_at(text, 0, char_nul);
@ -108,6 +84,49 @@ int is_ascii_printable(const ucs4_t c)
/**
 * Determine if the character is acceptable in at least one context: anything that is not a control character, plus
 * the four control characters TAB, CR, LF, and ESC.
 * @param c the character to check
 * @return 0 for `false` or 1 for `true`
 */
int is_allowed_anywhere(const ucs4_t c)
{
    /* the whitelisted control characters */
    if (c == char_tab || c == char_cr || c == char_newline || c == char_esc) {
        return 1;
    }
    return !uc_is_cntrl(c);
}
/**
 * Determine if the character may occur in a shape line: everything accepted by `is_allowed_anywhere()` except CR
 * and LF.
 * @param c the character to check
 * @return 0 for `false` or 1 for `true`
 */
int is_allowed_in_shape(const ucs4_t c)
{
    if (c == char_cr || c == char_newline) {
        return 0;
    }
    return is_allowed_anywhere(c) != 0;
}
/**
 * Determine if the character may occur in a sample block. This applies no restriction beyond
 * `is_allowed_anywhere()`.
 * @param c the character to check
 * @return the result of `is_allowed_anywhere()` for `c`
 */
int is_allowed_in_sample(const ucs4_t c)
{
    const int verdict = is_allowed_anywhere(c);
    return verdict;
}
/**
 * Determine if the character may occur in a filename: everything accepted by `is_allowed_anywhere()` except CR, LF,
 * and ESC.
 * @param c the character to check
 * @return 0 for `false` or 1 for `true`
 */
int is_allowed_in_filename(const ucs4_t c)
{
    if (c == char_cr || c == char_newline || c == char_esc) {
        return 0;
    }
    return is_allowed_anywhere(c) != 0;
}
/**
 * Determine if the character may occur in the value of a key/value pair: everything accepted by
 * `is_allowed_anywhere()` except CR, LF, and ESC.
 * @param c the character to check
 * @return 0 for `false` or 1 for `true`
 */
int is_allowed_in_kv_string(const ucs4_t c)
{
    if (c == char_cr || c == char_newline || c == char_esc) {
        return 0;
    }
    return is_allowed_anywhere(c) != 0;
}
/**
 * Determine if the given character is a tab or is classified as blank by `uc_is_blank()`.
 * @param c a Unicode character
 * @return 1 if it's a blank, 0 otherwise
 */
int is_blank(const ucs4_t c)
{
    if (c == char_tab) {
        return 1;
    }
    return uc_is_blank(c) ? 1 : 0;
}
uint32_t *new_empty_string32()
{
return (uint32_t *) calloc(1, sizeof(uint32_t));
@ -115,6 +134,18 @@ uint32_t *new_empty_string32()
ucs4_t to_utf32(char ascii)
{
ucs4_t c = char_nul;
if (ascii >= 0x20 && ascii < 0x7f) {
char *bytes = (char *) (&c);
bytes[0] = ascii;
}
return c;
}
uint32_t *advance_next32(const uint32_t *s, size_t *invis)
{
if (is_empty(s)) {
@ -266,4 +297,22 @@ const char *check_encoding(const char *manual_encoding, const char *system_encod
}
/**
 * Convert the given UTF-32 string into UTF-8. The conversion is requested with `iconveh_error`; on failure, an
 * error message is printed on stderr.
 * @param src the UTF-32 string, may be NULL
 * @return the UTF-8 byte sequence (a fresh empty string for empty input), or NULL if `src` was NULL or the
 *      conversion failed
 */
char *to_utf8(uint32_t *src)
{
    char *utf8 = NULL;

    if (src != NULL) {
        if (is_empty(src)) {
            utf8 = (char *) strdup("");
        }
        else {
            utf8 = u32_strconv_to_encoding(src, "UTF-8", iconveh_error);
            if (utf8 == NULL) {
                bx_fprintf(stderr, "%s: failed to convert a string to UTF-8: %s\n", PROJECT, strerror(errno));
            }
        }
    }
    return utf8;
}
/*EOF*/ /* vim: set sw=4: */

View File

@ -24,9 +24,8 @@
/** The boxes config file is still encoded with a single-byte character set. Officially, it is ASCII!
* However, people might not conform to this, so we use ISO_8859-15 as a reasonable superset. */
extern const char *config_encoding;
/** Character encoding of the boxes configuration file */
#define CONFIG_FILE_ENCODING "UTF-8"
/* effective character encoding of input and output text */
extern const char *encoding;
@ -49,92 +48,176 @@ extern const ucs4_t char_esc;
/** ucs4_t character '\0' (zero) */
extern const ucs4_t char_nul;
/**
* Check whether the character at the given index has the given value.
*
* @param text the string to check
* @param idx the index position of the character to check
* @param expected_char the expected character value
* @return flag indicating whether the character has the expected value
*/
int is_char_at(const uint32_t *text, const size_t idx, const ucs4_t expected_char);
/**
* Set the character at the given index to the given value.
*
* @param text the string to modify
* @param idx the index position of the character to modify
* @param char_to_set the new character value
*/
void set_char_at(uint32_t *text, const size_t idx, const ucs4_t char_to_set);
/**
* Determine if a string is NULL/empty or not.
*
* @param text the string to check
* @return > 0: the string is empty or NULL;
* == 0: the string contains at least 1 character
*/
int is_empty(const uint32_t *text);
int is_ascii_printable(const ucs4_t c);
/** Return a freshly allocated empty UTF-32 string. */
uint32_t *new_empty_string32();
/**
* Return the next position in <s> in accordance with escape sequences. The result can be the next normal character,
* Determine if the character can occur in a boxes string under at least one condition. This will return false for
* characters which can really never occur anywhere.
* @param c the character to check
* @return 0 for `false` or non-zero for `true`
*/
int is_allowed_anywhere(const ucs4_t c);
int is_allowed_in_shape(const ucs4_t c);
int is_allowed_in_sample(const ucs4_t c);
int is_allowed_in_filename(const ucs4_t c);
int is_allowed_in_kv_string(const ucs4_t c);
/**
* Determine if the given character represents a blank or something else that is rendered thusly (like a tab). This
* would be variations of space, including Unicode (for example em-space), plus tab. CR and LF are *not* blanks.
* @param c a Unicode character
* @return 1 if it's a blank, 0 otherwise
*/
int is_blank(const ucs4_t c);
/**
* Return a freshly allocated empty UTF-32 string.
* @return a new empty string
*/
uint32_t *new_empty_string32();
/**
* Convert an ASCII character into a Unicode character.
* @param ascii a printable ASCII character in the range 0x20 - 0x7E
* @return the corresponding Unicode character, or NUL if `ascii` out of range
*/
ucs4_t to_utf32(char ascii);
/**
* Return the next position in `s` in accordance with escape sequences. The result can be the next normal character,
* or again an escape sequence, if it directly follows the first.
*
* @param <s> The pointer to the start position. Is assumed to point either at the ESC at the start of an escape
* @param s The pointer to the start position. Is assumed to point either at the ESC at the start of an escape
* sequence, or to be positioned outside an escape sequence.
* @param <invis> Will contain the number of invisible characters skipped in order to get to the new position.
* This will be 0 unless <s> pointed to an ESC char, in which case it contains the length in characters of that
* @param invis Will contain the number of invisible characters skipped in order to get to the new position.
* This will be 0 unless `s` pointed to an ESC char, in which case it contains the length in characters of that
* escape sequence.
* @return The next position, or 0 if the end of the string was reached
*/
uint32_t *advance_next32(const uint32_t *s, size_t *invis);
/**
* Determine a new position in the given string s with the given offset of visible characters.
* If the character right in front of the target character is invisible, then the pointer is moved to the start of
* that invisible sequence. The purpose is to catch any escape sequences which would for example color the character.
*
* @param <s> The pointer to the start position. Is assumed to point either at the ESC at the start of an escape
* @param s The pointer to the start position. Is assumed to point either at the ESC at the start of an escape
* sequence, or to be positioned outside an escape sequence.
* @param <offset> the number of visible character positions to advance the pointer
* @param offset the number of visible character positions to advance the pointer
* @return a pointer to the new position in s, or 0 if the end of the string was reached
*/
uint32_t *advance32(uint32_t *s, const size_t offset);
/**
* Convert a string from the input/output encoding (`encoding` in this .h file) to UTF-32 internal representation.
* Memory will be allocated for the converted string.
*
* @param <src> string to convert, zero-terminated
* @param src string to convert, zero-terminated
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
*/
uint32_t *u32_strconv_from_input(const char *src);
/**
* Convert a string from the given source encoding to UTF-32 internal representation.
* Memory will be allocated for the converted string.
*
* @param <src> string to convert, zero-terminated
* @param <sourceEncoding> the character encoding of <src>
* @param src string to convert, zero-terminated
* @param sourceEncoding the character encoding of `src`
* @return UTF-32 string, or NULL in case of error (then an error message was already printed on stderr)
*/
uint32_t *u32_strconv_from_arg(const char *src, const char *sourceEncoding);
/**
* Convert a string from UTF-32 internal representation to input/output encoding (`encoding` in this .h file).
* Memory will be allocated for the converted string.
*
* @param <src> UTF-32 string to convert, zero-terminated
* @param src UTF-32 string to convert, zero-terminated
* @return string in input/output encoding, or NULL on error (then an error message was already printed on stderr)
*/
char *u32_strconv_to_output(const uint32_t *src);
/**
* Convert a string from UTF-32 internal representation to the given target encoding.
* Memory will be allocated for the converted string.
*
* @param <src> UTF-32 string to convert, zero-terminated
* @param <targetEncoding> the character encoding of the result
* @param src UTF-32 string to convert, zero-terminated
* @param targetEncoding the character encoding of the result
* @return string in target encoding, or NULL in case of error (then an error message was already printed on stderr)
*/
char *u32_strconv_to_arg(const uint32_t *src, const char *targetEncoding);
/**
* Check if the given <manual_encoding> can be used to covert anything. This should reveal invalid encoding names that
* have been specified on the command line. If no <manual_encoding> was specified, or if an invalid encoding is
* Check if the given `manual_encoding` can be used to convert anything. This should reveal invalid encoding names that
* have been specified on the command line. If no `manual_encoding` was specified, or if an invalid encoding is
* detected, we fall back to the system encoding. No new memory is allocated.
*
* @param <manual_encoding> the encoding set on the command line, may be NULL
* @param <system_encoding> the system encoding
* @return <manual_encoding> if it is set to a valid value, <system_encoding> otherwise
* @param manual_encoding the encoding set on the command line, may be NULL
* @param system_encoding the system encoding
* @return `manual_encoding` if it is set to a valid value, `system_encoding` otherwise
*/
const char *check_encoding(const char *manual_encoding, const char *system_encoding);
/**
* Convert the given UTF-32 string into UTF-8 as `char *` byte sequence. The conversion must be fully successful, or
* an error will be returned (no question marks will be generated in the output).
* @param src the UTF-32 string
* @return a byte sequence in UTF-8 encoding
*/
char *to_utf8(uint32_t *src);
#endif
/*EOF*/ /* vim: set cindent sw=4: */

View File

@ -19,8 +19,9 @@ SRC_DIR = ../src
UTEST_DIR = ../utest
VPATH = $(SRC_DIR):$(SRC_DIR)/misc:$(UTEST_DIR)
UTEST_NORM = global_mock.c bxstring_test.o cmdline_test.c tools_test.c regulex_test.o main.o utest_tools.o
MOCKS = bx_fprintf
UTEST_NORM = global_mock.c bxstring_test.o cmdline_test.c tools_test.c regulex_test.o main.o unicode_test.o \
utest_tools.o
MOCKS = bx_fprintf
.PHONY: check_dir flags_unix flags_win32 flags_ utest
@ -36,7 +37,7 @@ $(OUT_DIR):
mkdir $(OUT_DIR)
flags_unix:
$(eval CFLAGS := -I. -I$(SRC_DIR) -O -Wall -W $(CFLAGS_ADDTL))
$(eval CFLAGS := -I. -I$(SRC_DIR) -O -Wall -W -Wno-stringop-overflow $(CFLAGS_ADDTL))
$(eval LDFLAGS := $(LDFLAGS) $(foreach MOCK,$(MOCKS),-Wl,--wrap=$(MOCK)) --coverage $(LDFLAGS_ADDTL))
$(eval UTEST_EXECUTABLE_NAME := unittest)
$(eval UTEST_OBJ := $(UTEST_NORM:.c=.o))
@ -72,4 +73,5 @@ cmdline_test.o: cmdline_test.c cmdline_test.h global_mock.h boxes.h cmdline.h c
tools_test.o: tools_test.c tools_test.h tools.h config.h | check_dir
regulex_test.o: regulex_test.c regulex_test.h global_mock.h regulex.h config.h | check_dir
main.o: main.c global_mock.h tools_test.h regulex_test.h config.h | check_dir
unicode_test.o: unicode_test.c unicode_test.h global_mock.h config.h | check_dir
utest_tools.o: utest_tools.c utest_tools.h config.h | check_dir

View File

@ -24,7 +24,9 @@
#include <stddef.h>
#include <cmocka.h>
#include <string.h>
#include "boxes.h"
#include "bxstring.h"
#include "bxstring_test.h"
#include "global_mock.h"
@ -34,25 +36,9 @@
void test_bxsfree_null(void **state)
{
(void) state; /* unused */
bxstr_t *bstr = (bxstr_t *) calloc(1, sizeof(bxstr_t));
bstr->ascii = NULL;
bstr->memory = NULL;
bstr->first_char = NULL;
bstr->visible_char = NULL;
bxs_free(bstr);
bxs_free(NULL);
}
void test_ascii_simple(void **state)
{
(void) state; /* unused */
UNUSED(state);
bxstr_t *actual = bxs_from_ascii("foo");
@ -68,30 +54,28 @@ void test_ascii_simple(void **state)
int expected_indexes[] = {0, 1, 2};
assert_array_equal(expected_indexes, actual->first_char, 3);
assert_array_equal(expected_indexes, actual->visible_char, 3);
assert_int_equal(0, actual->offset_start);
assert_int_equal(0, actual->offset_end);
bxs_free(actual);
}
void test_ascii_tabs(void **state)
/*
 * bxs_from_ascii() is expected to return NULL and to report exactly one error when the input contains a character
 * it does not accept.
 * NOTE(review): this span carries two variants of several statements (tab input vs. backspace input, and two
 * different expected messages for collect_err[0]) - presumably only the backspace variant is meant to remain; confirm.
 */
void test_ascii_illegalchar(void **state)
{
(void) state; /* unused */
UNUSED(state);
bxstr_t *actual = bxs_from_ascii("illegal \t tab");
bxstr_t *actual = bxs_from_ascii("illegal \b backspace");
assert_null(actual);
assert_int_equal(1, collect_err_size);
assert_string_equal("boxes: internal error: from_ascii() called with tabs: \"illegal \t tab\"\n", collect_err[0]);
assert_string_equal("boxes: illegal character '\b' (0x00000008) encountered in string\n", collect_err[0]);
}
void test_ascii_null(void **state)
{
(void) state; /* unused */
UNUSED(state);
bxstr_t *actual = bxs_from_ascii(NULL);
@ -104,7 +88,7 @@ void test_ascii_null(void **state)
void test_ansi_unicode_book(void **state)
{
(void) state; /* unused */
UNUSED(state);
uint32_t *ustr32 = u32_strconv_from_arg(
"\x1b[38;5;203m \x1b[0m \x1b[38;5;198m x\x1b[0mxx\x1b[38;5;208m\xc3\xa4\x1b[0m\x1b[38;5;203mb\x1b[0m ",
@ -125,8 +109,6 @@ void test_ansi_unicode_book(void **state)
assert_array_equal(expected_firstchar_idx, actual->first_char, 11);
int expected_vischar_idx[] = {11, 16, 17, 29, 30, 35, 36, 48, 64, 69, 70};
assert_array_equal(expected_vischar_idx, actual->visible_char, 11);
assert_int_equal(0, actual->offset_start);
assert_int_equal(0, actual->offset_end);
BFREE(ustr32);
bxs_free(actual);
@ -136,10 +118,9 @@ void test_ansi_unicode_book(void **state)
void test_ansi_unicode_space_kinds(void **state)
{
(void) state; /* unused */
UNUSED(state);
uint32_t *ustr32 = u32_strconv_from_arg(
"\xe2\x80\x83\xe2\x80\x82 X", /* em-space, en-space, space, x */
uint32_t *ustr32 = u32_strconv_from_arg("\xe2\x80\x83\xe2\x80\x82 X", /* em-space, en-space, space, x */
"UTF-8");
assert_non_null(ustr32);
bxstr_t *actual = bxs_from_unicode(ustr32);
@ -157,8 +138,6 @@ void test_ansi_unicode_space_kinds(void **state)
assert_array_equal(expected_firstchar_idx, actual->first_char, 4);
int expected_vischar_idx[] = {0, 1, 2, 3};
assert_array_equal(expected_vischar_idx, actual->visible_char, 4);
assert_int_equal(0, actual->offset_start);
assert_int_equal(0, actual->offset_end);
BFREE(ustr32);
bxs_free(actual);
@ -168,7 +147,7 @@ void test_ansi_unicode_space_kinds(void **state)
void test_ansi_unicode_chinese(void **state)
{
(void) state; /* unused */
UNUSED(state);
uint32_t *ustr32 = u32_strconv_from_arg(
"\xe5\x85\xac\xe7\x88\xb8\xe8\xa6\x81\xe9\x81\x93\xef\xbc\x81", /* 公爸要道! */
@ -189,8 +168,6 @@ void test_ansi_unicode_chinese(void **state)
assert_array_equal(expected_firstchar_idx, actual->first_char, 5);
int expected_vischar_idx[] = {0, 1, 2, 3, 4};
assert_array_equal(expected_vischar_idx, actual->visible_char, 5);
assert_int_equal(0, actual->offset_start);
assert_int_equal(0, actual->offset_end);
BFREE(ustr32);
bxs_free(actual);
@ -200,7 +177,7 @@ void test_ansi_unicode_chinese(void **state)
void test_ansi_unicode_empty(void **state)
{
(void) state; /* unused */
UNUSED(state);
uint32_t *ustr32 = u32_strconv_from_arg("", "UTF-8");
assert_non_null(ustr32);
@ -219,8 +196,6 @@ void test_ansi_unicode_empty(void **state)
assert_array_equal(expected_firstchar_idx, actual->first_char, 0);
int expected_vischar_idx[] = {};
assert_array_equal(expected_vischar_idx, actual->visible_char, 0);
assert_int_equal(0, actual->offset_start);
assert_int_equal(0, actual->offset_end);
BFREE(ustr32);
bxs_free(actual);
@ -228,35 +203,754 @@ void test_ansi_unicode_empty(void **state)
void test_ansi_unicode_tabs(void **state)
/*
 * A string made up only of blanks: the assertions below expect all of it to count as indent, with no trailing
 * blanks and no invisible characters.
 * NOTE(review): this span carries two `state` statements and two `ustr32` initialisations (tab input and blank
 * input) - presumably only the second of each is meant to remain; confirm.
 */
void test_ansi_unicode_blanks(void **state)
{
(void) state; /* unused */
UNUSED(state);
uint32_t *ustr32 = u32_strconv_from_arg("illegal \t tab", "UTF-8");
uint32_t *ustr32 = u32_strconv_from_arg(" ", "UTF-8");
assert_non_null(ustr32);
bxstr_t *actual = bxs_from_unicode(ustr32);
assert_non_null(actual);
assert_non_null(actual->memory);
assert_string_equal(" ", actual->ascii);
assert_int_equal(4, (int) actual->indent);
assert_int_equal(4, (int) actual->num_columns);
assert_int_equal(4, (int) actual->num_chars);
assert_int_equal(4, (int) actual->num_chars_visible);
assert_int_equal(0, (int) actual->num_chars_invisible);
assert_int_equal(0, (int) actual->trailing);
int expected_firstchar_idx[] = {0, 1, 2, 3};
/* NOTE(review): a length of 0 presumably means no element is compared - confirm whether 4 was intended */
assert_array_equal(expected_firstchar_idx, actual->first_char, 0);
int expected_vischar_idx[] = {0, 1, 2, 3};
/* NOTE(review): same as above - length 0 although four expected values are listed */
assert_array_equal(expected_vischar_idx, actual->visible_char, 0);
BFREE(ustr32);
bxs_free(actual);
}
/**
 * Input made up solely of escape sequences: the assertions expect an empty ASCII representation, all 29 characters
 * counted as invisible, and a single entry of 29 in each of the two index arrays.
 */
void test_ansi_unicode_invisible_only(void **state)
{
    UNUSED(state);

    uint32_t *input32 = u32_strconv_from_arg("\x1b[38;5;203m\x1b[0m\x1b[38;5;198m\x1b[m", "UTF-8");
    assert_non_null(input32);

    bxstr_t *parsed = bxs_from_unicode(input32);
    assert_non_null(parsed);
    assert_non_null(parsed->memory);
    assert_string_equal("", parsed->ascii);

    /* size metrics */
    assert_int_equal(0, (int) parsed->indent);
    assert_int_equal(0, (int) parsed->num_columns);
    assert_int_equal(29, (int) parsed->num_chars);
    assert_int_equal(0, (int) parsed->num_chars_visible);
    assert_int_equal(29, (int) parsed->num_chars_invisible);
    assert_int_equal(0, (int) parsed->trailing);

    /* index arrays */
    int want_first[] = {29};
    assert_array_equal(want_first, parsed->first_char, 1);
    int want_visible[] = {29};
    assert_array_equal(want_visible, parsed->visible_char, 1);

    BFREE(input32);
    bxs_free(parsed);
}
/*
 * bxs_from_unicode() is expected to return NULL and to report exactly one error for input containing a backspace.
 * NOTE(review): two different expected messages are asserted against collect_err[0] below; both cannot match -
 * presumably only the "illegal character" message is meant to remain; confirm.
 */
void test_ansi_unicode_illegalchar(void **state)
{
UNUSED(state);
uint32_t *ustr32 = u32_strconv_from_arg("illegal \b backspace", "UTF-8");
assert_non_null(ustr32);
bxstr_t *actual = bxs_from_unicode(ustr32);
BFREE(ustr32);
assert_null(actual);
assert_int_equal(1, collect_err_size);
assert_string_equal("boxes: internal error: tab encountered in from_unicode()\n", collect_err[0]);
assert_string_equal("boxes: illegal character '\b' (0x00000008) encountered in string\n", collect_err[0]);
}
/**
 * Input containing a tab is accepted by bxs_from_unicode(); the assertions expect the tab to show up as a blank in
 * the ASCII representation and to be counted as part of the indent.
 */
void test_ansi_unicode_tabs(void **state)
{
    UNUSED(state);

    uint32_t *input32 = u32_strconv_from_arg(" \t", "ASCII");
    assert_non_null(input32);

    bxstr_t *parsed = bxs_from_unicode(input32);
    assert_non_null(parsed);
    assert_non_null(parsed->memory);
    assert_string_equal(" ", parsed->ascii); /* tab converted to space in ascii representation */

    /* size metrics */
    assert_int_equal(2, (int) parsed->indent);
    assert_int_equal(2, (int) parsed->num_columns);
    assert_int_equal(2, (int) parsed->num_chars);
    assert_int_equal(2, (int) parsed->num_chars_visible);
    assert_int_equal(0, (int) parsed->num_chars_invisible);
    assert_int_equal(0, (int) parsed->trailing);

    /* index arrays, compared over three entries */
    int want_first[] = {0, 1, 2};
    assert_array_equal(want_first, parsed->first_char, 3);
    int want_visible[] = {0, 1, 2};
    assert_array_equal(want_visible, parsed->visible_char, 3);

    BFREE(input32);
    bxs_free(parsed);
}
/*
 * bxs_from_unicode(NULL) is expected to return NULL and to report exactly one error message.
 * NOTE(review): two expected messages for collect_err[0] appear below ("from_unicode()" and "bxs_from_unicode()"),
 * as do two `state` statements; presumably only the second of each is meant to remain - confirm.
 */
void test_ansi_unicode_null(void **state)
{
(void) state; /* unused */
UNUSED(state);
bxstr_t *actual = bxs_from_unicode(NULL);
assert_null(actual);
assert_int_equal(1, collect_err_size);
assert_string_equal("boxes: internal error: from_unicode() called with NULL\n", collect_err[0]);
assert_string_equal("boxes: internal error: bxs_from_unicode() called with NULL\n", collect_err[0]);
}
/**
 * bxs_strdup(NULL) must yield NULL. For a string mixing visible characters and escape sequences, the duplicate is
 * checked field by field against the expected values.
 */
void test_bxs_strdup(void **state)
{
    UNUSED(state);

    bxstr_t *copy = bxs_strdup(NULL);
    assert_null(copy);

    uint32_t *input32 = u32_strconv_from_arg(" x\x1b[38;5;203mc\x1b[0m\x1b[38;5;198mc\x1b[0mx ", "UTF-8");
    assert_non_null(input32);
    bxstr_t *source = bxs_from_unicode(input32);

    copy = bxs_strdup(source);
    assert_non_null(copy);
    assert_non_null(copy->memory);
    assert_string_equal(" xccx ", copy->ascii);

    /* size metrics */
    assert_int_equal(1, (int) copy->indent);
    assert_int_equal(7, (int) copy->num_columns);
    assert_int_equal(37, (int) copy->num_chars);
    assert_int_equal(7, (int) copy->num_chars_visible);
    assert_int_equal(30, (int) copy->num_chars_invisible);
    assert_int_equal(2, (int) copy->trailing);

    /* index arrays, including the entry past the last visible character */
    int want_first[] = {0, 1, 2, 18, 34, 35, 36, 37};
    assert_array_equal(want_first, copy->first_char, 8);
    int want_visible[] = {0, 1, 13, 29, 34, 35, 36, 37};
    assert_array_equal(want_visible, copy->visible_char, 8);

    BFREE(input32);
    bxs_free(copy);
    bxs_free(source);
}
/**
 * bxs_trimdup() on a NULL string is expected to return NULL without reporting any error.
 */
void test_bxs_trimdup_null(void **state)
{
    UNUSED(state);

    bxstr_t *trimmed = bxs_trimdup(NULL, 0, 0);

    assert_null(trimmed);
    assert_int_equal(0, collect_err_size);
}
/**
 * A start index far beyond the end of the string must make bxs_trimdup() return NULL and report the
 * "start_idx out of bounds" error.
 */
void test_bxs_trimdup_invalid_startidx(void **state)
{
    UNUSED(state);

    bxstr_t *source = bxs_from_ascii("foo");
    bxstr_t *trimmed = bxs_trimdup(source, 1000, 1000);

    assert_null(trimmed);
    assert_int_equal(1, collect_err_size);
    assert_string_equal("boxes: internal error: start_idx out of bounds in bxs_trimdup()\n", collect_err[0]);

    bxs_free(source);
}
/**
 * An end index far beyond the end of the string must make bxs_trimdup() return NULL and report the
 * "end_idx out of bounds" error.
 */
void test_bxs_trimdup_invalid_endidx(void **state)
{
    UNUSED(state);

    bxstr_t *source = bxs_from_ascii("foo");
    bxstr_t *trimmed = bxs_trimdup(source, 0, 1000);

    assert_null(trimmed);
    assert_int_equal(1, collect_err_size);
    assert_string_equal("boxes: internal error: end_idx out of bounds in bxs_trimdup()\n", collect_err[0]);

    bxs_free(source);
}
/**
 * An end index smaller than the start index must make bxs_trimdup() return NULL and report the
 * "end_idx before start_idx" error.
 */
void test_bxs_trimdup_invalid_endidx2(void **state)
{
    UNUSED(state);

    bxstr_t *source = bxs_from_ascii("foo");
    bxstr_t *trimmed = bxs_trimdup(source, 2, 1);

    assert_null(trimmed);
    assert_int_equal(1, collect_err_size);
    assert_string_equal("boxes: internal error: end_idx before start_idx in bxs_trimdup()\n", collect_err[0]);

    bxs_free(source);
}
/**
 * Cutting the range 2..6 out of "A, foo, B" and trimming it is expected to leave "foo" (3 characters).
 */
void test_bxs_trimdup_normal(void **state)
{
    UNUSED(state);

    bxstr_t *source = bxs_from_ascii("A, foo, B");
    bxstr_t *trimmed = bxs_trimdup(source, 2, 6);

    assert_non_null(trimmed);
    assert_string_equal("foo", trimmed->ascii);
    assert_int_equal(3, trimmed->num_chars);

    bxs_free(source);
    bxs_free(trimmed);
}
/**
 * When the selected range holds nothing but blanks, bxs_trimdup() is expected to return an empty, non-NULL string.
 */
void test_bxs_trimdup_vanish(void **state)
{
    UNUSED(state);

    bxstr_t *source = bxs_from_ascii("A, , B");
    bxstr_t *trimmed = bxs_trimdup(source, 2, 6);

    assert_non_null(trimmed);
    assert_string_equal("", trimmed->ascii);
    assert_int_equal(0, trimmed->num_chars);

    bxs_free(source);
    bxs_free(trimmed);
}
/**
 * bxs_trimdup() over the whole visible range of a string with colored blanks around two colored 'X' characters:
 * the result is expected to keep only "XX" together with the escape sequences that wrap them.
 */
void test_bxs_trimdup_ansi(void **state)
{
    UNUSED(state);

    uint32_t *ustr32 = u32_strconv_from_arg("\x1b[38;5;203m \x1b[0m \x1b[38;5;203m \x1b[0m"
            "\x1b[38;5;203mX\x1b[0m\x1b[38;5;203mX\x1b[0m\x1b[38;5;198m \x1b[0m ", "UTF-8");
    assert_non_null(ustr32);

    bxstr_t *input = bxs_from_unicode(ustr32);
    assert_non_null(input); /* fail cleanly instead of dereferencing NULL below */
    assert_int_equal(7, input->num_chars_visible);
    assert_int_equal(3, input->indent);
    assert_int_equal(2, input->trailing);

    bxstr_t *actual = bxs_trimdup(input, 0, input->num_chars_visible);
    assert_non_null(actual);
    assert_string_equal("XX", actual->ascii);
    assert_int_equal(32, actual->num_chars);
    assert_int_equal(30, actual->num_chars_invisible);
    assert_int_equal(2, actual->num_chars_visible);
    assert_int_equal(2, actual->num_columns);
    assert_int_equal(0, actual->indent);

    BFREE(ustr32);
    bxs_free(input);
    bxs_free(actual);
}
void test_bxs_trimdup_ansi_same(void **state)
{
UNUSED(state);
uint32_t *ustr32 = u32_strconv_from_arg("X \x1b[38;5;203mX\x1b[0m\x1b[38;5;198m \x1b[0mX", "UTF-8");
assert_non_null(ustr32);
bxstr_t *input = bxs_from_unicode(ustr32);
assert_int_equal(5, input->num_chars_visible);
assert_int_equal(0, input->indent);
assert_int_equal(0, input->trailing);
bxstr_t *actual = bxs_trimdup(input, 0, input->num_chars_visible);
assert_non_null(actual);
assert_string_equal("X X X", actual->ascii);
assert_int_equal(input->indent, actual->indent);
assert_int_equal(input->num_columns, actual->num_columns);
assert_int_equal(input->num_chars, actual->num_chars);
assert_int_equal(input->num_chars_visible, actual->num_chars_visible);
assert_int_equal(input->num_chars_invisible, actual->num_chars_invisible);
assert_int_equal(input->trailing, actual->trailing);
assert_memory_equal(input->memory, actual->memory, input->num_chars * sizeof(ucs4_t));
assert_memory_equal(input->ascii, actual->ascii, input->num_columns * sizeof(char));
assert_memory_equal(input->first_char, actual->first_char, input->num_chars_visible * sizeof(size_t));
assert_memory_equal(input->visible_char, actual->visible_char, input->num_chars_visible * sizeof(size_t));
BFREE(ustr32);
bxs_free(input);
bxs_free(actual);
}
/**
 * bxs_strcat() with a NULL on either side: both bxs_strcat(NULL, "x") and bxs_strcat("x", NULL) are expected to
 * produce a non-NULL string reading "x".
 */
void test_bxs_strcat_empty(void **state)
{
    UNUSED(state);

    uint32_t *x32 = u32_strconv_from_arg("x", "UTF-8");
    assert_non_null(x32);
    bxstr_t *x_str = bxs_from_unicode(x32);

    /* NULL destination */
    bxstr_t *joined = bxs_strcat(NULL, x32);
    assert_non_null(joined);
    assert_string_equal("x", joined->ascii);
    bxs_free(joined);

    /* NULL source */
    joined = bxs_strcat(x_str, NULL);
    assert_non_null(joined);
    assert_string_equal("x", joined->ascii);
    bxs_free(joined);

    BFREE(x32);
    bxs_free(x_str);
}
/**
 * Appending an empty UTF-32 string to "x" is expected to yield "x".
 */
void test_bxs_strcat_empty2(void **state)
{
    UNUSED(state);

    uint32_t *x32 = u32_strconv_from_arg("x", "UTF-8");
    uint32_t *empty32 = u32_strconv_from_arg("", "UTF-8");
    assert_non_null(x32);
    assert_non_null(empty32);

    bxstr_t *base = bxs_from_unicode(x32);
    bxstr_t *joined = bxs_strcat(base, empty32);

    assert_non_null(joined);
    assert_string_equal("x", joined->ascii);

    BFREE(x32);
    BFREE(empty32);
    bxs_free(joined);
    bxs_free(base);
}
/**
 * Appending "x" to an empty bxstr_t is expected to yield "x".
 */
void test_bxs_strcat_empty3(void **state)
{
    UNUSED(state);

    uint32_t *empty32 = u32_strconv_from_arg("", "UTF-8");
    uint32_t *x32 = u32_strconv_from_arg("x", "UTF-8");
    assert_non_null(empty32);
    assert_non_null(x32);

    bxstr_t *base = bxs_from_unicode(empty32);
    bxstr_t *joined = bxs_strcat(base, x32);

    assert_non_null(joined);
    assert_string_equal("x", joined->ascii);

    BFREE(empty32);
    BFREE(x32);
    bxs_free(joined);
    bxs_free(base);
}
/**
 * Concatenating two colored strings: the result is checked for its ASCII text "AB-ab", its counters, and both
 * index arrays (six entries each).
 */
void test_bxs_strcat(void **state)
{
    UNUSED(state);

    uint32_t *left32 = u32_strconv_from_arg("\x1b[38;5;203mA\x1b[0m\x1b[38;5;198mB\x1b[0m", "UTF-8");
    uint32_t *right32 = u32_strconv_from_arg("-\x1b[38;5;203ma\x1b[0m\x1b[38;5;198mb\x1b[0m", "UTF-8");
    assert_non_null(left32);
    assert_non_null(right32);

    bxstr_t *base = bxs_from_unicode(left32);
    bxstr_t *joined = bxs_strcat(base, right32);
    assert_non_null(joined);
    assert_non_null(joined->memory);
    assert_string_equal("AB-ab", joined->ascii);

    /* size metrics */
    assert_int_equal(0, (int) joined->indent);
    assert_int_equal(5, (int) joined->num_columns);
    assert_int_equal(65, (int) joined->num_chars);
    assert_int_equal(5, (int) joined->num_chars_visible);
    assert_int_equal(60, (int) joined->num_chars_invisible);
    assert_int_equal(0, (int) joined->trailing);

    /* index arrays */
    int want_first[] = {0, 16, 32, 33, 49, 65};
    assert_array_equal(want_first, joined->first_char, 6);
    int want_visible[] = {11, 27, 32, 44, 60, 65};
    assert_array_equal(want_visible, joined->visible_char, 6);

    BFREE(left32);
    BFREE(right32);
    bxs_free(joined);
    bxs_free(base);
}
/**
 * bxs_strchr() without a cursor: 'a' is expected at offset 11 of the string's memory, 'm' (which occurs only
 * inside escape sequences here) is expected not to be found, and a NULL string is expected to yield NULL.
 */
void test_bxs_strchr(void **state)
{
    UNUSED(state);

    uint32_t *ustr32 = u32_strconv_from_arg("\x1b[38;5;203ma\x1b[0m\x1b[38;5;198mb\x1b[0m", "UTF-8");
    assert_non_null(ustr32);
    bxstr_t *bxstr = bxs_from_unicode(ustr32);
    assert_non_null(bxstr); /* bxstr->memory is read below */

    ucs4_t char_a = 0x00000061;
    ucs4_t char_m = 0x0000006D;

    uint32_t *found = bxs_strchr(bxstr, char_a, NULL);
    assert_non_null(found); /* pointer subtraction on NULL would be undefined; fail cleanly instead */
    assert_int_equal(11, found - bxstr->memory);

    found = bxs_strchr(bxstr, char_m, NULL);
    assert_null(found);

    found = bxs_strchr(NULL, char_a, NULL);
    assert_null(found);

    BFREE(ustr32);
    bxs_free(bxstr);
}
/**
 * In a string consisting only of escape sequences, bxs_strchr() is expected not to find 'm', although that
 * character occurs inside the sequences.
 */
void test_bxs_strchr_empty(void **state)
{
    UNUSED(state);

    uint32_t *input32 = u32_strconv_from_arg("\x1b[38;5;203m\x1b[0m", "UTF-8");
    assert_non_null(input32);
    bxstr_t *haystack = bxs_from_unicode(input32);

    ucs4_t letter_m = 0x0000006D;
    uint32_t *hit = bxs_strchr(haystack, letter_m, NULL);
    assert_null(hit);

    BFREE(input32);
    bxs_free(haystack);
}
/**
 * bxs_strchr() with a cursor: starting from -1, successive searches for ',' are expected to leave the cursor at 3,
 * then at 8, and a third search is expected to find nothing and leave the cursor at 8.
 */
void test_bxs_strchr_cursor(void **state)
{
    UNUSED(state);

    uint32_t *input32 = u32_strconv_from_arg("foo, \x1b[38;5;203mBAR\x1b[0m, \x1b[38;5;198mBAZ\x1b[0m", "UTF-8");
    assert_non_null(input32);
    bxstr_t *haystack = bxs_from_unicode(input32);

    ucs4_t comma = 0x0000002C; /* ',' */
    int pos = -1;

    /* first comma */
    uint32_t *hit = bxs_strchr(haystack, comma, &pos);
    assert_non_null(hit);
    assert_int_equal(3, pos);
    assert_memory_equal(&comma, hit, sizeof(ucs4_t));

    /* second comma */
    hit = bxs_strchr(haystack, comma, &pos);
    assert_non_null(hit);
    assert_int_equal(8, pos);
    assert_memory_equal(&comma, hit, sizeof(ucs4_t));

    /* no third comma; cursor stays where it was */
    hit = bxs_strchr(haystack, comma, &pos);
    assert_null(hit);
    assert_int_equal(8, pos);

    BFREE(input32);
    bxs_free(haystack);
}
/**
 * bxs_trim() on a string with blanks (including an em-space and colored blanks) on both sides of "trimmed":
 * the result is expected to contain just "trimmed", with no invisible characters left.
 */
void test_bxs_trim(void **state)
{
    UNUSED(state);

    uint32_t *ustr32 = /* em-space */
            u32_strconv_from_arg("\xe2\x80\x83 \x1b[38;5;203m \x1b[0mtrimmed\x1b[38;5;198m \x1b[0m ", "UTF-8");
    assert_non_null(ustr32);
    bxstr_t *bxstr = bxs_from_unicode(ustr32);
    assert_non_null(bxstr); /* fail cleanly instead of dereferencing NULL below */
    assert_int_equal(3, (int) bxstr->indent);

    bxstr_t *actual = bxs_trim(bxstr);
    assert_non_null(actual);
    assert_non_null(actual->memory);
    assert_string_equal("trimmed", actual->ascii);
    assert_int_equal(0, (int) actual->indent);
    assert_int_equal(7, (int) actual->num_columns);
    assert_int_equal(7, (int) actual->num_chars);
    assert_int_equal(7, (int) actual->num_chars_visible);
    assert_int_equal(0, (int) actual->num_chars_invisible);
    assert_int_equal(0, (int) actual->trailing);

    /* index arrays, including the entry past the last visible character */
    int expected_firstchar_idx[] = {0, 1, 2, 3, 4, 5, 6, 7};
    assert_array_equal(expected_firstchar_idx, actual->first_char, 8);
    int expected_vischar_idx[] = {0, 1, 2, 3, 4, 5, 6, 7};
    assert_array_equal(expected_vischar_idx, actual->visible_char, 8);

    BFREE(ustr32);
    bxs_free(actual);
    bxs_free(bxstr);
}
/**
 * bxs_trim() on a string that holds only blanks (plain, em-space, and colored): the input is expected to report
 * everything as indent, and the trimmed result is expected to be completely empty.
 */
void test_bxs_trim_blanks(void **state)
{
    UNUSED(state);

    /* em-space */
    uint32_t *ustr32 = u32_strconv_from_arg(" \xe2\x80\x83\x1b[38;5;203m \x1b[0m \x1b[38;5;198m \x1b[0m ", "UTF-8");
    assert_non_null(ustr32);
    bxstr_t *bxstr = bxs_from_unicode(ustr32);
    assert_non_null(bxstr); /* fail cleanly instead of dereferencing NULL below */
    assert_int_equal(7, (int) bxstr->indent);
    assert_int_equal(0, (int) bxstr->trailing);

    bxstr_t *actual = bxs_trim(bxstr);
    assert_non_null(actual);
    assert_non_null(actual->memory);
    assert_string_equal("", actual->ascii);
    assert_int_equal(0, (int) actual->indent);
    assert_int_equal(0, (int) actual->num_columns);
    assert_int_equal(0, (int) actual->num_chars);
    assert_int_equal(0, (int) actual->num_chars_visible);
    assert_int_equal(0, (int) actual->num_chars_invisible);
    assert_int_equal(0, (int) actual->trailing);

    /* a single index entry of 0 is expected in each array */
    int expected_firstchar_idx[] = {0};
    assert_array_equal(expected_firstchar_idx, actual->first_char, 1);
    int expected_vischar_idx[] = {0};
    assert_array_equal(expected_vischar_idx, actual->visible_char, 1);

    BFREE(ustr32);
    bxs_free(actual);
    bxs_free(bxstr);
}
/**
 * bxs_trim(NULL) must yield NULL. For a string without leading or trailing blanks, the result is expected to keep
 * all characters, visible and invisible.
 */
void test_bxs_trim_none(void **state)
{
    UNUSED(state);

    bxstr_t *trimmed = bxs_trim(NULL);
    assert_null(trimmed);

    uint32_t *input32 = u32_strconv_from_arg("\x1b[38;5;203mX\x1b[0mX\x1b[38;5;198mX\x1b[0mX", "ASCII");
    bxstr_t *source = bxs_from_unicode(input32);

    trimmed = bxs_trim(source);
    assert_non_null(trimmed);
    assert_non_null(trimmed->memory);
    assert_string_equal("XXXX", trimmed->ascii);

    /* size metrics */
    assert_int_equal(0, (int) trimmed->indent);
    assert_int_equal(4, (int) trimmed->num_columns);
    assert_int_equal(34, (int) trimmed->num_chars);
    assert_int_equal(4, (int) trimmed->num_chars_visible);
    assert_int_equal(30, (int) trimmed->num_chars_invisible);
    assert_int_equal(0, (int) trimmed->trailing);

    /* index arrays */
    int want_first[] = {0, 16, 17, 33, 34};
    assert_array_equal(want_first, trimmed->first_char, 5);
    int want_visible[] = {11, 16, 28, 33, 34};
    assert_array_equal(want_visible, trimmed->visible_char, 5);

    BFREE(input32);
    bxs_free(trimmed);
    bxs_free(source);
}
/**
 * bxs_rtrim() on "xx" followed by plain and colored blanks: the result is expected to be just "xx", with the
 * escape sequences around the removed blanks gone as well.
 */
void test_bxs_rtrim(void **state)
{
    UNUSED(state);

    uint32_t *input32 = u32_strconv_from_arg("xx\x1b[38;5;203m \x1b[0m \x1b[38;5;198m \x1b[0m ", "UTF-8");
    assert_non_null(input32);
    bxstr_t *source = bxs_from_unicode(input32);

    bxstr_t *trimmed = bxs_rtrim(source);
    assert_non_null(trimmed);
    assert_non_null(trimmed->memory);
    assert_string_equal("xx", trimmed->ascii);

    /* size metrics */
    assert_int_equal(0, (int) trimmed->indent);
    assert_int_equal(2, (int) trimmed->num_columns);
    assert_int_equal(2, (int) trimmed->num_chars);
    assert_int_equal(2, (int) trimmed->num_chars_visible);
    assert_int_equal(0, (int) trimmed->num_chars_invisible);
    assert_int_equal(0, (int) trimmed->trailing);

    /* index arrays */
    int want_first[] = {0, 1, 2};
    assert_array_equal(want_first, trimmed->first_char, 3);
    int want_visible[] = {0, 1, 2};
    assert_array_equal(want_visible, trimmed->visible_char, 3);

    BFREE(input32);
    bxs_free(trimmed);
    bxs_free(source);
}
/**
 * bxs_rtrim(NULL) must yield NULL. For a string that ends in a non-blank character, the result is expected to
 * keep every character, including the colored blank in the middle.
 */
void test_bxs_rtrim_empty(void **state)
{
    UNUSED(state);

    bxstr_t *trimmed = bxs_rtrim(NULL);
    assert_null(trimmed);

    uint32_t *input32 = u32_strconv_from_arg("X\x1b[38;5;203m \x1b[0mX", "UTF-8");
    assert_non_null(input32);
    bxstr_t *source = bxs_from_unicode(input32);

    trimmed = bxs_rtrim(source);
    assert_non_null(trimmed);
    assert_non_null(trimmed->memory);
    assert_string_equal("X X", trimmed->ascii);

    /* size metrics */
    assert_int_equal(0, (int) trimmed->indent);
    assert_int_equal(3, (int) trimmed->num_columns);
    assert_int_equal(18, (int) trimmed->num_chars);
    assert_int_equal(3, (int) trimmed->num_chars_visible);
    assert_int_equal(15, (int) trimmed->num_chars_invisible);
    assert_int_equal(0, (int) trimmed->trailing);

    /* index arrays */
    int want_first[] = {0, 1, 17, 18};
    assert_array_equal(want_first, trimmed->first_char, 4);
    int want_visible[] = {0, 12, 17, 18};
    assert_array_equal(want_visible, trimmed->visible_char, 4);

    BFREE(input32);
    bxs_free(trimmed);
    bxs_free(source);
}
/**
 * bxs_to_output(NULL) is expected to produce the text "NULL"; for a plain ASCII string it is expected to produce
 * that same text. Each returned buffer is released by the test.
 */
void test_bxs_to_output(void **state)
{
    UNUSED(state);

    /* NULL input */
    char *rendered = bxs_to_output(NULL);
    assert_string_equal("NULL", rendered);
    BFREE(rendered);

    /* plain ASCII input */
    bxstr_t *source = bxs_from_ascii("foobar");
    rendered = bxs_to_output(source);
    assert_string_equal("foobar", rendered);
    BFREE(rendered);

    bxs_free(source);
}
/**
 * A NULL string is expected to count as empty: bxs_is_empty(NULL) must return 1.
 */
void test_bxs_is_empty_null(void **state)
{
    UNUSED(state);

    int is_empty = bxs_is_empty(NULL);

    assert_int_equal(1, is_empty);
}
/**
 * bxs_strcmp() ordering: two NULLs compare equal, a NULL first argument yields 1, a NULL second argument yields
 * -1, and "alice" is expected to sort before "bob".
 */
void test_bxs_strcmp(void **state)
{
    UNUSED(state);

    int order = bxs_strcmp(NULL, NULL);
    assert_int_equal(0, order);

    bxstr_t *alice = bxs_from_ascii("alice");
    bxstr_t *bob = bxs_from_ascii("bob");

    /* NULL on one side */
    order = bxs_strcmp(NULL, alice);
    assert_int_equal(1, order);
    order = bxs_strcmp(alice, NULL);
    assert_int_equal(-1, order);

    /* two real strings, both directions */
    order = bxs_strcmp(alice, bob);
    assert_int_equal(-1, order);
    order = bxs_strcmp(bob, alice);
    assert_int_equal(1, order);

    bxs_free(alice);
    bxs_free(bob);
}
/**
 * Error paths of bxs_valid_anywhere(): a NULL string is invalid (and, when an error position is requested, it is
 * expected to be set to 0), and a string whose memory has a backspace patched in is invalid even when no error
 * position is requested.
 */
void test_bxs_valid_anywhere_error(void **state)
{
    UNUSED(state);

    size_t error_pos = 42; /* non-zero start value, so the reset to 0 below is observable */
    assert_int_equal(0, bxs_valid_anywhere(NULL, NULL));
    assert_int_equal(0, bxs_valid_anywhere(NULL, &error_pos));
    assert_int_equal(0, (int) error_pos);

    bxstr_t *bxstr = bxs_from_ascii("illegal X backspace");
    assert_non_null(bxstr); /* fail cleanly instead of writing through NULL below */

    /* bxs_from_ascii() is built from valid text, so the illegal character is patched in afterwards */
    ucs4_t char_backspace = 0x00000008;
    bxstr->memory[8] = char_backspace;
    assert_int_equal(0, bxs_valid_anywhere(bxstr, NULL));

    bxs_free(bxstr);
}
/**
 * bxs_valid_in_filename() is expected to reject both a string consisting only of escape sequences and a string
 * that mixes visible text with escape sequences.
 */
void test_bxs_valid_in_filename_error(void **state)
{
    UNUSED(state);

    uint32_t *input32 = u32_strconv_from_arg("\x1b[38;5;203m\x1b[0m", "ASCII");
    bxstr_t *only_invisible = bxs_from_unicode(input32);
    BFREE(input32);

    input32 = u32_strconv_from_arg("\x1b[38;5;203m_VISIBLE_\x1b[0m", "ASCII");
    bxstr_t *mixed = bxs_from_unicode(input32);

    assert_int_equal(0, bxs_valid_in_filename(only_invisible, NULL));
    assert_int_equal(0, bxs_valid_in_filename(mixed, NULL));

    BFREE(input32);
    bxs_free(only_invisible);
    bxs_free(mixed);
}
// TODO test case for incomplete/broken escape sequences
void test_bxs_free_null(void **state)
{
UNUSED(state);
bxstr_t *bstr = (bxstr_t *) calloc(1, sizeof(bxstr_t));
bstr->ascii = NULL;
bstr->memory = NULL;
bstr->first_char = NULL;
bstr->visible_char = NULL;
bxs_free(bstr);
bxs_free(NULL);
}
/* vim: set cindent sw=4: */

View File

@ -22,17 +22,56 @@
void test_ascii_simple(void **state);
void test_ascii_tabs(void **state);
void test_ascii_illegalchar(void **state);
void test_ascii_null(void **state);
void test_ansi_unicode_book(void **state);
void test_ansi_unicode_space_kinds(void **state);
void test_ansi_unicode_chinese(void **state);
void test_ansi_unicode_empty(void **state);
void test_ansi_unicode_blanks(void **state);
void test_ansi_unicode_invisible_only(void **state);
void test_ansi_unicode_illegalchar(void **state);
void test_ansi_unicode_tabs(void **state);
void test_ansi_unicode_null(void **state);
void test_bxsfree_null(void **state);
void test_bxs_strdup(void **state);
void test_bxs_trimdup_null(void **state);
void test_bxs_trimdup_invalid_startidx(void **state);
void test_bxs_trimdup_invalid_endidx(void **state);
void test_bxs_trimdup_invalid_endidx2(void **state);
void test_bxs_trimdup_normal(void **state);
void test_bxs_trimdup_vanish(void **state);
void test_bxs_trimdup_ansi(void **state);
void test_bxs_trimdup_ansi_same(void **state);
void test_bxs_strcat(void **state);
void test_bxs_strcat_empty(void **state);
void test_bxs_strcat_empty2(void **state);
void test_bxs_strcat_empty3(void **state);
void test_bxs_strchr(void **state);
void test_bxs_strchr_empty(void **state);
void test_bxs_strchr_cursor(void **state);
void test_bxs_trim(void **state);
void test_bxs_trim_blanks(void **state);
void test_bxs_trim_none(void **state);
void test_bxs_rtrim(void **state);
void test_bxs_rtrim_empty(void **state);
void test_bxs_to_output(void **state);
void test_bxs_is_empty_null(void **state);
void test_bxs_strcmp(void **state);
void test_bxs_valid_anywhere_error(void **state);
void test_bxs_valid_in_filename_error(void **state);
void test_bxs_free_null(void **state);
#endif

161
utest/unicode_test.c Normal file
View File

@ -0,0 +1,161 @@
/*
* boxes - Command line filter to draw/remove ASCII boxes around text
* Copyright (c) 1999-2023 Thomas Jensen and the boxes contributors
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
* License, version 3, as published by the Free Software Foundation.
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
* You should have received a copy of the GNU General Public License along with this program.
* If not, see <https://www.gnu.org/licenses/>.
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
/*
* Unit tests of the 'unicode' module
*/
#include "config.h"
#include <setjmp.h>
#include <stdarg.h>
#include <stddef.h>
#include <cmocka.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "boxes.h"
#include "unicode.h"
#include "unicode_test.h"
/**
 * to_utf32() is expected to give the same code point for 'A' as the string conversion does, char_space for a
 * blank, and char_nul for the ESC character.
 */
void test_to_utf32(void **state)
{
    UNUSED(state);

    uint32_t *ustr32 = u32_strconv_from_arg("A", "ASCII");
    assert_non_null(ustr32);

    ucs4_t actual_A = to_utf32('A');
    ucs4_t actual_space = to_utf32(' ');
    ucs4_t actual_invalid = to_utf32('\x1B');

    /* only the first code point of the converted string is compared */
    assert_int_equal(0, memcmp(ustr32, &actual_A, sizeof(ucs4_t)));
    assert_int_equal(0, memcmp(&char_space, &actual_space, sizeof(ucs4_t)));
    assert_int_equal(0, memcmp(&char_nul, &actual_invalid, sizeof(ucs4_t)));

    free(ustr32); /* the converted string was allocated for this test and was previously never released */
}
void test_is_blank(void **state)
{
UNUSED(state);
const ucs4_t char_emspace = 0x00002003;
const ucs4_t char_enspace = 0x00002002;
assert_int_equal(1, is_blank(char_space));
assert_int_equal(1, is_blank(char_emspace));
assert_int_equal(1, is_blank(char_enspace));
assert_int_equal(1, is_blank(char_tab));
assert_int_equal(0, is_blank(to_utf32('x')));
assert_int_equal(0, is_blank(char_cr));
assert_int_equal(0, is_blank(char_newline));
assert_int_equal(0, is_blank(char_esc));
}
void test_is_allowed_in_sample(void **state)
{
UNUSED(state);
const ucs4_t char_bell = 0x00000007;
const ucs4_t char_backsp = 0x00000008;
const ucs4_t char_u_umlaut = 0x000000fc;
assert_int_equal(1, is_allowed_in_sample(to_utf32('x')));
assert_int_equal(1, is_allowed_in_sample(char_u_umlaut));
assert_int_equal(1, is_allowed_in_sample(char_space));
assert_int_equal(1, is_allowed_in_sample(char_esc));
assert_int_equal(1, is_allowed_in_sample(char_cr));
assert_int_equal(1, is_allowed_in_sample(char_newline));
assert_int_equal(1, is_allowed_in_sample(char_tab));
assert_int_equal(0, is_allowed_in_sample(char_bell));
assert_int_equal(0, is_allowed_in_sample(char_backsp));
}
void test_is_allowed_in_shape(void **state)
{
UNUSED(state);
const ucs4_t char_bell = 0x00000007;
const ucs4_t char_backsp = 0x00000008;
const ucs4_t char_u_umlaut = 0x000000fc;
assert_int_equal(1, is_allowed_in_shape(to_utf32('x')));
assert_int_equal(1, is_allowed_in_shape(char_u_umlaut));
assert_int_equal(1, is_allowed_in_shape(char_space));
assert_int_equal(1, is_allowed_in_shape(char_esc));
assert_int_equal(1, is_allowed_in_shape(char_tab)); /* But tabs are deprecated in shapes! Temporary only. */
assert_int_equal(0, is_allowed_in_shape(char_bell));
assert_int_equal(0, is_allowed_in_shape(char_backsp));
assert_int_equal(0, is_allowed_in_shape(char_cr));
assert_int_equal(0, is_allowed_in_shape(char_newline));
}
void test_is_allowed_in_filename(void **state)
{
UNUSED(state);
const ucs4_t char_bell = 0x00000007;
const ucs4_t char_backsp = 0x00000008;
const ucs4_t char_u_umlaut = 0x000000fc;
assert_int_equal(1, is_allowed_in_filename(to_utf32('x')));
assert_int_equal(1, is_allowed_in_filename(char_u_umlaut));
assert_int_equal(1, is_allowed_in_filename(char_space));
assert_int_equal(1, is_allowed_in_filename(char_tab));
assert_int_equal(0, is_allowed_in_filename(char_esc));
assert_int_equal(0, is_allowed_in_filename(char_bell));
assert_int_equal(0, is_allowed_in_filename(char_backsp));
assert_int_equal(0, is_allowed_in_filename(char_cr));
assert_int_equal(0, is_allowed_in_filename(char_newline));
}
void test_is_allowed_in_kv_string(void **state)
{
UNUSED(state);
const ucs4_t char_bell = 0x00000007;
const ucs4_t char_backsp = 0x00000008;
const ucs4_t char_u_umlaut = 0x000000fc;
assert_int_equal(1, is_allowed_in_kv_string(to_utf32('x')));
assert_int_equal(1, is_allowed_in_kv_string(char_u_umlaut));
assert_int_equal(1, is_allowed_in_kv_string(char_space));
assert_int_equal(1, is_allowed_in_kv_string(char_tab));
assert_int_equal(0, is_allowed_in_kv_string(char_esc));
assert_int_equal(0, is_allowed_in_kv_string(char_bell));
assert_int_equal(0, is_allowed_in_kv_string(char_backsp));
assert_int_equal(0, is_allowed_in_kv_string(char_cr));
assert_int_equal(0, is_allowed_in_kv_string(char_newline));
}
/* vim: set cindent sw=4: */

33
utest/unicode_test.h Normal file
View File

@ -0,0 +1,33 @@
/*
* boxes - Command line filter to draw/remove ASCII boxes around text
* Copyright (c) 1999-2023 Thomas Jensen and the boxes contributors
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
* License, version 3, as published by the Free Software Foundation.
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
* You should have received a copy of the GNU General Public License along with this program.
* If not, see <https://www.gnu.org/licenses/>.
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
/*
* Unit tests of the 'unicode' module
*/
#ifndef UNICODE_TEST_H
#define UNICODE_TEST_H
/* Checks to_utf32() for a letter, a blank, and the ESC character. */
void test_to_utf32(void **state);
/* Checks which characters is_blank() accepts (space kinds, tab) and rejects. */
void test_is_blank(void **state);
/* The four tests below check the accepted and rejected characters of the respective is_allowed_in_*() function. */
void test_is_allowed_in_sample(void **state);
void test_is_allowed_in_shape(void **state);
void test_is_allowed_in_filename(void **state);
void test_is_allowed_in_kv_string(void **state);
#endif
/* vim: set cindent sw=4: */