mirror of
https://codeberg.org/dnkl/foot.git
synced 2026-02-04 04:06:06 -05:00
398 lines
8.4 KiB
C
398 lines
8.4 KiB
C
|
|
#include "char32.h"
|
|||
|
|
|
|||
|
|
#include <stdlib.h>
|
|||
|
|
#include <string.h>
|
|||
|
|
#include <locale.h>
|
|||
|
|
|
|||
|
|
#include <wctype.h>
|
|||
|
|
#include <wchar.h>
|
|||
|
|
|
|||
|
|
|
|||
|
|
#define LOG_MODULE "char32"
|
|||
|
|
#define LOG_ENABLE_DBG 0
|
|||
|
|
#include "log.h"
|
|||
|
|
#include "debug.h"
|
|||
|
|
#include "macros.h"
|
|||
|
|
#include "xmalloc.h"
|
|||
|
|
|
|||
|
|
/*
|
|||
|
|
* For now, assume we can map directly to the corresponding wchar_t
|
|||
|
|
* functions. This is true if:
|
|||
|
|
*
|
|||
|
|
* - both data types have the same size
|
|||
|
|
* - both use the same encoding (though we require that encoding to be UTF-32)
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
_Static_assert(
|
|||
|
|
sizeof(wchar_t) == sizeof(char32_t), "wchar_t vs. char32_t size mismatch");
|
|||
|
|
|
|||
|
|
#if !defined(__STDC_UTF_32__) || !__STDC_UTF_32__
|
|||
|
|
#error "char32_t does not use UTF-32"
|
|||
|
|
#endif
|
|||
|
|
#if (!defined(__STDC_ISO_10646__) || !__STDC_ISO_10646__) && !defined(__FreeBSD__)
|
|||
|
|
#error "wchar_t does not use UTF-32"
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
xassert(c32len(U"") == 0);
|
|||
|
|
xassert(c32len(U"foobar") == 6);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
xassert(c32cmp(U"foobar", U"foobar") == 0);
|
|||
|
|
xassert(c32cmp(U"foo", U"foobar") < 0);
|
|||
|
|
xassert(c32cmp(U"foobar", U"foo") > 0);
|
|||
|
|
xassert(c32cmp(U"a", U"b") < 0);
|
|||
|
|
xassert(c32cmp(U"b", U"a") > 0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
char32_t copy[16];
|
|||
|
|
char32_t *ret = c32ncpy(copy, U"foobar", 16);
|
|||
|
|
|
|||
|
|
xassert(ret == copy);
|
|||
|
|
xassert(copy[0] == U'f');
|
|||
|
|
xassert(copy[1] == U'o');
|
|||
|
|
xassert(copy[2] == U'o');
|
|||
|
|
xassert(copy[3] == U'b');
|
|||
|
|
xassert(copy[4] == U'a');
|
|||
|
|
xassert(copy[5] == U'r');
|
|||
|
|
|
|||
|
|
unsigned char zeroes[(16 - 6) * sizeof(copy[0])] = {0};
|
|||
|
|
xassert(memcmp(©[6], zeroes, sizeof(zeroes)) == 0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
char32_t copy[16];
|
|||
|
|
memset(copy, 0x55, sizeof(copy));
|
|||
|
|
|
|||
|
|
char32_t *ret = c32cpy(copy, U"foobar");
|
|||
|
|
|
|||
|
|
xassert(ret == copy);
|
|||
|
|
xassert(copy[0] == U'f');
|
|||
|
|
xassert(copy[1] == U'o');
|
|||
|
|
xassert(copy[2] == U'o');
|
|||
|
|
xassert(copy[3] == U'b');
|
|||
|
|
xassert(copy[4] == U'a');
|
|||
|
|
xassert(copy[5] == U'r');
|
|||
|
|
xassert(copy[6] == U'\0');
|
|||
|
|
|
|||
|
|
unsigned char fives[(16 - 6 - 1) * sizeof(copy[0])];
|
|||
|
|
memset(fives, 0x55, sizeof(fives));
|
|||
|
|
xassert(memcmp(©[7], fives, sizeof(fives)) == 0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
xassert(c32casecmp(U"foobar", U"FOOBAR") == 0);
|
|||
|
|
xassert(c32casecmp(U"foo", U"FOOO") < 0);
|
|||
|
|
xassert(c32casecmp(U"FOOO", U"foo") > 0);
|
|||
|
|
xassert(c32casecmp(U"a", U"B") < 0);
|
|||
|
|
xassert(c32casecmp(U"B", U"a") > 0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
xassert(c32ncasecmp(U"foo", U"FOObar", 3) == 0);
|
|||
|
|
xassert(c32ncasecmp(U"foo", U"FOOO", 4) < 0);
|
|||
|
|
xassert(c32ncasecmp(U"FOOO", U"foo", 4) > 0);
|
|||
|
|
xassert(c32ncasecmp(U"a", U"BB", 1) < 0);
|
|||
|
|
xassert(c32ncasecmp(U"BB", U"a", 1) > 0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
char32_t dst[32] = U"foobar";
|
|||
|
|
char32_t *ret = c32ncat(dst, U"12345678XXXXXXXXX", 8);
|
|||
|
|
|
|||
|
|
xassert(ret == dst);
|
|||
|
|
xassert(c32cmp(dst, U"foobar12345678") == 0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
char32_t dst[32] = U"foobar";
|
|||
|
|
char32_t *ret = c32cat(dst, U"12345678");
|
|||
|
|
|
|||
|
|
xassert(ret == dst);
|
|||
|
|
xassert(c32cmp(dst, U"foobar12345678") == 0);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
char32_t *c = c32dup(U"foobar");
|
|||
|
|
xassert(c32cmp(c, U"foobar") == 0);
|
|||
|
|
free(c);
|
|||
|
|
|
|||
|
|
c = c32dup(U"");
|
|||
|
|
xassert(c32cmp(c, U"") == 0);
|
|||
|
|
free(c);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
size_t
|
|||
|
|
mbsntoc32(char32_t *dst, const char *src, size_t nms, size_t len)
|
|||
|
|
{
|
|||
|
|
mbstate_t ps = {0};
|
|||
|
|
|
|||
|
|
char32_t *out = dst;
|
|||
|
|
const char *in = src;
|
|||
|
|
|
|||
|
|
size_t consumed = 0;
|
|||
|
|
size_t chars = 0;
|
|||
|
|
size_t rc;
|
|||
|
|
|
|||
|
|
while ((out == NULL || chars < len) &&
|
|||
|
|
consumed < nms &&
|
|||
|
|
(rc = mbrtoc32(out, in, nms - consumed, &ps)) != 0)
|
|||
|
|
{
|
|||
|
|
switch (rc) {
|
|||
|
|
case 0:
|
|||
|
|
goto done;
|
|||
|
|
|
|||
|
|
case (size_t)-1:
|
|||
|
|
case (size_t)-2:
|
|||
|
|
case (size_t)-3:
|
|||
|
|
goto err;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
in += rc;
|
|||
|
|
consumed += rc;
|
|||
|
|
chars++;
|
|||
|
|
|
|||
|
|
if (out != NULL)
|
|||
|
|
out++;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
done:
|
|||
|
|
return chars;
|
|||
|
|
|
|||
|
|
err:
|
|||
|
|
return (char32_t)-1;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
const char input[] = "foobarzoo";
|
|||
|
|
char32_t c32[32];
|
|||
|
|
|
|||
|
|
size_t ret = mbsntoc32(NULL, input, sizeof(input), 0);
|
|||
|
|
xassert(ret == 9);
|
|||
|
|
|
|||
|
|
memset(c32, 0x55, sizeof(c32));
|
|||
|
|
ret = mbsntoc32(c32, input, sizeof(input), 32);
|
|||
|
|
|
|||
|
|
xassert(ret == 9);
|
|||
|
|
xassert(c32[0] == U'f');
|
|||
|
|
xassert(c32[1] == U'o');
|
|||
|
|
xassert(c32[2] == U'o');
|
|||
|
|
xassert(c32[3] == U'b');
|
|||
|
|
xassert(c32[4] == U'a');
|
|||
|
|
xassert(c32[5] == U'r');
|
|||
|
|
xassert(c32[6] == U'z');
|
|||
|
|
xassert(c32[7] == U'o');
|
|||
|
|
xassert(c32[8] == U'o');
|
|||
|
|
xassert(c32[9] == U'\0');
|
|||
|
|
xassert(c32[10] == 0x55555555);
|
|||
|
|
|
|||
|
|
memset(c32, 0x55, sizeof(c32));
|
|||
|
|
ret = mbsntoc32(c32, input, 1, 32);
|
|||
|
|
|
|||
|
|
xassert(ret == 1);
|
|||
|
|
xassert(c32[0] == U'f');
|
|||
|
|
xassert(c32[1] == 0x55555555);
|
|||
|
|
|
|||
|
|
memset(c32, 0x55, sizeof(c32));
|
|||
|
|
ret = mbsntoc32(c32, input, sizeof(input), 1);
|
|||
|
|
|
|||
|
|
xassert(ret == 1);
|
|||
|
|
xassert(c32[0] == U'f');
|
|||
|
|
xassert(c32[1] == 0x55555555);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
const char input[] = "foobarzoo";
|
|||
|
|
char32_t c32[32];
|
|||
|
|
|
|||
|
|
size_t ret = mbstoc32(NULL, input, 0);
|
|||
|
|
xassert(ret == 9);
|
|||
|
|
|
|||
|
|
memset(c32, 0x55, sizeof(c32));
|
|||
|
|
ret = mbstoc32(c32, input, 32);
|
|||
|
|
|
|||
|
|
xassert(ret == 9);
|
|||
|
|
xassert(c32[0] == U'f');
|
|||
|
|
xassert(c32[1] == U'o');
|
|||
|
|
xassert(c32[2] == U'o');
|
|||
|
|
xassert(c32[3] == U'b');
|
|||
|
|
xassert(c32[4] == U'a');
|
|||
|
|
xassert(c32[5] == U'r');
|
|||
|
|
xassert(c32[6] == U'z');
|
|||
|
|
xassert(c32[7] == U'o');
|
|||
|
|
xassert(c32[8] == U'o');
|
|||
|
|
xassert(c32[9] == U'\0');
|
|||
|
|
xassert(c32[10] == 0x55555555);
|
|||
|
|
|
|||
|
|
memset(c32, 0x55, sizeof(c32));
|
|||
|
|
ret = mbstoc32(c32, input, 1);
|
|||
|
|
|
|||
|
|
xassert(ret == 1);
|
|||
|
|
xassert(c32[0] == U'f');
|
|||
|
|
xassert(c32[1] == 0x55555555);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
char32_t *
|
|||
|
|
ambstoc32(const char *src)
|
|||
|
|
{
|
|||
|
|
if (src == NULL)
|
|||
|
|
return NULL;
|
|||
|
|
|
|||
|
|
const size_t src_len = strlen(src);
|
|||
|
|
|
|||
|
|
char32_t *ret = xmalloc((src_len + 1) * sizeof(ret[0]));
|
|||
|
|
mbstate_t ps = {0};
|
|||
|
|
|
|||
|
|
char32_t *out = ret;
|
|||
|
|
const char *in = src;
|
|||
|
|
const char *const end = src + src_len + 1;
|
|||
|
|
|
|||
|
|
size_t chars = 0;
|
|||
|
|
size_t rc;
|
|||
|
|
|
|||
|
|
while ((rc = mbrtoc32(out, in, end - in, &ps)) != 0) {
|
|||
|
|
switch (rc) {
|
|||
|
|
case (size_t)-1:
|
|||
|
|
case (size_t)-2:
|
|||
|
|
case (size_t)-3:
|
|||
|
|
goto err;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
in += rc;
|
|||
|
|
out++;
|
|||
|
|
chars++;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
*out = U'\0';
|
|||
|
|
|
|||
|
|
ret = xrealloc(ret, (chars + 1) * sizeof(ret[0]));
|
|||
|
|
return ret;
|
|||
|
|
|
|||
|
|
err:
|
|||
|
|
free(ret);
|
|||
|
|
return NULL;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
setlocale(LC_CTYPE, "en_US.UTF-8");
|
|||
|
|
|
|||
|
|
char32_t *hello = ambstoc32(u8"hello");
|
|||
|
|
xassert(hello != NULL);
|
|||
|
|
xassert(hello[0] == U'h');
|
|||
|
|
xassert(hello[1] == U'e');
|
|||
|
|
xassert(hello[2] == U'l');
|
|||
|
|
xassert(hello[3] == U'l');
|
|||
|
|
xassert(hello[4] == U'o');
|
|||
|
|
xassert(hello[5] == U'\0');
|
|||
|
|
free(hello);
|
|||
|
|
|
|||
|
|
char32_t *swedish = ambstoc32(u8"åäö");
|
|||
|
|
xassert(swedish != NULL);
|
|||
|
|
xassert(swedish[0] == U'å');
|
|||
|
|
xassert(swedish[1] == U'ä');
|
|||
|
|
xassert(swedish[2] == U'ö');
|
|||
|
|
xassert(swedish[3] == U'\0');
|
|||
|
|
free(swedish);
|
|||
|
|
|
|||
|
|
char32_t *emoji = ambstoc32(u8"👨👩👧👦");
|
|||
|
|
xassert(emoji != NULL);
|
|||
|
|
xassert(emoji[0] == U'👨');
|
|||
|
|
xassert(emoji[1] == U'');
|
|||
|
|
xassert(emoji[2] == U'👩');
|
|||
|
|
xassert(emoji[3] == U'');
|
|||
|
|
xassert(emoji[4] == U'👧');
|
|||
|
|
xassert(emoji[5] == U'');
|
|||
|
|
xassert(emoji[6] == U'👦');
|
|||
|
|
xassert(emoji[7] == U'\0');
|
|||
|
|
free(emoji);
|
|||
|
|
|
|||
|
|
xassert(ambstoc32(NULL) == NULL);
|
|||
|
|
setlocale(LC_CTYPE, "C");
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
char *
|
|||
|
|
ac32tombs(const char32_t *src)
|
|||
|
|
{
|
|||
|
|
if (src == NULL)
|
|||
|
|
return NULL;
|
|||
|
|
|
|||
|
|
const size_t src_len = c32len(src);
|
|||
|
|
|
|||
|
|
size_t allocated = src_len + 1;
|
|||
|
|
char *ret = xmalloc(allocated);
|
|||
|
|
mbstate_t ps = {0};
|
|||
|
|
|
|||
|
|
char *out = ret;
|
|||
|
|
const char32_t *const end = src + src_len + 1;
|
|||
|
|
|
|||
|
|
size_t bytes = 0;
|
|||
|
|
|
|||
|
|
char mb[MB_CUR_MAX];
|
|||
|
|
|
|||
|
|
for (const char32_t *in = src; in < end; in++) {
|
|||
|
|
size_t rc = c32rtomb(mb, *in, &ps);
|
|||
|
|
|
|||
|
|
switch (rc) {
|
|||
|
|
case (size_t)-1:
|
|||
|
|
goto err;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (bytes + rc > allocated) {
|
|||
|
|
allocated *= 2;
|
|||
|
|
ret = xrealloc(ret, allocated);
|
|||
|
|
out = &ret[bytes];
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for (size_t i = 0; i < rc; i++, out++)
|
|||
|
|
*out = mb[i];
|
|||
|
|
|
|||
|
|
bytes += rc;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
xassert(ret[bytes - 1] == '\0');
|
|||
|
|
ret = xrealloc(ret, bytes);
|
|||
|
|
return ret;
|
|||
|
|
|
|||
|
|
err:
|
|||
|
|
free(ret);
|
|||
|
|
return NULL;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
UNITTEST
|
|||
|
|
{
|
|||
|
|
setlocale(LC_CTYPE, "en_US.UTF-8");
|
|||
|
|
|
|||
|
|
char *s = ac32tombs(U"foobar");
|
|||
|
|
xassert(s != NULL);
|
|||
|
|
xassert(strcmp(s, "foobar") == 0);
|
|||
|
|
free(s);
|
|||
|
|
|
|||
|
|
s = ac32tombs(U"åäö");
|
|||
|
|
xassert(s != NULL);
|
|||
|
|
xassert(strcmp(s, u8"åäö") == 0);
|
|||
|
|
free(s);
|
|||
|
|
|
|||
|
|
s = ac32tombs(U"👨👩👧👦");
|
|||
|
|
xassert(s != NULL);
|
|||
|
|
xassert(strcmp(s, u8"👨👩👧👦") == 0);
|
|||
|
|
free(s);
|
|||
|
|
|
|||
|
|
xassert(ac32tombs(NULL) == NULL);
|
|||
|
|
setlocale(LC_CTYPE, "C");
|
|||
|
|
}
|