wcwidth: provide our own implementation of wcwidth() and wcswidth()

This commit adds a new script, generate-wcwidth.py, that generates
wcwidth tables from the bundled files (Unicode 14.0):

* DerivedGeneralCategory.txt
* EastAsianWidth.txt

This commit also adds the functions my_wcwidth() and my_wcswidth()
that replace the system’s wcwidth()+wcswidth(), and use the
generated tables to map Unicode codepoints to widths.

This is inspired by both XTerm’s wcwidth implementation, as well as
https://github.com/jquast/wcwidth.

Both of those are based on/inspired by
https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
This commit is contained in:
Daniel Eklöf 2022-01-05 21:10:21 +01:00
parent 99ebff5a51
commit c758949145
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
7 changed files with 7123 additions and 0 deletions

View file

@ -35,6 +35,14 @@ add_project_arguments(
language: 'c',
)
if get_option('system-wcwidth')
wcwidth_method = 'system'
add_project_arguments('-DFOOT_SYSTEM_WCWIDTH=1', language: 'c')
else
wcwidth_method = 'builtin'
add_project_arguments('-DFOOT_SYSTEM_WCWIDTH=0', language: 'c')
endif
terminfo_install_location = get_option('custom-terminfo-install-location')
if terminfo_install_location != ''
@ -136,6 +144,15 @@ version = custom_target(
output: 'version.h',
command: [env, 'LC_ALL=C', generate_version_sh, meson.project_version(), '@CURRENT_SOURCE_DIR@', '@OUTPUT@'])
python = find_program('python3', native: true)
generate_wcwidth_py = files('scripts/generate-wcwidth.py')
wcwidth_tables = custom_target(
'generate_wcwidth',
output: 'my-wcwidth-tables.h',
input: ['unicode/DerivedGeneralCategory.txt',
'unicode/EastAsianWidth.txt'],
command: [env, 'LC_ALL=C', python, generate_wcwidth_py, '@INPUT0@', '@INPUT1@', '@OUTPUT@'])
common = static_library(
'common',
'log.c', 'log.h',
@ -149,6 +166,7 @@ misc = static_library(
'hsl.c', 'hsl.h',
'macros.h',
'misc.c', 'misc.h',
'my-wcwidth.c', 'my-wcwidth.h', wcwidth_tables,
'uri.c', 'uri.h'
)
@ -280,6 +298,7 @@ summary(
'Themes': get_option('themes'),
'IME': get_option('ime'),
'Grapheme clustering': utf8proc.found(),
'wcwidth()': wcwidth_method,
'Build terminfo': tic.found(),
'Terminfo install location': terminfo_install_location,
'Default TERM': get_option('default-terminfo'),

View file

@ -10,6 +10,9 @@ option('ime', type: 'boolean', value: true,
option('grapheme-clustering', type: 'feature',
description: 'Enables grapheme clustering using libutf8proc. Requires fcft with harfbuzz support to be useful.')
option('system-wcwidth', type: 'boolean', value: false,
description: 'Use the wcwidth() from the system libc (e.g. glibc or musl). The builtin wcwidth() is usually more up-to-date with the latest Unicode specification, but using the system version reduces the size of the foot binary.')
option('terminfo', type: 'feature', value: 'enabled', description: 'Build and install foot\'s terminfo files.')
option('default-terminfo', type: 'string', value: 'foot',
description: 'Default value of the "term" option in foot.ini.')

136
my-wcwidth.c Normal file
View file

@ -0,0 +1,136 @@
#include "my-wcwidth.h"
#if FOOT_SYSTEM_WCWIDTH == 0
#include <stdlib.h>
#define LOG_MODULE "wcwidth"
#define LOG_ENABLE_DBG 0
#include "log.h"
#include "debug.h"
#include "util.h"
#include "my-wcwidth-tables.h"
/*
 * Sanity-check the generated tables (runs in UNITTEST builds only).
 *
 * For each table we verify that every range is non-empty
 * (stop >= start) and that consecutive ranges are strictly
 * increasing and non-overlapping (start > previous stop). Both
 * properties are required for the bsearch() lookup in my_wcwidth()
 * to be correct.
 */
UNITTEST
{
    uint32_t last_stop;

    /* ucs_invalid: sorted, non-empty, non-overlapping ranges */
    last_stop = ucs_invalid[0].stop;
    xassert(last_stop >= ucs_invalid[0].start);
    for (size_t i = 1; i < ALEN(ucs_invalid); i++) {
        uint32_t start = ucs_invalid[i].start;
        uint32_t stop = ucs_invalid[i].stop;
        xassert(stop >= start);      /* range is non-empty */
        xassert(start > last_stop);  /* no overlap with previous range */
        last_stop = stop;
    }

    /* ucs_zero_width: same invariants */
    last_stop = ucs_zero_width[0].stop;
    xassert(last_stop >= ucs_zero_width[0].start);
    for (size_t i = 1; i < ALEN(ucs_zero_width); i++) {
        uint32_t start = ucs_zero_width[i].start;
        uint32_t stop = ucs_zero_width[i].stop;
        xassert(stop >= start);
        xassert(start > last_stop);
        last_stop = stop;
    }

    /* ucs_double_width: same invariants */
    last_stop = ucs_double_width[0].stop;
    xassert(last_stop >= ucs_double_width[0].start);
    for (size_t i = 1; i < ALEN(ucs_double_width); i++) {
        uint32_t start = ucs_double_width[i].start;
        uint32_t stop = ucs_double_width[i].stop;
        xassert(stop >= start);
        xassert(start > last_stop);
        last_stop = stop;
    }
}
/*
 * bsearch() comparator for the generated range tables.
 *
 * The codepoint being looked up is smuggled in through the key
 * pointer itself (cast to/from uintptr_t); each table element is a
 * struct ucs_range. Returns 0 when the codepoint falls inside the
 * range, negative/positive when it lies below/above it.
 */
static int
ucs_compar(const void *_key, const void *_range)
{
    const struct ucs_range *range = _range;
    const uint32_t key = (uintptr_t)_key;

    if (key < range->start)
        return -1;
    return key > range->stop ? 1 : 0;
}
IGNORE_WARNING("-Wpedantic")

/*
 * Builtin replacement for wcwidth(3).
 *
 * Returns the number of terminal columns needed for 'wc': 0 for NUL
 * and zero-width characters, 2 for double-width characters, -1 for
 * non-printable (C0/C1 controls, DEL) and unassigned codepoints, and
 * 1 for everything else. Lookups are binary searches in the
 * generated, sorted range tables.
 */
int
my_wcwidth(wchar_t wc)
{
    /* NUL occupies no columns */
    if (unlikely(wc == 0))
        return 0;

    /* C0/C1 control characters and DEL are non-printable */
    if (unlikely(wc < 32 || (wc >= 0x7f && wc < 0xa0)))
        return -1;

    /* SOFT HYPHEN - treated as a normal, visible character.
     * TODO: return 0 instead? */
    if (unlikely(wc == 0xad))
        return 1;

    /* Codepoint packed into the key pointer; see ucs_compar() */
    const void *key = (const void *)(uintptr_t)wc;

    /* True iff 'wc' falls inside one of 'table's ranges. The two
     * endpoint comparisons short-circuit the common case where 'wc'
     * lies entirely outside the table. */
#define IN_TABLE(table)                                                 \
    (wc >= table[0].start &&                                            \
     wc <= table[ALEN(table) - 1].stop &&                               \
     bsearch(key, table, ALEN(table), sizeof(table[0]), &ucs_compar) != NULL)

    if (unlikely(IN_TABLE(ucs_double_width)))
        return 2;
    if (unlikely(IN_TABLE(ucs_zero_width)))
        return 0;
    if (unlikely(IN_TABLE(ucs_invalid)))
        return -1;
#undef IN_TABLE

    /* Anything not in a table is a regular, single-width character */
    return 1;
}
/* Spot-check a few known widths (runs in UNITTEST builds only). */
UNITTEST
{
    xassert(my_wcwidth(L'a') == 1);   /* plain ASCII: single width */
    xassert(my_wcwidth(L'🥲') == 2);  /* emoji: double width */
    xassert(my_wcwidth(L'­') == 1);   /* SOFT HYPHEN (U+00AD): deliberately width 1, see my_wcwidth() */
}
UNIGNORE_WARNINGS
/*
 * Builtin replacement for wcswidth(3).
 *
 * Sums my_wcwidth() over at most 'n' characters of 's', stopping
 * early at the terminating L'\0'. Returns the total column width,
 * or -1 as soon as any character is non-printable.
 */
int
my_wcswidth(const wchar_t *s, size_t n)
{
    int total = 0;

    while (n > 0 && *s != L'\0') {
        const int w = my_wcwidth(*s);
        if (w < 0)
            return -1;

        total += w;
        s++;
        n--;
    }

    return total;
}
#endif /* FOOT_SYSTEM_WCWIDTH == 0 */

14
my-wcwidth.h Normal file
View file

@ -0,0 +1,14 @@
#pragma once
#include <wchar.h>
#if FOOT_SYSTEM_WCWIDTH == 0
/* Builtin implementations, backed by tables generated from the
 * bundled Unicode data files (see my-wcwidth.c). */
int my_wcwidth(wchar_t wc);
int my_wcswidth(const wchar_t *s, size_t n);
#else
/* Thin pass-throughs to the system libc implementation. */
static inline int my_wcwidth(wchar_t wc) { return wcwidth(wc); }
static inline int my_wcswidth(const wchar_t *s, size_t n) { return wcswidth(s, n); }
#endif

172
scripts/generate-wcwidth.py Executable file
View file

@ -0,0 +1,172 @@
#!/usr/bin/env python3
import argparse
import sys
from typing import List, Tuple
def _write_table(output, name: str, ranges: List[Tuple[int, int]]) -> None:
    """Emit one 'static const struct ucs_range <name>[]' C array.

    Entries are written three per row, comma-separated, matching the
    layout of the generated header.
    """
    output.write(f'static const struct ucs_range {name}[] = {{\n')
    for i, (start, stop) in enumerate(ranges):
        if i % 3 == 0:
            output.write(' ')
        output.write(f'{{0x{start:05x}, 0x{stop:05x}}}')
        if i + 1 < len(ranges):
            output.write(',')
        if i % 3 == 2:
            output.write('\n')
        else:
            output.write(' ')
    if len(ranges) % 3 != 0:
        output.write('\n')
    output.write('};\n')


def main():
    """Parse the bundled Unicode data files and write the C header
    containing the invalid, zero-width and double-width range tables.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('derived', type=argparse.FileType('r'), help='path to DerivedGeneralCategory.txt')
    parser.add_argument('east_asian', type=argparse.FileType('r'), help='path to EastAsianWidth.txt')
    parser.add_argument('output', type=argparse.FileType('w'), help='output, C header file')
    opts = parser.parse_args()

    invalid, zero_width = parse_derived(opts.derived)
    double_width = parse_east_asian(opts.east_asian)

    output = opts.output
    output.write('#pragma once\n')
    output.write('#include <stdint.h>\n')
    output.write('\n')
    output.write('struct ucs_range {\n')
    output.write(' uint32_t start;\n')
    output.write(' uint32_t stop;\n')
    output.write('};\n')
    output.write('\n')
    _write_table(output, 'ucs_invalid', invalid)
    output.write('\n')
    _write_table(output, 'ucs_zero_width', zero_width)
    output.write('\n')
    _write_table(output, 'ucs_double_width', double_width)
def _merge_ranges(ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge adjacent ranges in a sorted list: (a, b), (b+1, c) -> (a, c)."""
    merged: List[Tuple[int, int]] = []
    for start, stop in ranges:
        if merged and merged[-1][1] + 1 == start:
            merged[-1] = (merged[-1][0], stop)
        else:
            merged.append((start, stop))
    return merged


def parse_derived(f) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
    """Parse DerivedGeneralCategory.txt.

    Returns a tuple (invalid, zero_width) where each element is a
    sorted list of merged (start, stop) codepoint ranges:

    * invalid: unassigned codepoints (general category Cn)
    * zero_width: categories Me, Mn and Cf
    """
    zero_width = []
    invalid = []

    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        ucs, details = line.split(';', maxsplit=1)
        ucs, details = ucs.rstrip(), details.lstrip()

        # The first field is either a single codepoint, or a
        # 'XXXX..YYYY' range, in hex.
        if '..' in ucs:
            start, stop = ucs.split('..')
        else:
            start, stop = ucs, ucs
        start = int(start, 16)
        stop = int(stop, 16)

        # Strip the trailing '# ...' comment from the category field
        details = details.split('#', maxsplit=1)[0].strip()

        # Me: Mark, enclosing; Mn: Mark, nonspacing; Cf: Other, format
        if details in ('Me', 'Mn', 'Cf'):
            zero_width.append((start, stop))

        # Cn: Other, not assigned
        if details == 'Cn':
            invalid.append((start, stop))

    return _merge_ranges(sorted(invalid)), _merge_ranges(sorted(zero_width))
def parse_east_asian(f) -> List[Tuple[int, int]]:
    """Parse EastAsianWidth.txt.

    Returns a sorted list of merged (start, stop) tuples covering the
    double-width codepoints: East Asian Width 'W' (wide) and 'F'
    (fullwidth).
    """
    ranges = []
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        ucs, details = line.split(';', maxsplit=1)
        ucs, details = ucs.rstrip(), details.lstrip()

        # The first field is either a single codepoint, or a
        # 'XXXX..YYYY' range, in hex.
        if '..' in ucs:
            start, stop = ucs.split('..')
        else:
            start, stop = ucs, ucs
        start = int(start, 16)
        stop = int(stop, 16)

        # Strip the trailing '# ...' comment from the width field.
        # Compare against the two wanted classes explicitly: the old
        # substring test ("details in 'WF'") also matched '' and 'WF'.
        details = details.split('#', maxsplit=1)[0].strip()
        if details in ('W', 'F'):
            ranges.append((start, stop))

    ranges.sort()

    # Merge adjacent ranges: (a, b), (b+1, c) -> (a, c)
    merged: List[Tuple[int, int]] = []
    for start, stop in ranges:
        if merged and merged[-1][1] + 1 == start:
            merged[-1] = (merged[-1][0], stop)
        else:
            merged.append((start, stop))
    return merged
if __name__ == '__main__':
    # main() returns None on success; sys.exit(None) exits with status 0.
    sys.exit(main())

File diff suppressed because it is too large Load diff

2587
unicode/EastAsianWidth.txt Normal file

File diff suppressed because it is too large Load diff