mirror of
https://codeberg.org/dnkl/foot.git
synced 2026-04-15 08:21:03 -04:00
wcwidth: provide our own implementation of wcwidth() and wcswidth()
This commit adds a new script, generate-wcwidth.py, that generates wcwidth tables from the bundled Unicode 14.0 data files: * DerivedGeneralCategory.txt * EastAsianWidth.txt. This commit also adds the functions my_wcwidth() and my_wcswidth(), which replace the system’s wcwidth() and wcswidth() and use the generated tables to map Unicode codepoints to widths. This is inspired by both XTerm’s wcwidth implementation and https://github.com/jquast/wcwidth. Both of those are based on/inspired by https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
This commit is contained in:
parent
99ebff5a51
commit
c758949145
7 changed files with 7123 additions and 0 deletions
19
meson.build
19
meson.build
|
|
@ -35,6 +35,14 @@ add_project_arguments(
|
||||||
language: 'c',
|
language: 'c',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if get_option('system-wcwidth')
|
||||||
|
wcwidth_method = 'system'
|
||||||
|
add_project_arguments('-DFOOT_SYSTEM_WCWIDTH=1', language: 'c')
|
||||||
|
else
|
||||||
|
wcwidth_method = 'builtin'
|
||||||
|
add_project_arguments('-DFOOT_SYSTEM_WCWIDTH=0', language: 'c')
|
||||||
|
endif
|
||||||
|
|
||||||
terminfo_install_location = get_option('custom-terminfo-install-location')
|
terminfo_install_location = get_option('custom-terminfo-install-location')
|
||||||
|
|
||||||
if terminfo_install_location != ''
|
if terminfo_install_location != ''
|
||||||
|
|
@ -136,6 +144,15 @@ version = custom_target(
|
||||||
output: 'version.h',
|
output: 'version.h',
|
||||||
command: [env, 'LC_ALL=C', generate_version_sh, meson.project_version(), '@CURRENT_SOURCE_DIR@', '@OUTPUT@'])
|
command: [env, 'LC_ALL=C', generate_version_sh, meson.project_version(), '@CURRENT_SOURCE_DIR@', '@OUTPUT@'])
|
||||||
|
|
||||||
|
python = find_program('python3', native: true)
|
||||||
|
generate_wcwidth_py = files('scripts/generate-wcwidth.py')
|
||||||
|
wcwidth_tables = custom_target(
|
||||||
|
'generate_wcwidth',
|
||||||
|
output: 'my-wcwidth-tables.h',
|
||||||
|
input: ['unicode/DerivedGeneralCategory.txt',
|
||||||
|
'unicode/EastAsianWidth.txt'],
|
||||||
|
command: [env, 'LC_ALL=C', python, generate_wcwidth_py, '@INPUT0@', '@INPUT1@', '@OUTPUT@'])
|
||||||
|
|
||||||
common = static_library(
|
common = static_library(
|
||||||
'common',
|
'common',
|
||||||
'log.c', 'log.h',
|
'log.c', 'log.h',
|
||||||
|
|
@ -149,6 +166,7 @@ misc = static_library(
|
||||||
'hsl.c', 'hsl.h',
|
'hsl.c', 'hsl.h',
|
||||||
'macros.h',
|
'macros.h',
|
||||||
'misc.c', 'misc.h',
|
'misc.c', 'misc.h',
|
||||||
|
'my-wcwidth.c', 'my-wcwidth.h', wcwidth_tables,
|
||||||
'uri.c', 'uri.h'
|
'uri.c', 'uri.h'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -280,6 +298,7 @@ summary(
|
||||||
'Themes': get_option('themes'),
|
'Themes': get_option('themes'),
|
||||||
'IME': get_option('ime'),
|
'IME': get_option('ime'),
|
||||||
'Grapheme clustering': utf8proc.found(),
|
'Grapheme clustering': utf8proc.found(),
|
||||||
|
'wcwidth()': wcwidth_method,
|
||||||
'Build terminfo': tic.found(),
|
'Build terminfo': tic.found(),
|
||||||
'Terminfo install location': terminfo_install_location,
|
'Terminfo install location': terminfo_install_location,
|
||||||
'Default TERM': get_option('default-terminfo'),
|
'Default TERM': get_option('default-terminfo'),
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,9 @@ option('ime', type: 'boolean', value: true,
|
||||||
option('grapheme-clustering', type: 'feature',
|
option('grapheme-clustering', type: 'feature',
|
||||||
description: 'Enables grapheme clustering using libutf8proc. Requires fcft with harfbuzz support to be useful.')
|
description: 'Enables grapheme clustering using libutf8proc. Requires fcft with harfbuzz support to be useful.')
|
||||||
|
|
||||||
|
# Build-time switch between the libc wcwidth() and foot's builtin tables.
option('system-wcwidth', type: 'boolean', value: false,
       description: 'Use the system’s (e.g. glibc or musl) wcwidth(). Foot’s builtin is usually more up-to-date with the latest Unicode specification, but using the system’s version reduces the size of the foot binary.')
|
||||||
|
|
||||||
option('terminfo', type: 'feature', value: 'enabled', description: 'Build and install foot\'s terminfo files.')
|
option('terminfo', type: 'feature', value: 'enabled', description: 'Build and install foot\'s terminfo files.')
|
||||||
option('default-terminfo', type: 'string', value: 'foot',
|
option('default-terminfo', type: 'string', value: 'foot',
|
||||||
description: 'Default value of the "term" option in foot.ini.')
|
description: 'Default value of the "term" option in foot.ini.')
|
||||||
|
|
|
||||||
136
my-wcwidth.c
Normal file
136
my-wcwidth.c
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
#include "my-wcwidth.h"
|
||||||
|
|
||||||
|
#if FOOT_SYSTEM_WCWIDTH == 0
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#define LOG_MODULE "wcwidth"
|
||||||
|
#define LOG_ENABLE_DBG 0
|
||||||
|
#include "log.h"
|
||||||
|
#include "debug.h"
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
#include "my-wcwidth-tables.h"
|
||||||
|
|
||||||
|
UNITTEST
|
||||||
|
{
|
||||||
|
uint32_t last_stop;
|
||||||
|
|
||||||
|
last_stop = ucs_invalid[0].stop;
|
||||||
|
xassert(last_stop >= ucs_invalid[0].start);
|
||||||
|
|
||||||
|
for (size_t i = 1; i < ALEN(ucs_invalid); i++) {
|
||||||
|
uint32_t start = ucs_invalid[i].start;
|
||||||
|
uint32_t stop = ucs_invalid[i].stop;
|
||||||
|
|
||||||
|
xassert(stop >= start);
|
||||||
|
xassert(start > last_stop);
|
||||||
|
|
||||||
|
last_stop = stop;
|
||||||
|
}
|
||||||
|
|
||||||
|
last_stop = ucs_zero_width[0].stop;
|
||||||
|
xassert(last_stop >= ucs_zero_width[0].start);
|
||||||
|
|
||||||
|
for (size_t i = 1; i < ALEN(ucs_zero_width); i++) {
|
||||||
|
uint32_t start = ucs_zero_width[i].start;
|
||||||
|
uint32_t stop = ucs_zero_width[i].stop;
|
||||||
|
|
||||||
|
xassert(stop >= start);
|
||||||
|
xassert(start > last_stop);
|
||||||
|
|
||||||
|
last_stop = stop;
|
||||||
|
}
|
||||||
|
|
||||||
|
last_stop = ucs_double_width[0].stop;
|
||||||
|
xassert(last_stop >= ucs_double_width[0].start);
|
||||||
|
|
||||||
|
for (size_t i = 1; i < ALEN(ucs_double_width); i++) {
|
||||||
|
uint32_t start = ucs_double_width[i].start;
|
||||||
|
uint32_t stop = ucs_double_width[i].stop;
|
||||||
|
|
||||||
|
xassert(stop >= start);
|
||||||
|
xassert(start > last_stop);
|
||||||
|
|
||||||
|
last_stop = stop;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
ucs_compar(const void *_key, const void *_range)
|
||||||
|
{
|
||||||
|
uint32_t key = (uintptr_t)_key;
|
||||||
|
const struct ucs_range *range = _range;
|
||||||
|
|
||||||
|
if (key < range->start)
|
||||||
|
return -1;
|
||||||
|
else if (key > range->stop)
|
||||||
|
return 1;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
IGNORE_WARNING("-Wpedantic")
|
||||||
|
|
||||||
|
int
|
||||||
|
my_wcwidth(wchar_t wc)
|
||||||
|
{
|
||||||
|
#define lookup(table) \
|
||||||
|
wc >= table[0].start && \
|
||||||
|
wc <= table[ALEN(table) - 1].stop && \
|
||||||
|
bsearch(key, table, ALEN(table), sizeof(table[0]), &ucs_compar) != NULL
|
||||||
|
|
||||||
|
if (unlikely(wc == 0))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
else if (unlikely(wc < 32 || (wc >= 0x7f && wc < 0xa0))) /* C0/C1/DEL */
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
else if (unlikely(wc == 0xad)) { /* SOFT HYPHEN */
|
||||||
|
/* TODO: return 0 instead? */
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
else {
|
||||||
|
const void *key = (const void *)(uintptr_t)wc;
|
||||||
|
|
||||||
|
if (unlikely(lookup(ucs_double_width)))
|
||||||
|
return 2;
|
||||||
|
|
||||||
|
if (unlikely(lookup(ucs_zero_width)))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (unlikely(lookup(ucs_invalid)))
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
#undef lookup
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
UNITTEST
|
||||||
|
{
|
||||||
|
xassert(my_wcwidth(L'a') == 1);
|
||||||
|
xassert(my_wcwidth(L'🥲') == 2);
|
||||||
|
xassert(my_wcwidth(L'') == 1); /* SOFT HYPHEN */
|
||||||
|
}
|
||||||
|
|
||||||
|
UNIGNORE_WARNINGS
|
||||||
|
|
||||||
|
/*
 * Returns the total number of terminal columns occupied by the wide
 * string s, looking at no more than n characters and stopping early
 * at a terminating NUL.
 *
 * Returns -1 as soon as my_wcwidth() reports -1 for any examined
 * character.
 */
int
my_wcswidth(const wchar_t *s, size_t n)
{
    int width = 0;

    /*
     * Check the remaining count *before* dereferencing s. The buffer
     * is only guaranteed to hold n characters, and need not be
     * NUL-terminated within them.
     */
    for (; n > 0 && *s != L'\0'; s++, n--) {
        int w = my_wcwidth(*s);
        if (w < 0)
            return -1;

        width += w;
    }

    return width;
}
|
||||||
|
|
||||||
|
#endif /* FOOT_SYSTEM_WCWIDTH == 0 */
|
||||||
14
my-wcwidth.h
Normal file
14
my-wcwidth.h
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
#pragma once

#include <wchar.h>

/*
 * Character width lookup.
 *
 * FOOT_SYSTEM_WCWIDTH is defined on the compiler command line by
 * meson.build, from the 'system-wcwidth' build option. It selects
 * which implementation backs my_wcwidth() and my_wcswidth().
 */
#if FOOT_SYSTEM_WCWIDTH == 0

/* Builtin, table based implementation; see my-wcwidth.c */
int my_wcwidth(wchar_t wc);
int my_wcswidth(const wchar_t *s, size_t n);

#else

/* Forward directly to the C library */
static inline int my_wcwidth(wchar_t wc) { return wcwidth(wc); }
static inline int my_wcswidth(const wchar_t *s, size_t n) { return wcswidth(s, n); }

#endif
|
||||||
172
scripts/generate-wcwidth.py
Executable file
172
scripts/generate-wcwidth.py
Executable file
|
|
@ -0,0 +1,172 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def _write_table(output, name: str, ranges: List[Tuple[int, int]]) -> None:
    """Write *ranges* to *output* as a C ``struct ucs_range`` array named *name*.

    Entries are laid out three per line.
    """
    output.write(f'static const struct ucs_range {name}[] = {{\n')

    for i, (start, stop) in enumerate(ranges):
        if i % 3 == 0:
            output.write(' ')
        output.write(f'{{0x{start:05x}, 0x{stop:05x}}}')
        if i + 1 < len(ranges):
            output.write(',')
        if i % 3 == 2:
            output.write('\n')
        else:
            output.write(' ')

    # Terminate a final, partially filled line
    if len(ranges) % 3 != 0:
        output.write('\n')

    output.write('};\n')


def main():
    """Generate the C header with the wcwidth lookup tables.

    Command line arguments: path to DerivedGeneralCategory.txt, path to
    EastAsianWidth.txt, and path of the header file to write.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('derived', type=argparse.FileType('r'), help='path to DerivedGeneralCategory.txt')
    parser.add_argument('east_asian', type=argparse.FileType('r'), help='path to EastAsianWidth.txt')
    parser.add_argument('output', type=argparse.FileType('w'), help='output, C header file')

    opts = parser.parse_args()

    invalid, zero_width = parse_derived(opts.derived)
    double_width = parse_east_asian(opts.east_asian)

    tables = (
        ('ucs_invalid', invalid),
        ('ucs_zero_width', zero_width),
        ('ucs_double_width', double_width),
    )

    # An empty table would produce an empty C initializer, and the C
    # side indexes the first and last entry of every table. Fail here,
    # with a message that points at the input, instead.
    for name, ranges in tables:
        if not ranges:
            parser.error(f'no codepoint ranges found for {name}; wrong or empty input file?')

    output = opts.output
    output.write('#pragma once\n')
    output.write('#include <stdint.h>\n')
    output.write('\n')
    output.write('struct ucs_range {\n')
    output.write(' uint32_t start;\n')
    output.write(' uint32_t stop;\n')
    output.write('};\n')
    output.write('\n')

    for idx, (name, ranges) in enumerate(tables):
        # Blank line between tables, none after the last one
        if idx > 0:
            output.write('\n')
        _write_table(output, name, ranges)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_derived(f) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
    """Parse DerivedGeneralCategory.txt.

    *f* is an iterable of lines (e.g. an open text file). Blank lines and
    lines starting with ``#`` are ignored; every other line must have the
    form ``XXXX ; Category`` or ``XXXX..YYYY ; Category``, optionally
    followed by a ``#`` comment.

    Returns an ``(invalid, zero_width)`` tuple. Both are sorted lists of
    inclusive ``(start, stop)`` codepoint ranges, with directly adjacent
    ranges merged. ``invalid`` holds category ``Cn``; ``zero_width`` holds
    categories ``Me``, ``Mn`` and ``Cf``. A list is empty when no line
    matched its categories.
    """

    def merge_adjacent(ranges):
        """Sort *ranges* and join those where one starts right after another ends."""
        merged = []
        for start, stop in sorted(ranges):
            if merged and merged[-1][1] + 1 == start:
                merged[-1] = (merged[-1][0], stop)
            else:
                merged.append((start, stop))
        return merged

    zero_width = []
    invalid = []

    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        ucs, details = line.split(';', maxsplit=1)

        # Either a single codepoint, or a "first..last" range
        first, _, last = ucs.strip().partition('..')
        start = int(first, 16)
        stop = int(last or first, 16)

        # Drop the trailing "# ..." comment, leaving the category only
        category = details.split('#', maxsplit=1)[0].strip()

        # Me: Mark, enclosing, Mn: Mark, nonspacing, Cf: Other, format
        if category in ('Me', 'Mn', 'Cf'):
            zero_width.append((start, stop))

        # Cn: unassigned
        elif category == 'Cn':
            invalid.append((start, stop))

    return merge_adjacent(invalid), merge_adjacent(zero_width)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_east_asian(f) -> List[Tuple[int, int]]:
    """Parse EastAsianWidth.txt and return the double-width codepoint ranges.

    *f* is an iterable of lines (e.g. an open text file). Blank lines and
    lines starting with ``#`` are ignored; every other line must have the
    form ``XXXX;Width`` or ``XXXX..YYYY;Width``, optionally followed by a
    ``#`` comment.

    Returns a sorted list of inclusive ``(start, stop)`` tuples for the
    width classes ``W`` (wide) and ``F`` (fullwidth), with directly
    adjacent ranges merged. The list is empty when nothing matched.
    """

    ranges = []
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        ucs, details = line.split(';', maxsplit=1)

        # Either a single codepoint, or a "first..last" range
        first, _, last = ucs.strip().partition('..')
        start = int(first, 16)
        stop = int(last or first, 16)

        # Drop the trailing "# ..." comment, leaving the width class only
        width_class = details.split('#', maxsplit=1)[0].strip()

        # Compare against the exact class names. A substring test against
        # 'WF' would also accept an empty class.
        if width_class in ('W', 'F'):
            ranges.append((start, stop))

    # Merge consecutive ranges
    merged = []
    for start, stop in sorted(ranges):
        if merged and merged[-1][1] + 1 == start:
            merged[-1] = (merged[-1][0], stop)
        else:
            merged.append((start, stop))

    return merged
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main())
|
||||||
4192
unicode/DerivedGeneralCategory.txt
Normal file
4192
unicode/DerivedGeneralCategory.txt
Normal file
File diff suppressed because it is too large
Load diff
2587
unicode/EastAsianWidth.txt
Normal file
2587
unicode/EastAsianWidth.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue