unicode-combine: remove utf8proc dependency

We only used utf8proc to try to pre-compose a glyph from a base and
combining character.

We can do this ourselves by using a pre-compiled table of valid
pre-compositions. This table isn't _that_ big, and binary searching it
is fast.

That is, for a very small amount of code, and not too much extra RO
data, we can get rid of the utf8proc dependency.
This commit is contained in:
Daniel Eklöf 2020-05-02 17:29:00 +02:00
parent 8389c76549
commit d945b68b73
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
13 changed files with 34934 additions and 40 deletions

View file

@ -22,12 +22,8 @@
* Right mouse button extends the current selection.
* `CSI Ps ; Ps ; Ps t` escape sequences for the following parameters:
`11t`, `13t`, `13;2t`, `14t`, `14;2t`, `15t`, `19t`.
* Unicode combining characters. This feature is optional. By default,
it is enabled if
[utf8proc](https://github.com/JuliaStrings/utf8proc) is available,
but can be explicitly disabled or enabled at compile time with
`meson -Dunicode-combining=disabled|enabled`.
* Unicode combining characters. This feature is compile time
optional. See [README.md](README.md#unicode-combining) for details.
### Changed

View file

@ -16,6 +16,7 @@ The fast, lightweight and minimalistic Wayland terminal emulator.
1. [Backspace](#backspace)
1. [DPI and font size](#dpi-and-font-size)
1. [Supported OSCs](#supported-oscs)
1. [Unicode combining](#unicode-combining)
1. [Requirements](#requirements)
1. [Running](#running)
1. [Building](#building)
@ -268,6 +269,33 @@ with the terminal emulator itself. Foot implements the following OSCs:
* `OSC 555` - flash screen (**foot specific**)
## Unicode combining
In order to handle combining characters, foot must store additional
data for each cell. By default, foot stores at most 2 combining
characters per cell. This adds 9 bytes of additional space to each
cell (that's 75% more space than without combining characters).
You can configure the maximum number of characters to store for each
cell at **compile time** with
`-Dunicode-max-combining-chars=<int>`. Setting this to `0`
**disables** unicode combining completely - **no** additional data is
stored.
Furthermore, in order to improve rendering of combining characters,
foot will by default try to convert base + combining characters to a
pre-composed character.
This will typically look better, since we can now render a single
glyph, the way the font designer intended it to be rendered. When
pre-composing fails, foot will fall back to storing the combining
character(s) separate from the base character, and will render the
final grapheme by rendering the base and combining glyphs separately.
You can disable pre-composing at **compile time** with
`-Dunicode-precompose=false`.
## Requirements
### Running

33797
UnicodeData.txt Normal file

File diff suppressed because it is too large Load diff

8
grid.c
View file

@ -34,14 +34,14 @@ grid_row_alloc(int cols, bool initialize)
if (initialize) {
row->cells = calloc(cols, sizeof(row->cells[0]));
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
row->comb_chars = calloc(cols, sizeof(row->comb_chars[0]));
#endif
for (size_t c = 0; c < cols; c++)
row->cells[c].attrs.clean = 1;
} else {
row->cells = malloc(cols * sizeof(row->cells[0]));
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
row->comb_chars = malloc(cols * sizeof(row->comb_chars[0]));
#endif
}
@ -55,7 +55,7 @@ grid_row_free(struct row *row)
if (row == NULL)
return;
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
free(row->comb_chars);
#endif
free(row->cells);
@ -214,7 +214,7 @@ grid_reflow(struct grid *grid, int new_rows, int new_cols,
new_row->cells[new_col_idx] = *old_cell;
new_row->cells[new_col_idx].attrs.clean = 1;
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
struct combining_chars *old_comb_chars
= &old_row->comb_chars[c - empty_count + i];
struct combining_chars *new_comb_chars

View file

@ -57,8 +57,10 @@ wayland_client = dependency('wayland-client')
wayland_cursor = dependency('wayland-cursor')
xkb = dependency('xkbcommon')
utf8proc = dependency('libutf8proc', required: get_option('unicode-combining'))
add_project_arguments('-DFOOT_UNICODE_COMBINING=@0@'.format(utf8proc.found()), language: 'c')
add_project_arguments('-DFOOT_UNICODE_MAX_COMBINING_CHARS=@0@'.format(
get_option('unicode-max-combining-chars')), language: 'c')
add_project_arguments('-DFOOT_UNICODE_PRECOMPOSE=@0@'.format(
get_option('unicode-precompose')), language: 'c')
tllist = dependency('tllist', version: '>=1.0.1', fallback: 'tllist')
fcft = dependency('fcft', version: ['>=2.0.0', '<2.1.0'], fallback: 'fcft')
@ -128,8 +130,7 @@ executable(
'vt.c', 'vt.h',
'wayland.c', 'wayland.h',
wl_proto_src + wl_proto_headers, version,
dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, utf8proc,
tllist, fcft],
dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, tllist, fcft],
install: true)
executable(
@ -154,7 +155,8 @@ subdir('doc')
summary(
{
'Unicode combining': utf8proc.found(),
'Unicode max combining chars': get_option('unicode-max-combining-chars'),
'Unicode precompose': get_option('unicode-precompose'),
},
bool_yn: true
)

View file

@ -1 +1,5 @@
option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining')
option('unicode-max-combining-chars', type: 'integer', value: 2,
description: 'Maximum number of combining characters to track per cell. A value of 0 completely disables unicode combining (this reduces the runtime memory footprint)')
# Pre-composition only applies when combining characters are tracked at all,
# i.e. when 'unicode-max-combining-chars' is greater than 0.
option('unicode-precompose', type: 'boolean', value: true,
description: 'Convert decomposed characters to precomposed. Ignored if unicode combining has been disabled ("unicode-max-combining-chars" set to 0)')

View file

@ -442,7 +442,7 @@ render_cell(struct terminal *term, pixman_image_t *pix,
}
}
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
/* Combining characters */
const struct combining_chars *comb_chars = &row->comb_chars[col];
for (size_t i = 0; i < comb_chars->count; i++) {

View file

@ -0,0 +1,8 @@
#!/bin/sh
#
# Reads ';'-separated records on stdin and writes C initializer rows of the
# form "{ 0xCOMPOSED, 0xBASE, 0xCOMB}," on stdout.
# NOTE(review): input is presumably UnicodeData.txt - confirm against the
# rule/command that invokes this script.
#
# Pipeline stages, in order:
#   1. cut:  keep fields 1 and 6 of every record.
#   2. grep: keep records where field 6 starts with [0-9,A-F] and where the
#            line contains a space (i.e. field 6 holds more than one value).
#   3. sed:  rewrite "A;B C" into "{ 0xA, 0xB, 0xC},".
#   4. sed:  zero-pad 4-digit hex values to 5 digits, so that the textual
#            sort below orders 4- and 5-digit values consistently.
#   5. sort: order rows by the third whitespace-separated field onwards
#            (base, then combining character). "sort +2" is the obsolete
#            spelling, tried only if "sort -k 3" fails.
#   6. sed:  strip the zero-padding added in step 4.
cut - -d ";" -f 1,6 |
grep ";[0-9,A-F]" | grep " " |
sed -e "s/ /, 0x/;s/^/{ 0x/;s/;/, 0x/;s/$/},/" |
sed -e "s,0x\(....\)\([^0-9A-Fa-f]\),0x0\1\2,g" |
(sort -k 3 || sort +2) |
sed -e "s,0x0\(...[0-9A-Fa-f]\),0x\1,g"

View file

@ -143,7 +143,7 @@ min_bufsize_for_extraction(const struct terminal *term)
const struct coord *start = &term->selection.start;
const struct coord *end = &term->selection.end;
const size_t chars_per_cell =
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
1 + ALEN(term->grid->cur_row->comb_chars[0].chars);
#else
1;
@ -241,7 +241,7 @@ extract_one(struct terminal *term, struct row *row, struct cell *cell,
assert(ctx->idx + 1 <= ctx->size);
ctx->buf[ctx->idx++] = cell->wc;
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
const struct combining_chars *comb_chars = &row->comb_chars[col];
assert(cell->wc != 0);

View file

@ -2295,7 +2295,7 @@ term_print(struct terminal *term, wchar_t wc, int width)
cell->wc = term->vt.last_printed = wc;
cell->attrs = term->vt.attrs;
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
row->comb_chars[term->grid->cursor.point.col].count = 0;
#endif

View file

@ -77,10 +77,10 @@ struct damage {
int lines;
};
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
struct combining_chars {
uint8_t count;
wchar_t chars[2]; /* This is XTerms default, but there _are_ cases where more are needed */
wchar_t chars[FOOT_UNICODE_MAX_COMBINING_CHARS];
} __attribute__((packed));
#endif
@ -89,7 +89,7 @@ struct row {
bool dirty;
bool linebreak;
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
struct combining_chars *comb_chars;
#endif
};

1036
unicode-compose-table.h Normal file

File diff suppressed because it is too large Load diff

57
vt.c
View file

@ -5,10 +5,6 @@
#include <unistd.h>
#include <assert.h>
#if FOOT_UNICODE_COMBINING
#include <utf8proc.h>
#endif
#define LOG_MODULE "vt"
#define LOG_ENABLE_DBG 0
#include "log.h"
@ -18,6 +14,10 @@
#include "osc.h"
#include "util.h"
#if FOOT_UNICODE_PRECOMPOSE
#include "unicode-compose-table.h"
#endif
#define UNHANDLED() LOG_DBG("unhandled: %s", esc_as_string(term, final))
/* https://vt100.net/emu/dec_ansi_parser */
@ -527,6 +527,34 @@ action_utf8_4_entry(struct terminal *term, uint8_t c)
term->vt.utf8.data[term->vt.utf8.idx++] = c;
}
#if FOOT_UNICODE_PRECOMPOSE && FOOT_UNICODE_MAX_COMBINING_CHARS > 0
/*
 * Looks up the pre-composed replacement for a base character followed by
 * a single combining character.
 *
 * precompose_table is binary searched using (base << 32 | comb) as the
 * key; the table must therefore be sorted in ascending order on base
 * first, and comb second.
 *
 * Returns the replacement character from the matching table entry, or
 * (wchar_t)-1 if the table has no entry for the (base, comb) pair.
 *
 * Only compiled when pre-composing is enabled, since precompose_table is
 * only available then, and this function's only caller is compiled under
 * the same conditions.
 */
static wchar_t
precompose(wchar_t base, wchar_t comb)
{
    static_assert(2 * sizeof(wchar_t) <= sizeof(uint64_t),
                  "two wchars does not fit in an uint64_t");

    const uint64_t match = ((uint64_t)base << 32) | comb;

    /*
     * Half-open search interval [lo, hi).
     *
     * The bounds are unsigned; with a closed interval, "end = middle - 1"
     * wraps around to SIZE_MAX when middle == 0 (i.e. when the key sorts
     * before the first table entry), causing the search to continue with
     * out-of-bounds indices. With a half-open interval, the upper bound
     * never needs to be decremented below the lower bound, and an empty
     * table is handled as well.
     */
    size_t lo = 0;
    size_t hi = ALEN(precompose_table);

    while (lo < hi) {
        const size_t mid = lo + (hi - lo) / 2;

        const uint64_t maybe =
            ((uint64_t)precompose_table[mid].base << 32) |
            precompose_table[mid].comb;

        if (maybe < match)
            lo = mid + 1;
        else if (maybe > match)
            hi = mid;
        else
            return precompose_table[mid].replacement;
    }

    /* No pre-composed form exists for this pair */
    return (wchar_t)-1;
}
#endif /* FOOT_UNICODE_PRECOMPOSE && FOOT_UNICODE_MAX_COMBINING_CHARS > 0 */
static void
action_utf8_print(struct terminal *term, uint8_t c)
{
@ -541,7 +569,7 @@ action_utf8_print(struct terminal *term, uint8_t c)
int width = wcwidth(wc);
#if FOOT_UNICODE_COMBINING
#if FOOT_UNICODE_MAX_COMBINING_CHARS > 0
/*
* Is this a combining character? The basic assumption is that if
@ -588,27 +616,22 @@ action_utf8_print(struct terminal *term, uint8_t c)
* If there is, replace the base character with the
* pre-composed character, as that is likely to produce a
* better looking result.
*
* TODO: we could perhaps remove this is we improve our
* positioning of the combining characters when rendering
* the glyph.
*/
struct combining_chars *comb_chars = &row->comb_chars[base_col];
#if FOOT_UNICODE_PRECOMPOSE
if (comb_chars->count == 0) {
wchar_t composed[] = {base, wc};
ssize_t composed_length = utf8proc_normalize_utf32(
composed, ALEN(composed), UTF8PROC_COMPOSE | UTF8PROC_STABLE);
int composed_width = wcwidth(composed[0]);
if (composed_length == 1 && composed_width == base_width) {
wchar_t precomposed = precompose(base, wc);
int precomposed_width = wcwidth(precomposed);
if (precomposed != (wchar_t)-1 && precomposed_width == base_width) {
term->grid->cursor.point.col = base_col;
term->grid->cursor.lcf = false;
term_print(term, composed[0], composed_width);
term_print(term, precomposed, precomposed_width);
return;
}
}
#endif
if (comb_chars->count < ALEN(comb_chars->chars))
comb_chars->chars[comb_chars->count++] = wc;
@ -622,7 +645,7 @@ action_utf8_print(struct terminal *term, uint8_t c)
return;
}
}
#endif /* FOOT_UNICODE_COMBINING */
#endif /* FOOT_UNICODE_MAX_COMBINING_CHARS > 0 */
term_print(term, wc, width);
}