From 4283a8c51b37330ded50e49f61c6c9a3a4fe19b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 12:13:30 +0200 Subject: [PATCH 1/8] utf8: add support for unicode combining characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This feature lets foot combine e.g. "a\u0301" to "á". We first check if the current character (that we're about to print) is a combining character, by checking if it's in one of the following ranges: * Combining Diacritical Marks (0300–036F), since version 1.0, with modifications in subsequent versions down to 4.1 * Combining Diacritical Marks Extended (1AB0–1AFF), version 7.0 * Combining Diacritical Marks Supplement (1DC0–1DFF), versions 4.1 to 5.2 * Combining Diacritical Marks for Symbols (20D0–20FF), since version 1.0, with modifications in subsequent versions down to 5.1 * Combining Half Marks (FE20–FE2F), versions 1.0, with modifications in subsequent versions down to 8.0 If it is, we check if the last cell appears to contain a valid symbol, and if so, we attempt to compose (combine) the last cell with the current character, using utf8proc. If the result is a combined character, we replace the content in the previous cell with the new, combined character. Thus, if you select and copy the printed character, you would get e.g. "\u00e1" instead of "a\u0301". This feature can be disabled. By default, it is enabled if the utf8proc library is found, but can be explicitly disabled, or enabled, with 'meson -Dunicode-combining=disabled|enabled'. --- CHANGELOG.md | 8 +++++++ meson.build | 5 ++++- meson_options.txt | 1 + vt.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 meson_options.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 20d82896..067f323c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,14 @@ * Right mouse button extends the current selection. 
* `CSI Ps ; Ps ; Ps t` escape sequences for the following parameters: `11t`, `13t`, `13;2t`, `14t`, `14;2t`, `15t`, `19t`. +* (Optional) support for unicode combining characters. For example, + `a\u0301` will be combined to `á` (`\u00e1`). Note that copying the + printed character to the clipboard/primary selection will copy the + character `\u00e1` and **not** `\u0061\u0301`. It requires + [utf8proc](https://github.com/JuliaStrings/utf8proc). By default, + the feature is enabled if utf8proc is found. However, it can also be + explicitly disabled (or enabled) with `meson + -Dunicode-combining=enabled|disabled`. ### Changed diff --git a/meson.build b/meson.build index 98c804a6..464e3b54 100644 --- a/meson.build +++ b/meson.build @@ -57,6 +57,9 @@ wayland_client = dependency('wayland-client') wayland_cursor = dependency('wayland-cursor') xkb = dependency('xkbcommon') +utf8proc = dependency('libutf8proc', required: get_option('unicode-combining')) +add_project_arguments('-DFOOT_UNICODE_COMBINING=@0@'.format(utf8proc.found()), language: 'c') + tllist = dependency('tllist', version: '>=1.0.1', fallback: 'tllist') fcft = dependency('fcft', version: ['>=2.0.0', '<2.1.0'], fallback: 'fcft') @@ -125,7 +128,7 @@ executable( 'vt.c', 'vt.h', 'wayland.c', 'wayland.h', wl_proto_src + wl_proto_headers, version, - dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, + dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, utf8proc, tllist, fcft], install: true) diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 00000000..cc601843 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1 @@ +option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining') diff --git a/vt.c b/vt.c index ab36ac6b..c6f86bdb 100644 --- a/vt.c +++ b/vt.c @@ -5,6 +5,10 @@ #include #include +#if FOOT_UNICODE_COMBINING + #include +#endif + #define LOG_MODULE "vt" #define LOG_ENABLE_DBG 0 #include "log.h" @@ -547,8 
+551,56 @@ action_utf8_print(struct terminal *term, uint8_t c) /* Reset VT utf8 state */ term->vt.utf8.idx = 0; - int width = wcwidth(wc); - term_print(term, wc, width); +#if FOOT_UNICODE_COMBINING + /* + * Try to combine with the previous character. + * + * We _could_ try regardless of what 'wc' is. However, for + * performance reasons, we only do it when 'wc' is in a known + * 'combining' range. + * + * TODO: + * - doesn't work when base character is multi-column (we'll only + * see an empty "null" character) + */ + + if (((wc >= 0x0300 && wc <= 0x036F) || /* diacritical marks */ + (wc >= 0x1AB0 && wc <= 0x1AFF) || /* diacritical marks, extended */ + (wc >= 0x1DC0 && wc <= 0x1DFF) || /* diacritical marks, supplement */ + (wc >= 0x20D0 && wc <= 0x20FF) || /* diacritical marks, for symbols */ + (wc >= 0xFE20 && wc <= 0xFE2F)) /* half marks */ + && term->grid->cursor.point.col > 0) + { + int base_col = term->grid->cursor.point.col; + if (!term->grid->cursor.lcf) + base_col--; + + assert(base_col >= 0 && base_col < term->cols); + wchar_t base = term->grid->cur_row->cells[base_col].wc; + int base_width = wcwidth(base); + + if (base_width > 0) { + wchar_t composed[] = {base, wc}; + ssize_t composed_length = utf8proc_normalize_utf32( + composed, sizeof(composed) / sizeof(composed[0]), + UTF8PROC_COMPOSE | UTF8PROC_STABLE); + + LOG_DBG("composed = 0x%04x, 0x%04x (length = %zd)", + composed[0], composed[1], composed_length); + + if (composed_length == 1) { + /* Compose success - overwrite last cell with + * combined character */ + term->grid->cursor.point.col = base_col; + term->grid->cursor.lcf = false; + term_print(term, composed[0], wcwidth(composed[0])); + return; + } + } + } +#endif /* FOOT_UNICODE_COMBINING */ + + term_print(term, wc, wcwidth(wc)); } static enum state From 2008207929ef08ce6e9aa5143e81b2905266e4d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 12:25:35 +0200 Subject: [PATCH 2/8] readme: add utf8proc to the list 
of dependencies --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b55dd8aa..27810fdc 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,7 @@ with the terminal emulator itself. Foot implements the following OSCs: * pixman * wayland (_client_ and _cursor_ libraries) * xkbcommon +* utf8proc (_optional_ - enables unicode combining) * [tllist](https://codeberg.org/dnkl/tllist) [^1] * [fcft](https://codeberg.org/dnkl/fcft) [^1] From aae998d9a3d78f1303cc2f991b0240beecf95574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 12:26:39 +0200 Subject: [PATCH 3/8] readme: add link to utf8proc --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 27810fdc..92534d6d 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,8 @@ with the terminal emulator itself. Foot implements the following OSCs: * pixman * wayland (_client_ and _cursor_ libraries) * xkbcommon -* utf8proc (_optional_ - enables unicode combining) +* [utf8proc](https://github.com/JuliaStrings/utf8pro) (_optional_ - + enables unicode combining) * [tllist](https://codeberg.org/dnkl/tllist) [^1] * [fcft](https://codeberg.org/dnkl/fcft) [^1] From 3fb3b63d93a06b25eaed84a555049c0fe2ecb6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 12:27:00 +0200 Subject: [PATCH 4/8] readme: link to utf8proc was cut off --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 92534d6d..4271920b 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,7 @@ with the terminal emulator itself. 
Foot implements the following OSCs: * pixman * wayland (_client_ and _cursor_ libraries) * xkbcommon -* [utf8proc](https://github.com/JuliaStrings/utf8pro) (_optional_ - +* [utf8proc](https://github.com/JuliaStrings/utf8proc) (_optional_ - enables unicode combining) * [tllist](https://codeberg.org/dnkl/tllist) [^1] * [fcft](https://codeberg.org/dnkl/fcft) [^1] From e478874dd9cefb08011c46d43deb850e68a4458b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 15:06:23 +0200 Subject: [PATCH 5/8] term: remove unneeded utf8.left member --- terminal.h | 1 - vt.c | 15 --------------- 2 files changed, 16 deletions(-) diff --git a/terminal.h b/terminal.h index 7653daa3..7fcede70 100644 --- a/terminal.h +++ b/terminal.h @@ -135,7 +135,6 @@ struct vt { struct { uint8_t data[4]; uint8_t idx; - uint8_t left; } utf8; struct { uint8_t *data; diff --git a/vt.c b/vt.c index c6f86bdb..0a34a45e 100644 --- a/vt.c +++ b/vt.c @@ -513,27 +513,21 @@ static void action_utf8_2_entry(struct terminal *term, uint8_t c) { term->vt.utf8.idx = 0; - term->vt.utf8.left = 2; term->vt.utf8.data[term->vt.utf8.idx++] = c; - term->vt.utf8.left--; } static void action_utf8_3_entry(struct terminal *term, uint8_t c) { term->vt.utf8.idx = 0; - term->vt.utf8.left = 3; term->vt.utf8.data[term->vt.utf8.idx++] = c; - term->vt.utf8.left--; } static void action_utf8_4_entry(struct terminal *term, uint8_t c) { term->vt.utf8.idx = 0; - term->vt.utf8.left = 4; term->vt.utf8.data[term->vt.utf8.idx++] = c; - term->vt.utf8.left--; } static void @@ -1068,9 +1062,6 @@ static enum state state_utf8_collect_1_switch(struct terminal *term, uint8_t data) { term->vt.utf8.data[term->vt.utf8.idx++] = data; - term->vt.utf8.left--; - - assert(term->vt.utf8.left == 0); action_utf8_print(term, data); return STATE_GROUND; } @@ -1079,9 +1070,6 @@ static enum state state_utf8_collect_2_switch(struct terminal *term, uint8_t data) { term->vt.utf8.data[term->vt.utf8.idx++] = data; - term->vt.utf8.left--; - 
- assert(term->vt.utf8.left == 1); return STATE_UTF8_COLLECT_1; } @@ -1089,9 +1077,6 @@ static enum state state_utf8_collect_3_switch(struct terminal *term, uint8_t data) { term->vt.utf8.data[term->vt.utf8.idx++] = data; - term->vt.utf8.left--; - - assert(term->vt.utf8.left == 2); return STATE_UTF8_COLLECT_2; } From 4278af99d2954bd8d87c0db6fa1290663c1d9040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 15:47:44 +0200 Subject: [PATCH 6/8] vt: utf8-*-entry: idx is cleared in action_clear() --- vt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vt.c b/vt.c index 0a34a45e..54a9a415 100644 --- a/vt.c +++ b/vt.c @@ -512,21 +512,21 @@ action_put(struct terminal *term, uint8_t c) static void action_utf8_2_entry(struct terminal *term, uint8_t c) { - term->vt.utf8.idx = 0; + assert(term->vt.utf8.idx == 0); term->vt.utf8.data[term->vt.utf8.idx++] = c; } static void action_utf8_3_entry(struct terminal *term, uint8_t c) { - term->vt.utf8.idx = 0; + assert(term->vt.utf8.idx == 0); term->vt.utf8.data[term->vt.utf8.idx++] = c; } static void action_utf8_4_entry(struct terminal *term, uint8_t c) { - term->vt.utf8.idx = 0; + assert(term->vt.utf8.idx == 0); term->vt.utf8.data[term->vt.utf8.idx++] = c; } From d1fc419e34b125dc99a6f5e5e1818fb1e8e72096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 15:49:07 +0200 Subject: [PATCH 7/8] vt: action_utf8_print: idx is cleared in action_clear() --- vt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/vt.c b/vt.c index 54a9a415..3dcb2f57 100644 --- a/vt.c +++ b/vt.c @@ -542,9 +542,6 @@ action_utf8_print(struct terminal *term, uint8_t c) if ((ssize_t)count < 0) wc = 0; - /* Reset VT utf8 state */ - term->vt.utf8.idx = 0; - #if FOOT_UNICODE_COMBINING /* * Try to combine with the previous character. 
From 3f3fff768aebe4d1a7822bbe3ecc3e68a3023c8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Mon, 27 Apr 2020 15:50:44 +0200 Subject: [PATCH 8/8] vt: lazily reset utf8 in action_utf8_*_entry action_clear() is in the super hot code path. Avoid resetting utf8 state there, as utf8 input is relatively uncommon. Instead, reset it when we explicitly enter any of the utf8 collecting states, as this is exactly the point where we need it. --- vt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vt.c b/vt.c index 3dcb2f57..743580a1 100644 --- a/vt.c +++ b/vt.c @@ -109,7 +109,6 @@ action_clear(struct terminal *term) term->vt.params.idx = 0; term->vt.private[0] = 0; term->vt.private[1] = 0; - term->vt.utf8.idx = 0; } static void @@ -512,21 +511,21 @@ action_put(struct terminal *term, uint8_t c) static void action_utf8_2_entry(struct terminal *term, uint8_t c) { - assert(term->vt.utf8.idx == 0); + term->vt.utf8.idx = 0; term->vt.utf8.data[term->vt.utf8.idx++] = c; } static void action_utf8_3_entry(struct terminal *term, uint8_t c) { - assert(term->vt.utf8.idx == 0); + term->vt.utf8.idx = 0; term->vt.utf8.data[term->vt.utf8.idx++] = c; } static void action_utf8_4_entry(struct terminal *term, uint8_t c) { - assert(term->vt.utf8.idx == 0); + term->vt.utf8.idx = 0; term->vt.utf8.data[term->vt.utf8.idx++] = c; }