utf8: add support for unicode combining characters

This feature lets foot combine e.g. "a\u0301" to "á".

We first check if the current character (that we're about to print) is a combining character, by checking if it's in one of the following ranges:

* Combining Diacritical Marks (0300–036F), since version 1.0, with modifications in subsequent versions down to 4.1
* Combining Diacritical Marks Extended (1AB0–1AFF), version 7.0
* Combining Diacritical Marks Supplement (1DC0–1DFF), versions 4.1 to 5.2
* Combining Diacritical Marks for Symbols (20D0–20FF), since version 1.0, with modifications in subsequent versions down to 5.1
* Combining Half Marks (FE20–FE2F), since version 1.0, with modifications in subsequent versions down to 8.0

If it is, we check if the last cell appears to contain a valid symbol, and if so, we attempt to compose (combine) the last cell with the current character, using utf8proc.

If the result is a combined character, we replace the content in the previous cell with the new, combined character. Thus, if you select and copy the printed character, you would get e.g. "\u00e1" instead of "a\u0301".

This feature can be disabled. By default, it is enabled if the utf8proc library is found, but can be explicitly disabled, or enabled, with 'meson -Dunicode-combining=disabled|enabled'.
2026-02-17 22:05:22 -05:00 · 2020-04-27 12:13:30 +02:00 · 2020-04-27 12:13:30 +02:00 · 4283a8c51b
commit 4283a8c51b
parent d959b98822
4 changed files with 67 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -22,6 +22,14 @@
 * Right mouse button extends the current selection.
 * `CSI Ps ; Ps ; Ps t` escape sequences for the following parameters:
  `11t`, `13t`, `13;2t`, `14t`, `14;2t`, `15t`, `19t`.
+* (Optional) support for unicode combining characters. For example,
+  `a\u0301` will be combined to `á` (`\u00e1`). Note that copying the
+  printed character to the clipboard/primary selection will copy the
+  character `\u00e1` and **not** `\u0061\u0301`. It requires
+  [utf8proc](https://github.com/JuliaStrings/utf8proc). By default,
+  the feature is enabled if utf8proc is found. However, it can also be
+  explicitly disabled (or enabled) with `meson
+  -Dunicode-combining=enabled|disabled`.

 ### Changed

--- a/meson.build
+++ b/meson.build
@ -57,6 +57,9 @@ wayland_client = dependency('wayland-client')
 wayland_cursor = dependency('wayland-cursor')
 xkb = dependency('xkbcommon')

+utf8proc = dependency('libutf8proc', required: get_option('unicode-combining'))
+add_project_arguments('-DFOOT_UNICODE_COMBINING=@0@'.format(utf8proc.found()), language: 'c')
+
 tllist = dependency('tllist', version: '>=1.0.1', fallback: 'tllist')
 fcft = dependency('fcft', version: ['>=2.0.0', '<2.1.0'], fallback: 'fcft')

@ -125,7 +128,7 @@ executable(
  'vt.c', 'vt.h',
  'wayland.c', 'wayland.h',
  wl_proto_src + wl_proto_headers, version,
-  dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb,
+  dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, utf8proc,
                tllist, fcft],
  install: true)

--- a/meson_options.txt
+++ b/meson_options.txt
@ -0,0 +1 @@
+option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining')
--- a/vt.c
+++ b/vt.c
@ -5,6 +5,10 @@
 #include <unistd.h>
 #include <assert.h>

+#if FOOT_UNICODE_COMBINING
+ #include <utf8proc.h>
+#endif
+
 #define LOG_MODULE "vt"
 #define LOG_ENABLE_DBG 0
 #include "log.h"
@ -547,8 +551,56 @@ action_utf8_print(struct terminal *term, uint8_t c)
    /* Reset VT utf8 state */
    term->vt.utf8.idx = 0;

-    int width = wcwidth(wc);
-    term_print(term, wc, width);
+#if FOOT_UNICODE_COMBINING
+    /*
+     * Try to combine with the previous character.
+     *
+     * We _could_ try regardless of what 'wc' is. However, for
+     * performance reasons, we only do it when 'wc' is in a known
+     * 'combining' range.
+     *
+     * TODO:
+     *  - doesn't work when base character is multi-column (we'll only
+     *    see an empty "null" character)
+     */
+
+    if (((wc >= 0x0300 && wc <= 0x036F) || /* diacritical marks */
+         (wc >= 0x1AB0 && wc <= 0x1AFF) || /* diacritical marks, extended */
+         (wc >= 0x1DC0 && wc <= 0x1DFF) || /* diacritical marks, supplement */
+         (wc >= 0x20D0 && wc <= 0x20FF) || /* diacritical marks, for symbols */
+         (wc >= 0xFE20 && wc <= 0xFE2F))   /* half marks */
+        && term->grid->cursor.point.col > 0)
+    {
+        int base_col = term->grid->cursor.point.col;
+        if (!term->grid->cursor.lcf)
+            base_col--;
+
+        assert(base_col >= 0 && base_col < term->cols);
+        wchar_t base = term->grid->cur_row->cells[base_col].wc;
+        int base_width = wcwidth(base);
+
+        if (base_width > 0) {
+            wchar_t composed[] = {base, wc};
+            ssize_t composed_length = utf8proc_normalize_utf32(
+                composed, sizeof(composed) / sizeof(composed[0]),
+                UTF8PROC_COMPOSE | UTF8PROC_STABLE);
+
+            LOG_DBG("composed = 0x%04x, 0x%04x (length = %zd)",
+                    composed[0], composed[1], composed_length);
+
+            if (composed_length == 1) {
+                /* Compose success - overwrite last cell with
+                 * combined character */
+                term->grid->cursor.point.col = base_col;
+                term->grid->cursor.lcf = false;
+                term_print(term, composed[0], wcwidth(composed[0]));
+                return;
+            }
+        }
+    }
+#endif /* FOOT_UNICODE_COMBINING */
+
+    term_print(term, wc, wcwidth(wc));
 }

 static enum state
				`@ -0,0 +1 @@`
				`option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining')`