Merge branch 'unicode-combining'

2026-05-15 21:45:03 -04:00 · 2020-04-27 15:53:34 +02:00 · 2020-04-27 15:53:34 +02:00 · c4e3b9f69d
commit c4e3b9f69d
parent d959b98822 3f3fff768a
6 changed files with 68 additions and 22 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -22,6 +22,14 @@
 * Right mouse button extends the current selection.
 * `CSI Ps ; Ps ; Ps t` escape sequences for the following parameters:
  `11t`, `13t`, `13;2t`, `14t`, `14;2t`, `15t`, `19t`.
 * (Optional) support for unicode combining characters. For example,
  `a\u0301` will be combined to `á` (`\u00e1`). Note that copying the
  printed character to the clipboard/primary selection will copy the
  code point `\u00e1` and **not** `\u0061\u0301`. It requires
  [utf8proc](https://github.com/JuliaStrings/utf8proc). By default,
  the feature is enabled if utf8proc is found. However, it can also be
  explicitly disabled (or enabled) with `meson
  -Dunicode-combining=enabled|disabled`.
 ### Changed
--- a/README.md
+++ b/README.md
@ -277,6 +277,8 @@ with the terminal emulator itself. Foot implements the following OSCs:
 * pixman
 * wayland (_client_ and _cursor_ libraries)
 * xkbcommon
 * [utf8proc](https://github.com/JuliaStrings/utf8proc) (_optional_ -
  enables unicode combining)
 * [tllist](https://codeberg.org/dnkl/tllist) [^1]
 * [fcft](https://codeberg.org/dnkl/fcft) [^1]
--- a/meson.build
+++ b/meson.build
@ -57,6 +57,9 @@ wayland_client = dependency('wayland-client')
 wayland_cursor = dependency('wayland-cursor')
 xkb = dependency('xkbcommon')
 utf8proc = dependency('libutf8proc', required: get_option('unicode-combining'))
 add_project_arguments('-DFOOT_UNICODE_COMBINING=@0@'.format(utf8proc.found()), language: 'c')
 tllist = dependency('tllist', version: '>=1.0.1', fallback: 'tllist')
 fcft = dependency('fcft', version: ['>=2.0.0', '<2.1.0'], fallback: 'fcft')
@ -125,7 +128,7 @@ executable(
  'vt.c', 'vt.h',
  'wayland.c', 'wayland.h',
  wl_proto_src + wl_proto_headers, version,
-  dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb,
+  dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, utf8proc,
                tllist, fcft],
  install: true)
--- a/meson_options.txt
+++ b/meson_options.txt
@ -0,0 +1 @@
 option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining')
--- a/terminal.h
+++ b/terminal.h
@ -135,7 +135,6 @@ struct vt {
    struct {
        uint8_t data[4];
        uint8_t idx;
        uint8_t left;
    } utf8;
    struct {
        uint8_t *data;
--- a/vt.c
+++ b/vt.c
@ -5,6 +5,10 @@
 #include <unistd.h>
 #include <assert.h>
 #if FOOT_UNICODE_COMBINING
 #include <utf8proc.h>
 #endif
 #define LOG_MODULE "vt"
 #define LOG_ENABLE_DBG 0
 #include "log.h"
@ -105,7 +109,6 @@ action_clear(struct terminal *term)
    term->vt.params.idx = 0;
    term->vt.private[0] = 0;
    term->vt.private[1] = 0;
    term->vt.utf8.idx = 0;
 }
 static void
@ -509,27 +512,21 @@ static void
 action_utf8_2_entry(struct terminal *term, uint8_t c)
 {
    /* Restart collection in the vt.utf8 buffer. 'left' is set to 2;
     * presumably the total length of a 2-byte UTF-8 sequence (inferred
     * from the function name). */
    term->vt.utf8.idx = 0;
    term->vt.utf8.left = 2;
    /* Store 'c' as the first collected byte and count it as consumed,
     * leaving idx == 1 and left == 1. */
    term->vt.utf8.data[term->vt.utf8.idx++] = c;
    term->vt.utf8.left--;
 }
 /* Begin collecting a new sequence in term->vt.utf8 with 'c' as its first
  * byte; presumably the lead byte of a 3-byte UTF-8 sequence (inferred
  * from the function name). */
 static void
 action_utf8_3_entry(struct terminal *term, uint8_t c)
 {
    /* Reset the write index and set the expected byte count to 3. */
    term->vt.utf8.idx = 0;
    term->vt.utf8.left = 3;
    /* Store 'c' and count it as consumed: idx == 1, left == 2 afterwards. */
    term->vt.utf8.data[term->vt.utf8.idx++] = c;
    term->vt.utf8.left--;
 }
 /* Begin collecting a new sequence in term->vt.utf8 with 'c' as its first
  * byte; presumably the lead byte of a 4-byte UTF-8 sequence (inferred
  * from the function name). */
 static void
 action_utf8_4_entry(struct terminal *term, uint8_t c)
 {
    /* Reset the write index and set the expected byte count to 4. */
    term->vt.utf8.idx = 0;
    term->vt.utf8.left = 4;
    /* Store 'c' and count it as consumed: idx == 1, left == 3 afterwards. */
    term->vt.utf8.data[term->vt.utf8.idx++] = c;
    term->vt.utf8.left--;
 }
 static void
@ -544,11 +541,56 @@ action_utf8_print(struct terminal *term, uint8_t c)
    if ((ssize_t)count < 0)
        wc = 0;
-    /* Reset VT utf8 state */
+#if FOOT_UNICODE_COMBINING
-    term->vt.utf8.idx = 0;
+    /*
     * Try to combine with the previous character.
     *
     * We _could_ try regardless of what 'wc' is. However, for
     * performance reasons, we only do it when 'wc' is in a known
     * 'combining' range.
     *
     * TODO:
     *  - doesn't work when base character is multi-column (we'll only
     *    see an empty "null" character)
     */
-    int width = wcwidth(wc);
+    if (((wc >= 0x0300 && wc <= 0x036F) || /* diacritical marks */
-    term_print(term, wc, width);
+         (wc >= 0x1AB0 && wc <= 0x1AFF) || /* diacritical marks, extended */
         (wc >= 0x1DC0 && wc <= 0x1DFF) || /* diacritical marks, supplement */
         (wc >= 0x20D0 && wc <= 0x20FF) || /* diacritical marks, for symbols */
         (wc >= 0xFE20 && wc <= 0xFE2F))   /* half marks */
        && term->grid->cursor.point.col > 0)
    {
        int base_col = term->grid->cursor.point.col;
        if (!term->grid->cursor.lcf)
            base_col--;
        assert(base_col >= 0 && base_col < term->cols);
        wchar_t base = term->grid->cur_row->cells[base_col].wc;
        int base_width = wcwidth(base);
        if (base_width > 0) {
            wchar_t composed[] = {base, wc};
            ssize_t composed_length = utf8proc_normalize_utf32(
                composed, sizeof(composed) / sizeof(composed[0]),
                UTF8PROC_COMPOSE | UTF8PROC_STABLE);
            LOG_DBG("composed = 0x%04x, 0x%04x (length = %zd)",
                    composed[0], composed[1], composed_length);
            if (composed_length == 1) {
                /* Compose success - overwrite last cell with
                 * combined character */
                term->grid->cursor.point.col = base_col;
                term->grid->cursor.lcf = false;
                term_print(term, composed[0], wcwidth(composed[0]));
                return;
            }
        }
    }
 #endif /* FOOT_UNICODE_COMBINING */
    term_print(term, wc, wcwidth(wc));
 }
 static enum state
@ -1016,9 +1058,6 @@ static enum state
 state_utf8_collect_1_switch(struct terminal *term, uint8_t data)
 {
    /* Append 'data' to the utf8 buffer and count it as consumed. */
    term->vt.utf8.data[term->vt.utf8.idx++] = data;
    term->vt.utf8.left--;
    /* No bytes may remain outstanding at this point. */
    assert(term->vt.utf8.left == 0);
    /* Hand off to the print action, then return to the ground state. */
    action_utf8_print(term, data);
    return STATE_GROUND;
 }
@ -1027,9 +1066,6 @@ static enum state
 state_utf8_collect_2_switch(struct terminal *term, uint8_t data)
 {
    /* Append 'data' to the utf8 buffer and count it as consumed. */
    term->vt.utf8.data[term->vt.utf8.idx++] = data;
    term->vt.utf8.left--;
    /* Exactly one more byte is expected after this one. */
    assert(term->vt.utf8.left == 1);
    return STATE_UTF8_COLLECT_1;
 }
@ -1037,9 +1073,6 @@ static enum state
 state_utf8_collect_3_switch(struct terminal *term, uint8_t data)
 {
    /* Append 'data' to the utf8 buffer and count it as consumed. */
    term->vt.utf8.data[term->vt.utf8.idx++] = data;
    term->vt.utf8.left--;
    /* Exactly two more bytes are expected after this one. */
    assert(term->vt.utf8.left == 2);
    return STATE_UTF8_COLLECT_2;
 }
		`@ -0,0 +1 @@`
							`option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining')`