wip: grapheme shaping

2026-05-01 06:46:43 -04:00 · 2020-08-20 19:25:35 +02:00 · 2020-08-20 19:25:35 +02:00 · b9ef703eb1
commit b9ef703eb1
parent c1cde66f70
16 changed files with 340 additions and 178 deletions
--- a/vt.c
+++ b/vt.c
@ -4,9 +4,14 @@
 #include <string.h>
 #include <unistd.h>

+#if defined(FOOT_GRAPHEME_CLUSTERING)
+ #include <utf8proc.h>
+#endif
+
 #define LOG_MODULE "vt"
 #define LOG_ENABLE_DBG 0
 #include "log.h"
+#include "config.h"
 #include "csi.h"
 #include "dcs.h"
 #include "debug.h"
@ -283,6 +288,7 @@ action_execute(struct terminal *term, uint8_t c)
 static void
 action_print(struct terminal *term, uint8_t c)
 {
+    term_reset_grapheme_state(term);
    term->ascii_printer(term, c);
 }

@ -583,152 +589,166 @@ static void
 action_utf8_print(struct terminal *term, wchar_t wc)
 {
    int width = wcwidth(wc);
+    const bool grapheme_clustering = term->conf->tweak.grapheme_shaping;

-    /*
-     * Is this is combining character? The basic assumption is that if
-     * wcwdith() returns 0, then it *is* a combining character.
-     *
-     * We hen optimize this by ignoring all characters before 0x0300,
-     * since there aren't any zero-width characters there. This means
-     * all "normal" western characters will quickly be categorized as
-     * *not* being combining characters.
-     *
-     * TODO: xterm does more or less the same, but also filters a
-     * small subset of BIDI control characters. Should we too? I think
-     * what we have here is good enough - a control character
-     * shouldn't have a glyph associated with it, so rendering
-     * shouldn't be affected.
-     *
-     * TODO: handle line-wrap when locating the base character.
-     */
-    if (width == 0 && wc >= 0x0300 && term->grid->cursor.point.col > 0) {
-        const struct row *row = term->grid->cur_row;
+#if !defined(FOOT_GRAPHEME_CLUSTERING)
+    xassert(!grapheme_clustering);
+#endif

-        int base_col = term->grid->cursor.point.col;
+    if (term->grid->cursor.point.col > 0 &&
+        (grapheme_clustering ||
+         (!grapheme_clustering && width == 0 && wc >= 0x300)))
+    {
+        int col = term->grid->cursor.point.col;
        if (!term->grid->cursor.lcf)
-            base_col--;
+            col--;

-        while (row->cells[base_col].wc >= CELL_SPACER && base_col > 0)
-            base_col--;
+        /* Skip past spacers */
+        struct row *row = term->grid->cur_row;
+        while (row->cells[col].wc >= CELL_SPACER && col > 0)
+            col--;

-        xassert(base_col >= 0 && base_col < term->cols);
-        wchar_t base = row->cells[base_col].wc;
+        xassert(col >= 0 && col < term->cols);
+        wchar_t base = row->cells[col].wc;
+        wchar_t UNUSED last = base;

+        /* Is base cell already a cluster? */
        const struct composed *composed =
            (base >= CELL_COMB_CHARS_LO &&
             base < (CELL_COMB_CHARS_LO + term->composed_count))
            ? &term->composed[base - CELL_COMB_CHARS_LO]
            : NULL;

-        if (composed != NULL)
-            base = composed->base;
+        if (composed != NULL) {
+            base = composed->chars[0];
+            last = composed->chars[composed->count - 1];
+        }
+
+#if defined(FOOT_GRAPHEME_CLUSTERING)
+        if (grapheme_clustering) {
+            /* Check if we're on a grapheme cluster break */
+            /* Note: utf8proc fails to ZWJ */
+            if (utf8proc_grapheme_break_stateful(last, wc, &term->vt.grapheme_state) &&
+                last != 0x200d /* ZWJ */)
+            {
+                term_reset_grapheme_state(term);
+                if (width > 0)
+                    term_print(term, wc, width);
+                return;
+            }
+        }
+#endif

        int base_width = wcwidth(base);
+        term->grid->cursor.point.col = col;
+        term->grid->cursor.lcf = false;

-        if (base != 0 && base_width > 0) {
+        if (composed == NULL) {
+            bool base_from_primary;
+            bool comb_from_primary;
+            bool pre_from_primary;
+
+            wchar_t precomposed = fcft_precompose(
+                term->fonts[0], base, wc, &base_from_primary,
+                &comb_from_primary, &pre_from_primary);
+
+            int precomposed_width = wcwidth(precomposed);

            /*
-             * If this is the *first* combining characger, see if
-             * there's a pre-composed character of this combo, with
-             * the same column width as the base character.
+             * Only use the pre-composed character if:
             *
-             * If there is, replace the base character with the
-             * pre-composed character, as that is likely to produce a
-             * better looking result.
+             *  1. we *have* a pre-composed character
+             *  2. the width matches the base characters width
+             *  3. it's in the primary font, OR one of the base or
+             *     combining characters are *not* from the primary
+             *     font
             */
-            term->grid->cursor.point.col = base_col;
-            term->grid->cursor.lcf = false;

-            if (composed == NULL) {
-                bool base_from_primary;
-                bool comb_from_primary;
-                bool pre_from_primary;
-
-                wchar_t precomposed = fcft_precompose(
-                    term->fonts[0], base, wc, &base_from_primary,
-                    &comb_from_primary, &pre_from_primary);
-
-                int precomposed_width = wcwidth(precomposed);
-
-                /*
-                 * Only use the pre-composed character if:
-                 *
-                 *  1. we *have* a pre-composed character
-                 *  2. the width matches the base characters width
-                 *  3. it's in the primary font, OR one of the base or
-                 *     combining characters are *not* from the primary
-                 *     font
-                 */
-
-                if (precomposed != (wchar_t)-1 &&
-                    precomposed_width == base_width &&
-                    (pre_from_primary ||
-                     !base_from_primary ||
-                     !comb_from_primary))
-                {
-                    term_print(term, precomposed, precomposed_width);
-                    return;
-                }
+            if (precomposed != (wchar_t)-1 &&
+                precomposed_width == base_width &&
+                (pre_from_primary ||
+                 !base_from_primary ||
+                 !comb_from_primary))
+            {
+                term_reset_grapheme_state(term);
+                term_print(term, precomposed, precomposed_width);
+                return;
            }
+        }

-            size_t wanted_count = composed != NULL ? composed->count + 1 : 1;
-            if (wanted_count > ALEN(composed->combining)) {
-                xassert(composed != NULL);
+        size_t wanted_count = composed != NULL ? composed->count + 1 : 2;
+        if (wanted_count > ALEN(composed->chars)) {
+            xassert(composed != NULL);

 #if defined(LOG_ENABLE_DBG) && LOG_ENABLE_DBG
-                LOG_WARN("combining character overflow:");
-                LOG_WARN("  base: 0x%04x", composed->base);
-                for (size_t i = 0; i < composed->count; i++)
-                    LOG_WARN("    cc: 0x%04x", composed->combining[i]);
-                LOG_ERR("   new: 0x%04x", wc);
+            LOG_WARN("combining character overflow:");
+            LOG_WARN("  base: 0x%04x", composed->chars[0]);
+            for (size_t i = 1; i < composed->count; i++)
+                LOG_WARN("    cc: 0x%04x", composed->chars[i]);
+            LOG_ERR("   new: 0x%04x", wc);
 #endif
-                /* This are going to break anyway... */
-                wanted_count--;
+            /* This is going to break anyway... */
+            wanted_count--;
+        }
+
+        xassert(wanted_count <= ALEN(composed->chars));
+
+        /* Look for existing combining chain */
+        for (size_t i = 0; i < term->composed_count; i++) {
+            const struct composed *cc = &term->composed[i];
+
+            if (cc->count != wanted_count)
+                continue;
+
+            if (cc->chars[0]  != base)
+                continue;
+
+            bool match = true;
+            for (size_t j = 1; j < wanted_count - 1; j++) {
+                if (cc->chars[j] != composed->chars[j]) {
+                    match = false;
+                    break;
+                }
            }
+            if (!match)
+                continue;

-            xassert(wanted_count <= ALEN(composed->combining));
+            if (cc->chars[wanted_count - 1] != wc)
+                continue;

-            /* Look for existing combining chain */
-            for (size_t i = 0; i < term->composed_count; i++) {
-                const struct composed *cc = &term->composed[i];
-                if (cc->base != base)
-                    continue;
+            int grapheme_width = my_wcswidth(cc->chars, cc->count);
+            if (grapheme_width > 0)
+                term_print(term, CELL_COMB_CHARS_LO + i, grapheme_width);
+            return;
+        }

-                if (cc->count != wanted_count)
-                    continue;
+        /* Allocate new chain */

-                if (cc->combining[wanted_count - 1] != wc)
-                    continue;
+        struct composed new_cc;
+        new_cc.count = wanted_count;
+        new_cc.chars[0] = base;
+        for (size_t i = 1; i < wanted_count - 1; i++)
+            new_cc.chars[i] = composed->chars[i];
+        new_cc.chars[wanted_count - 1] = wc;

-                term_print(term, CELL_COMB_CHARS_LO + i, base_width);
-                return;
-            }
+        if (term->composed_count < CELL_COMB_CHARS_HI) {
+            term->composed_count++;
+            term->composed = xrealloc(term->composed, term->composed_count * sizeof(term->composed[0]));
+            term->composed[term->composed_count - 1] = new_cc;

-            /* Allocate new chain */
-
-            struct composed new_cc;
-            new_cc.base = base;
-            new_cc.count = wanted_count;
-            for (size_t i = 0; i < wanted_count - 1; i++)
-                new_cc.combining[i] = composed->combining[i];
-            new_cc.combining[wanted_count - 1] = wc;
-
-            if (term->composed_count < CELL_COMB_CHARS_HI) {
-                term->composed_count++;
-                term->composed = xrealloc(term->composed, term->composed_count * sizeof(term->composed[0]));
-                term->composed[term->composed_count - 1] = new_cc;
-
-                term_print(term, CELL_COMB_CHARS_LO + term->composed_count - 1, base_width);
-                return;
-            } else {
-                /* We reached our maximum number of allowed composed
-                 * character chains. Fall through here and print the
-                 * current zero-width character to the current cell */
-                LOG_WARN("maximum number of composed characters reached");
-            }
+            int grapheme_width = my_wcswidth(new_cc.chars, new_cc.count);
+            if (grapheme_width > 0)
+                term_print(term, CELL_COMB_CHARS_LO + term->composed_count - 1, grapheme_width);
+            return;
+        } else {
+            /* We reached our maximum number of allowed composed
+             * character chains. Fall through here and print the
+             * current zero-width character to the current cell */
+            LOG_WARN("maximum number of composed characters reached");
        }
    }

+    term_reset_grapheme_state(term);
    if (width > 0)
        term_print(term, wc, width);
 }