From cb5f80ec6a8bda1f71581261cc3fc56b20ebdd7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= <daniel@ekloef.se>
Date: Fri, 1 May 2020 11:52:40 +0200
Subject: [PATCH] vt: utf8: track combining characters that we failed to
 compose

When we detect a combining character, we first try to compose it with
the base character (like before).

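When this fails, we instead add the combining character to the base
cell's combining characters array.

Double-column base characters are now handled too: when the candidate
base cell is empty (a "null" character), we look one cell further back
for the base character.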

There are two reasons for using a composed character when possible.
First, the rendered result will look better, since it will be a single
glyph instead of two separate glyphs (possibly from different fonts!).
Second, performance: a composed character is a single glyph to render,
while a decomposed glyph sequence means the renderer has to render
multiple glyphs for a single cell.
---
 vt.c | 52 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/vt.c b/vt.c
index bd2b7f7a..8abba1da 100644
--- a/vt.c
+++ b/vt.c
@@ -546,10 +546,6 @@ action_utf8_print(struct terminal *term, uint8_t c)
      * We _could_ try regardless of what 'wc' is. However, for
      * performance reasons, we only do it when 'wc' is in a known
      * 'combining' range.
-     *
-     * TODO:
-     *  - doesn't work when base character is multi-column (we'll only
-     *    see an empty "null" character)
      */
 
     if (((wc >= 0x0300 && wc <= 0x036F) || /* diacritical marks */
@@ -559,31 +555,57 @@ action_utf8_print(struct terminal *term, uint8_t c)
          (wc >= 0xFE20 && wc <= 0xFE2F))   /* half marks */
         && term->grid->cursor.point.col > 0)
     {
+        const struct row *row = term->grid->cur_row;
+
         int base_col = term->grid->cursor.point.col;
         if (!term->grid->cursor.lcf)
             base_col--;
 
         assert(base_col >= 0 && base_col < term->cols);
-        wchar_t base = term->grid->cur_row->cells[base_col].wc;
+        wchar_t base = row->cells[base_col].wc;
+
+        /* Handle double-column glyphs */
+        if (base == 0 && base_col > 0) {
+            base_col--;
+            base = row->cells[base_col].wc;
+        }
+
         int base_width = wcwidth(base);
 
-        if (base_width > 0) {
+        if (base != 0 && base_width > 0) {
+
+            /*
+             * First, see if there's a pre-composed character of this
+             * combo, with the same column width as the base
+             * character. If there is, replace the base character with
+             * the pre-composed character, as that is likely to
+             * produce a better looking result.
+             */
+
             wchar_t composed[] = {base, wc};
             ssize_t composed_length = utf8proc_normalize_utf32(
-                composed, sizeof(composed) / sizeof(composed[0]),
-                UTF8PROC_COMPOSE | UTF8PROC_STABLE);
+                composed, ALEN(composed), UTF8PROC_COMPOSE | UTF8PROC_STABLE);
+            int composed_width = wcwidth(composed[0]);
 
-            LOG_DBG("composed = 0x%04x, 0x%04x (length = %zd)",
-                    composed[0], composed[1], composed_length);
-
-            if (composed_length == 1) {
-                /* Compose succeess - overwrite last cell with
-                 * combined character */
+            if (composed_length == 1 && composed_width == base_width) {
                 term->grid->cursor.point.col = base_col;
                 term->grid->cursor.lcf = false;
-                term_print(term, composed[0], wcwidth(composed[0]));
+                term_print(term, composed[0], composed_width);
                 return;
             }
+
+            struct combining_chars *comb_chars = &row->comb_chars[base_col];
+
+            if (comb_chars->count < ALEN(comb_chars->chars))
+                comb_chars->chars[comb_chars->count++] = wc;
+            else {
+                LOG_WARN("combining character overflow:");
+                LOG_WARN("  0x%04x", base);
+                for (size_t i = 0; i < comb_chars->count; i++)
+                    LOG_WARN("  0x%04x", comb_chars->chars[i]);
+                LOG_ERR("  0x%04x", wc);
+            }
+            return;
         }
     }
 #endif /* FOOT_UNICODE_COMBINING */