From 4283a8c51b37330ded50e49f61c6c9a3a4fe19b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= <daniel@ekloef.se>
Date: Mon, 27 Apr 2020 12:13:30 +0200
Subject: [PATCH] utf8: add support for unicode combining characters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This feature lets foot combine e.g. "a\u0301" to "á".

We first check if the current character (that we're about to print) is
a combining character, by checking if it's in one of the following
ranges:

* Combining Diacritical Marks (0300–036F), since version 1.0, with
  modifications in subsequent versions down to 4.1
* Combining Diacritical Marks Extended (1AB0–1AFF), version 7.0
* Combining Diacritical Marks Supplement (1DC0–1DFF), versions 4.1 to 5.2
* Combining Diacritical Marks for Symbols (20D0–20FF), since version
  1.0, with modifications in subsequent versions down to 5.1
* Combining Half Marks (FE20–FE2F), since version 1.0, with
  modifications in subsequent versions down to 8.0

If it is, we check if the last cell appears to contain a valid symbol,
and if so, we attempt to compose (combine) the last cell with the
current character, using utf8proc.

If the result is a combined character, we replace the content in the
previous cell with the new, combined character.

Thus, if you select and copy the printed character, you would get
e.g. "\u00e1" instead of "a\u0301".

This feature can be disabled. By default, it is enabled if the
utf8proc library is found, but can be explicitly disabled, or enabled,
with 'meson -Dunicode-combining=disabled|enabled'.
---
 CHANGELOG.md      |  8 +++++++
 meson.build       |  5 ++++-
 meson_options.txt |  1 +
 vt.c              | 56 +++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 67 insertions(+), 3 deletions(-)
 create mode 100644 meson_options.txt

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 20d82896..067f323c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,14 @@
 * Right mouse button extends the current selection.
 * `CSI Ps ; Ps ; Ps t` escape sequences for the following parameters:
   `11t`, `13t`, `13;2t`, `14t`, `14;2t`, `15t`, `19t`.
+* (Optional) support for Unicode combining characters. For example,
+  `a\u0301` will be combined to `á` (`\u00e1`). Note that copying the
+  printed character to the clipboard/primary selection will copy the
+  code point `\u00e1` and **not** `\u0061\u0301`. It requires
+  [utf8proc](https://github.com/JuliaStrings/utf8proc). By default,
+  the feature is enabled if utf8proc is found. However, it can also be
+  explicitly disabled (or enabled) with `meson
+  -Dunicode-combining=enabled|disabled`.
 
 ### Changed
 
diff --git a/meson.build b/meson.build
index 98c804a6..464e3b54 100644
--- a/meson.build
+++ b/meson.build
@@ -57,6 +57,9 @@ wayland_client = dependency('wayland-client')
 wayland_cursor = dependency('wayland-cursor')
 xkb = dependency('xkbcommon')
 
+utf8proc = dependency('libutf8proc', required: get_option('unicode-combining'))
+add_project_arguments('-DFOOT_UNICODE_COMBINING=@0@'.format(utf8proc.found().to_int()), language: 'c')  # emit 1/0, not true/false: '#if' treats an undefined 'true' as 0
+
 tllist = dependency('tllist', version: '>=1.0.1', fallback: 'tllist')
 fcft = dependency('fcft', version: ['>=2.0.0', '<2.1.0'], fallback: 'fcft')
 
@@ -125,7 +128,7 @@ executable(
   'vt.c', 'vt.h',
   'wayland.c', 'wayland.h',
   wl_proto_src + wl_proto_headers, version,
-  dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb,
+  dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, utf8proc,
                 tllist, fcft],
   install: true)
 
diff --git a/meson_options.txt b/meson_options.txt
new file mode 100644
index 00000000..cc601843
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1 @@
+option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining')
diff --git a/vt.c b/vt.c
index ab36ac6b..c6f86bdb 100644
--- a/vt.c
+++ b/vt.c
@@ -5,6 +5,10 @@
 #include <unistd.h>
 #include <assert.h>
 
+#if FOOT_UNICODE_COMBINING
+ #include <utf8proc.h>
+#endif
+
 #define LOG_MODULE "vt"
 #define LOG_ENABLE_DBG 0
 #include "log.h"
@@ -547,8 +551,56 @@ action_utf8_print(struct terminal *term, uint8_t c)
     /* Reset VT utf8 state */
     term->vt.utf8.idx = 0;
 
-    int width = wcwidth(wc);
-    term_print(term, wc, width);
+#if FOOT_UNICODE_COMBINING
+    /*
+     * Try to combine with the previous character.
+     *
+     * We _could_ try regardless of what 'wc' is. However, for
+     * performance reasons, we only do it when 'wc' is in a known
+     * 'combining' range.
+     *
+     * TODO:
+     *  - doesn't work when base character is multi-column (we'll only
+     *    see an empty "null" character)
+     */
+
+    if (((wc >= 0x0300 && wc <= 0x036F) || /* diacritical marks */
+         (wc >= 0x1AB0 && wc <= 0x1AFF) || /* diacritical marks, extended */
+         (wc >= 0x1DC0 && wc <= 0x1DFF) || /* diacritical marks, supplement */
+         (wc >= 0x20D0 && wc <= 0x20FF) || /* diacritical marks, for symbols */
+         (wc >= 0xFE20 && wc <= 0xFE2F))   /* half marks */
+        && term->grid->cursor.point.col > 0)
+    {
+        int base_col = term->grid->cursor.point.col;
+        if (!term->grid->cursor.lcf)
+            base_col--;
+
+        assert(base_col >= 0 && base_col < term->cols);
+        wchar_t base = term->grid->cur_row->cells[base_col].wc;
+        int base_width = wcwidth(base);
+
+        if (base_width > 0) {
+            wchar_t composed[] = {base, wc};
+            ssize_t composed_length = utf8proc_normalize_utf32(
+                composed, sizeof(composed) / sizeof(composed[0]),
+                UTF8PROC_COMPOSE | UTF8PROC_STABLE);
+
+            LOG_DBG("composed = 0x%04x, 0x%04x (length = %zd)",
+                    composed[0], composed[1], composed_length);
+
+            if (composed_length == 1) {
+                /* Compose success - overwrite last cell with
+                 * combined character */
+                term->grid->cursor.point.col = base_col;
+                term->grid->cursor.lcf = false;
+                term_print(term, composed[0], wcwidth(composed[0]));
+                return;
+            }
+        }
+    }
+#endif /* FOOT_UNICODE_COMBINING */
+
+    term_print(term, wc, wcwidth(wc));
 }
 
 static enum state