unicode-precompose: use fcft's precompose functionality

This gives us more options when determining whether to use a pre-composed character or not: we now only use the pre-composed character if it's from the primary font, or if at least one of the base or combining characters is from a fallback font. I.e. use glyphs from the primary font if possible. But, if one or more of the decomposed glyphs are from a fallback font, use the pre-composed character anyway.
2026-02-05 04:06:08 -05:00 · 2020-05-08 23:36:33 +02:00 · 2020-05-08 23:36:33 +02:00 · b1b32152c1
commit b1b32152c1
parent c090a0664f
4 changed files with 26 additions and 33878 deletions
--- a/UnicodeData.txt
+++ b/UnicodeData.txt
--- a/meson.build
+++ b/meson.build
@ -92,17 +92,6 @@ foreach prot : [
    command: [wscanner_prog, 'private-code', '@INPUT@', '@OUTPUT@'])
 endforeach

-if get_option('unicode-precompose')
-  generate_unicode_precompose_sh = files('scripts/generate-unicode-precompose.sh')
-  unicode_data = custom_target(
-    'unicode-data',
-    input: 'UnicodeData.txt',
-    output: 'unicode-compose-table.h',
-    command: [generate_unicode_precompose_sh, '@INPUT@', '@OUTPUT@'])
-else
-  unicode_data = []
-endif
-
 generate_version_sh = files('generate-version.sh')
 version = custom_target(
  'generate_version',
@ -138,7 +127,7 @@ executable(
  'tokenize.c', 'tokenize.h',
  'vt.c', 'vt.h',
  'wayland.c', 'wayland.h',
-  wl_proto_src + wl_proto_headers, version, unicode_data,
+  wl_proto_src + wl_proto_headers, version,
  dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, tllist, fcft],
  install: true)

--- a/scripts/generate-unicode-precompose.sh
+++ b/scripts/generate-unicode-precompose.sh
@ -1,33 +0,0 @@
-#!/usr/bin/sh
-
-unicodedata_txt="${1}"
-output="${2}"
-
-cat <<EOF > "${output}"
-#pragma once
-
-#include <wchar.h>
-
-static const struct {
-    wchar_t replacement;
-    wchar_t base;
-    wchar_t comb;
-} precompose_table[] = {
-EOF
-
-# extract canonical decomposition data from UnicodeData.txt,
-# - pad hex values to 5 digits,
-# - sort numerically on base character, then combining character,
-# - then reduce to 4 digits again where possible
-#
-# "borrowed" from xterm/unicode/make-precompose.sh
-
-cut "${unicodedata_txt}" -d ";" -f 1,6 |
-    grep ";[0-9,A-F]" | grep " " |
-    sed -e "s/ /, 0x/;s/^/{ 0x/;s/;/, 0x/;s/$/},/" |
-    sed -e "s,0x\(....\)\([^0-9A-Fa-f]\),0x0\1\2,g" |
-    (sort -k 3 || sort +2) |
-    sed -e "s,0x0\(...[0-9A-Fa-f]\),0x\1,g" |
-    sed 's/^/    /' >> "${output}"
-
-echo "};" >> "${output}"
--- a/vt.c
+++ b/vt.c
@ -14,10 +14,6 @@
 #include "osc.h"
 #include "util.h"

-#if FOOT_UNICODE_PRECOMPOSE
- #include "unicode-compose-table.h"
-#endif
-
 #define UNHANDLED() LOG_DBG("unhandled: %s", esc_as_string(term, final))

 /* https://vt100.net/emu/dec_ansi_parser */
@ -527,36 +523,6 @@ action_utf8_4_entry(struct terminal *term, uint8_t c)
    term->vt.utf8.data[term->vt.utf8.idx++] = c;
 }

-#if FOOT_UNICODE_PRECOMPOSE
-static wchar_t
-precompose(wchar_t base, wchar_t comb)
-{
-    static_assert(2 * sizeof(wchar_t) <= sizeof(uint64_t),
-                  "two wchars does not fit in an uint64_t");
-
-    const uint64_t match = (uint64_t)base << 32 | comb;
-
-    ssize_t start = 0;
-    ssize_t end = ALEN(precompose_table) - 1;
-
-    while (start <= end) {
-        size_t middle = (start + end) / 2;
-
-        const uint64_t maybe =
-            (uint64_t)precompose_table[middle].base << 32 | precompose_table[middle].comb;
-
-        if (maybe < match)
-            start = middle + 1;
-        else if (maybe > match)
-            end = middle - 1;
-        else
-            return precompose_table[middle].replacement;
-    }
-
-    return (wchar_t)-1;
-}
-#endif
-
 static void
 action_utf8_print(struct terminal *term, uint8_t c)
 {
@ -631,9 +597,32 @@ action_utf8_print(struct terminal *term, uint8_t c)

 #if FOOT_UNICODE_PRECOMPOSE
            if (composed == NULL) {
-                wchar_t precomposed = precompose(base, wc);
+                bool base_from_primary;
+                bool comb_from_primary;
+                bool pre_from_primary;
+
+                wchar_t precomposed = fcft_precompose(
+                    term->fonts[0], base, wc, &base_from_primary,
+                    &comb_from_primary, &pre_from_primary);
+
                int precomposed_width = wcwidth(precomposed);
-                if (precomposed != (wchar_t)-1 && precomposed_width == base_width) {
+
+                /*
+                 * Only use the pre-composed character if:
+                 *
+                 *  1. we *have* a pre-composed character
+                 *  2. the width matches the base character's width
+                 *  3. it's in the primary font, OR one of the base or
+                 *     combining characters is *not* from the primary
+                 *     font
+                 */
+
+                if (precomposed != (wchar_t)-1 &&
+                    precomposed_width == base_width &&
+                    (pre_from_primary ||
+                     !base_from_primary ||
+                     !comb_from_primary))
+                {
                    term_print(term, precomposed, precomposed_width);
                    return;
                }