diff --git a/CHANGELOG.md b/CHANGELOG.md index 20d82896..067f323c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,14 @@ * Right mouse button extends the current selection. * `CSI Ps ; Ps ; Ps t` escape sequences for the following parameters: `11t`, `13t`, `13;2t`, `14t`, `14;2t`, `15t`, `19t`. +* (Optional) support for unicode combining characters. For example, + `a\u0301` will be combined to `á` (`\u00e1`). Note that copying the + printed character to the clipboard/primary selection will copy the + byte `\u00e1` and **not** `\u0061\u0301`. It requires + [utf8proc](https://github.com/JuliaStrings/utf8proc). By default, + the feature is enabled if utf8proc is found. However, it can also be + explicitly disabled (or enabled) with `meson + -Dunicode-combining=enabled|disabled`. ### Changed diff --git a/README.md b/README.md index b55dd8aa..4271920b 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,8 @@ with the terminal emulator itself. Foot implements the following OSCs: * pixman * wayland (_client_ and _cursor_ libraries) * xkbcommon +* [utf8proc](https://github.com/JuliaStrings/utf8proc) (_optional_ - + enables unicode combining) * [tllist](https://codeberg.org/dnkl/tllist) [^1] * [fcft](https://codeberg.org/dnkl/fcft) [^1] diff --git a/meson.build b/meson.build index 98c804a6..464e3b54 100644 --- a/meson.build +++ b/meson.build @@ -57,6 +57,9 @@ wayland_client = dependency('wayland-client') wayland_cursor = dependency('wayland-cursor') xkb = dependency('xkbcommon') +utf8proc = dependency('libutf8proc', required: get_option('unicode-combining')) +add_project_arguments('-DFOOT_UNICODE_COMBINING=@0@'.format(utf8proc.found()), language: 'c') + tllist = dependency('tllist', version: '>=1.0.1', fallback: 'tllist') fcft = dependency('fcft', version: ['>=2.0.0', '<2.1.0'], fallback: 'fcft') @@ -125,7 +128,7 @@ executable( 'vt.c', 'vt.h', 'wayland.c', 'wayland.h', wl_proto_src + wl_proto_headers, version, - dependencies: [math, threads, pixman, 
wayland_client, wayland_cursor, xkb, + dependencies: [math, threads, pixman, wayland_client, wayland_cursor, xkb, utf8proc, tllist, fcft], install: true) diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 00000000..cc601843 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1 @@ +option('unicode-combining', type: 'feature', value: 'auto', description: 'Perform unicode combining') diff --git a/terminal.h b/terminal.h index 7653daa3..7fcede70 100644 --- a/terminal.h +++ b/terminal.h @@ -135,7 +135,6 @@ struct vt { struct { uint8_t data[4]; uint8_t idx; - uint8_t left; } utf8; struct { uint8_t *data; diff --git a/vt.c b/vt.c index ab36ac6b..743580a1 100644 --- a/vt.c +++ b/vt.c @@ -5,6 +5,10 @@ #include #include +#if FOOT_UNICODE_COMBINING + #include <utf8proc.h> +#endif + #define LOG_MODULE "vt" #define LOG_ENABLE_DBG 0 #include "log.h" @@ -105,7 +109,6 @@ action_clear(struct terminal *term) term->vt.params.idx = 0; term->vt.private[0] = 0; term->vt.private[1] = 0; - term->vt.utf8.idx = 0; } static void @@ -509,27 +512,21 @@ static void action_utf8_2_entry(struct terminal *term, uint8_t c) { term->vt.utf8.idx = 0; - term->vt.utf8.left = 2; term->vt.utf8.data[term->vt.utf8.idx++] = c; - term->vt.utf8.left--; } static void action_utf8_3_entry(struct terminal *term, uint8_t c) { term->vt.utf8.idx = 0; - term->vt.utf8.left = 3; term->vt.utf8.data[term->vt.utf8.idx++] = c; - term->vt.utf8.left--; } static void action_utf8_4_entry(struct terminal *term, uint8_t c) { term->vt.utf8.idx = 0; - term->vt.utf8.left = 4; term->vt.utf8.data[term->vt.utf8.idx++] = c; - term->vt.utf8.left--; } static void @@ -544,11 +541,56 @@ action_utf8_print(struct terminal *term, uint8_t c) if ((ssize_t)count < 0) wc = 0; - /* Reset VT utf8 state */ - term->vt.utf8.idx = 0; +#if FOOT_UNICODE_COMBINING + /* + * Try to combine with the previous character. + * + * We _could_ try regardless of what 'wc' is. 
However, for + * performance reasons, we only do it when 'wc' is in a known + * 'combining' range. + * + * TODO: + * - doesn't work when base character is multi-column (we'll only + * see an empty "null" character) + */ - int width = wcwidth(wc); - term_print(term, wc, width); + if (((wc >= 0x0300 && wc <= 0x036F) || /* diacritical marks */ + (wc >= 0x1AB0 && wc <= 0x1AFF) || /* diacritical marks, extended */ + (wc >= 0x1DC0 && wc <= 0x1DFF) || /* diacritical marks, supplement */ + (wc >= 0x20D0 && wc <= 0x20FF) || /* diacritical marks, for symbols */ + (wc >= 0xFE20 && wc <= 0xFE2F)) /* half marks */ + && term->grid->cursor.point.col > 0) + { + int base_col = term->grid->cursor.point.col; + if (!term->grid->cursor.lcf) + base_col--; + + assert(base_col >= 0 && base_col < term->cols); + wchar_t base = term->grid->cur_row->cells[base_col].wc; + int base_width = wcwidth(base); + + if (base_width > 0) { + wchar_t composed[] = {base, wc}; + ssize_t composed_length = utf8proc_normalize_utf32( + composed, sizeof(composed) / sizeof(composed[0]), + UTF8PROC_COMPOSE | UTF8PROC_STABLE); + + LOG_DBG("composed = 0x%04x, 0x%04x (length = %zd)", + composed[0], composed[1], composed_length); + + if (composed_length == 1) { + /* Compose success - overwrite last cell with + * combined character */ + term->grid->cursor.point.col = base_col; + term->grid->cursor.lcf = false; + term_print(term, composed[0], wcwidth(composed[0])); + return; + } + } + } +#endif /* FOOT_UNICODE_COMBINING */ + + term_print(term, wc, wcwidth(wc)); } static enum state @@ -1016,9 +1058,6 @@ static enum state state_utf8_collect_1_switch(struct terminal *term, uint8_t data) { term->vt.utf8.data[term->vt.utf8.idx++] = data; - term->vt.utf8.left--; - - assert(term->vt.utf8.left == 0); action_utf8_print(term, data); return STATE_GROUND; } @@ -1027,9 +1066,6 @@ static enum state state_utf8_collect_2_switch(struct terminal *term, uint8_t data) { term->vt.utf8.data[term->vt.utf8.idx++] = data; - 
term->vt.utf8.left--; - - assert(term->vt.utf8.left == 1); return STATE_UTF8_COLLECT_1; } @@ -1037,9 +1073,6 @@ static enum state state_utf8_collect_3_switch(struct terminal *term, uint8_t data) { term->vt.utf8.data[term->vt.utf8.idx++] = data; - term->vt.utf8.left--; - - assert(term->vt.utf8.left == 2); return STATE_UTF8_COLLECT_2; }