url-mode: handle wide chars and grapheme clusters when auto-detecting URLs

* Skip spacer cells. This fixes an issue where characters following a
  double-width character weren't detected properly.

* Unpack grapheme clusters (i.e. cells with multiple codepoints), and
  iterate all their codepoints.

Closes #1465
This commit is contained in:
Daniel Eklöf 2023-08-21 16:26:18 +02:00
parent 482a032d1a
commit 4f3f614457
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
2 changed files with 118 additions and 90 deletions

View file

@ -76,9 +76,12 @@
([#1436][1436]). ([#1436][1436]).
* One frame being rendered at the wrong scale after being hidden by * One frame being rendered at the wrong scale after being hidden by
another opaque, maximized window ([#1464][1464]). another opaque, maximized window ([#1464][1464]).
* Double-width characters and grapheme clusters breaking URL
auto-detection ([#1465][1465]).
[1436]: https://codeberg.org/dnkl/foot/issues/1436 [1436]: https://codeberg.org/dnkl/foot/issues/1436
[1464]: https://codeberg.org/dnkl/foot/issues/1464 [1464]: https://codeberg.org/dnkl/foot/issues/1464
[1465]: https://codeberg.org/dnkl/foot/issues/1465
### Security ### Security

View file

@ -326,7 +326,25 @@ auto_detected(const struct terminal *term, enum url_action action,
for (int c = 0; c < term->cols; c++) { for (int c = 0; c < term->cols; c++) {
const struct cell *cell = &row->cells[c]; const struct cell *cell = &row->cells[c];
char32_t wc = cell->wc;
if (cell->wc >= CELL_SPACER)
continue;
const char32_t *wcs = NULL;
size_t wc_count = 0;
if (cell->wc >= CELL_COMB_CHARS_LO && cell->wc <= CELL_COMB_CHARS_HI) {
struct composed *composed =
composed_lookup(term->composed, cell->wc - CELL_COMB_CHARS_LO);
wcs = composed->chars;
wc_count = composed->count;
} else {
wcs = &cell->wc;
wc_count = 1;
}
for (size_t w_idx = 0; w_idx < wc_count; w_idx++) {
char32_t wc = wcs[w_idx];
switch (state) { switch (state) {
case STATE_PROTOCOL: case STATE_PROTOCOL:
@ -348,9 +366,11 @@ auto_detected(const struct terminal *term, enum url_action action,
if (proto_char_count < prot_len) if (proto_char_count < prot_len)
continue; continue;
const char32_t *proto = &proto_chars[max_prot_len - prot_len]; const char32_t *proto =
&proto_chars[max_prot_len - prot_len];
if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) == 0) { if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) ==
0) {
state = STATE_URL; state = STATE_URL;
start = proto_start[max_prot_len - prot_len]; start = proto_start[max_prot_len - prot_len];
@ -364,12 +384,9 @@ auto_detected(const struct terminal *term, enum url_action action,
break; break;
case STATE_URL: { case STATE_URL: {
const char32_t *match = bsearch( const char32_t *match =
&wc, bsearch(&wc, uri_characters, uri_characters_count,
uri_characters, sizeof(uri_characters[0]), &c32cmp_single);
uri_characters_count,
sizeof(uri_characters[0]),
&c32cmp_single);
bool emit_url = false; bool emit_url = false;
@ -449,8 +466,15 @@ auto_detected(const struct terminal *term, enum url_action action,
bool done = false; bool done = false;
do { do {
switch (url[len - 1]) { switch (url[len - 1]) {
case U'.': case U',': case U':': case U';': case U'?': case U'.':
case U'!': case U'"': case U'\'': case U'%': case U',':
case U':':
case U';':
case U'?':
case U'!':
case U'"':
case U'\'':
case U'%':
len--; len--;
end.col--; end.col--;
if (end.col < 0) { if (end.col < 0) {
@ -474,10 +498,10 @@ auto_detected(const struct terminal *term, enum url_action action,
if (url_utf8 != NULL) { if (url_utf8 != NULL) {
tll_push_back( tll_push_back(
*urls, *urls,
((struct url){ ((struct url){.id = (uint64_t)rand() << 32 | rand(),
.id = (uint64_t)rand() << 32 | rand(),
.url = url_utf8, .url = url_utf8,
.range = { .range =
{
.start = start, .start = start,
.end = end, .end = end,
}, },
@ -494,6 +518,7 @@ auto_detected(const struct terminal *term, enum url_action action,
} }
} }
} }
}
} }
static void static void