url-mode: handle wide chars and grapheme clusters when auto-detecting URLs

* Skip spacer cells. This fixes an issue where characters following a
  double-width character weren't detected properly.

* Unpack grapheme clusters (i.e. cells with multiple codepoints), and
  iterate all their codepoints.

Closes #1465
This commit is contained in:
Daniel Eklöf 2023-08-21 16:26:18 +02:00
parent 482a032d1a
commit 4f3f614457
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
2 changed files with 118 additions and 90 deletions

View file

@ -76,9 +76,12 @@
([#1436][1436]). ([#1436][1436]).
* One frame being rendered at the wrong scale after being hidden by * One frame being rendered at the wrong scale after being hidden by
another opaque, maximized window ([#1464][1464]). another opaque, maximized window ([#1464][1464]).
* Double-width characters, and grapheme clusters breaking URL
auto-detection ([#1465][1465]).
[1436]: https://codeberg.org/dnkl/foot/issues/1436 [1436]: https://codeberg.org/dnkl/foot/issues/1436
[1464]: https://codeberg.org/dnkl/foot/issues/1464 [1464]: https://codeberg.org/dnkl/foot/issues/1464
[1465]: https://codeberg.org/dnkl/foot/issues/1465
### Security ### Security

View file

@ -326,121 +326,138 @@ auto_detected(const struct terminal *term, enum url_action action,
for (int c = 0; c < term->cols; c++) { for (int c = 0; c < term->cols; c++) {
const struct cell *cell = &row->cells[c]; const struct cell *cell = &row->cells[c];
char32_t wc = cell->wc;
switch (state) { if (cell->wc >= CELL_SPACER)
case STATE_PROTOCOL: continue;
for (size_t i = 0; i < max_prot_len - 1; i++) {
const char32_t *wcs = NULL;
size_t wc_count = 0;
if (cell->wc >= CELL_COMB_CHARS_LO && cell->wc <= CELL_COMB_CHARS_HI) {
struct composed *composed =
composed_lookup(term->composed, cell->wc - CELL_COMB_CHARS_LO);
wcs = composed->chars;
wc_count = composed->count;
} else {
wcs = &cell->wc;
wc_count = 1;
}
for (size_t w_idx = 0; w_idx < wc_count; w_idx++) {
char32_t wc = wcs[w_idx];
switch (state) {
case STATE_PROTOCOL:
for (size_t i = 0; i < max_prot_len - 1; i++) {
proto_chars[i] = proto_chars[i + 1]; proto_chars[i] = proto_chars[i + 1];
proto_start[i] = proto_start[i + 1]; proto_start[i] = proto_start[i + 1];
} }
if (proto_char_count >= max_prot_len) if (proto_char_count >= max_prot_len)
proto_char_count = max_prot_len - 1; proto_char_count = max_prot_len - 1;
proto_chars[max_prot_len - 1] = wc; proto_chars[max_prot_len - 1] = wc;
proto_start[max_prot_len - 1] = (struct coord){c, r}; proto_start[max_prot_len - 1] = (struct coord){c, r};
proto_char_count++; proto_char_count++;
for (size_t i = 0; i < conf->url.prot_count; i++) { for (size_t i = 0; i < conf->url.prot_count; i++) {
size_t prot_len = c32len(conf->url.protocols[i]); size_t prot_len = c32len(conf->url.protocols[i]);
if (proto_char_count < prot_len) if (proto_char_count < prot_len)
continue; continue;
const char32_t *proto = &proto_chars[max_prot_len - prot_len]; const char32_t *proto =
&proto_chars[max_prot_len - prot_len];
if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) == 0) { if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) ==
state = STATE_URL; 0) {
start = proto_start[max_prot_len - prot_len]; state = STATE_URL;
start = proto_start[max_prot_len - prot_len];
c32ncpy(url, proto, prot_len); c32ncpy(url, proto, prot_len);
len = prot_len; len = prot_len;
parenthesis = brackets = ltgts = 0; parenthesis = brackets = ltgts = 0;
break; break;
} }
} }
break; break;
case STATE_URL: { case STATE_URL: {
const char32_t *match = bsearch( const char32_t *match =
&wc, bsearch(&wc, uri_characters, uri_characters_count,
uri_characters, sizeof(uri_characters[0]), &c32cmp_single);
uri_characters_count,
sizeof(uri_characters[0]),
&c32cmp_single);
bool emit_url = false; bool emit_url = false;
if (match == NULL) { if (match == NULL) {
/* /*
* Character is not a valid URI character. Emit * Character is not a valid URI character. Emit
* the URL we've collected so far, *without* * the URL we've collected so far, *without*
* including _this_ character. * including _this_ character.
*/ */
emit_url = true; emit_url = true;
} else { } else {
xassert(*match == wc); xassert(*match == wc);
switch (wc) { switch (wc) {
default: default:
url[len++] = wc; url[len++] = wc;
break; break;
case U'(': case U'(':
parenthesis++; parenthesis++;
url[len++] = wc; url[len++] = wc;
break; break;
case U'[': case U'[':
brackets++; brackets++;
url[len++] = wc; url[len++] = wc;
break; break;
case U'<': case U'<':
ltgts++; ltgts++;
url[len++] = wc; url[len++] = wc;
break; break;
case U')': case U')':
if (--parenthesis < 0) if (--parenthesis < 0)
emit_url = true; emit_url = true;
else else
url[len++] = wc; url[len++] = wc;
break; break;
case U']': case U']':
if (--brackets < 0) if (--brackets < 0)
emit_url = true; emit_url = true;
else else
url[len++] = wc; url[len++] = wc;
break; break;
case U'>': case U'>':
if (--ltgts < 0) if (--ltgts < 0)
emit_url = true; emit_url = true;
else else
url[len++] = wc; url[len++] = wc;
break; break;
} }
} }
if (c >= term->cols - 1 && row->linebreak) { if (c >= term->cols - 1 && row->linebreak) {
/* /*
* Endpoint is inclusive, and we'll be subtracting * Endpoint is inclusive, and we'll be subtracting
* 1 from the column when emitting the URL. * 1 from the column when emitting the URL.
*/ */
c++; c++;
emit_url = true; emit_url = true;
} }
if (emit_url) { if (emit_url) {
struct coord end = {c, r}; struct coord end = {c, r};
if (--end.col < 0) { if (--end.col < 0) {
end.row--; end.row--;
end.col = term->cols - 1; end.col = term->cols - 1;
} }
/* Heuristic to remove trailing characters that /* Heuristic to remove trailing characters that
@ -448,21 +465,28 @@ auto_detected(const struct terminal *term, enum url_action action,
* the end of the URL */ * the end of the URL */
bool done = false; bool done = false;
do { do {
switch (url[len - 1]) { switch (url[len - 1]) {
case U'.': case U',': case U':': case U';': case U'?': case U'.':
case U'!': case U'"': case U'\'': case U'%': case U',':
len--; case U':':
end.col--; case U';':
if (end.col < 0) { case U'?':
end.row--; case U'!':
end.col = term->cols - 1; case U'"':
} case U'\'':
break; case U'%':
len--;
default: end.col--;
done = true; if (end.col < 0) {
break; end.row--;
end.col = term->cols - 1;
} }
break;
default:
done = true;
break;
}
} while (!done); } while (!done);
url[len] = U'\0'; url[len] = U'\0';
@ -472,25 +496,26 @@ auto_detected(const struct terminal *term, enum url_action action,
char *url_utf8 = ac32tombs(url); char *url_utf8 = ac32tombs(url);
if (url_utf8 != NULL) { if (url_utf8 != NULL) {
tll_push_back( tll_push_back(
*urls, *urls,
((struct url){ ((struct url){.id = (uint64_t)rand() << 32 | rand(),
.id = (uint64_t)rand() << 32 | rand(), .url = url_utf8,
.url = url_utf8, .range =
.range = { {
.start = start, .start = start,
.end = end, .end = end,
}, },
.action = action, .action = action,
.osc8 = false})); .osc8 = false}));
} }
state = STATE_PROTOCOL; state = STATE_PROTOCOL;
len = 0; len = 0;
parenthesis = brackets = ltgts = 0; parenthesis = brackets = ltgts = 0;
}
break;
}
} }
break;
}
} }
} }
} }