url-mode: handle wide chars and grapheme clusters when auto-detecting URLs

* Skip spacer cells. This fixes an issue where characters following a double-width character weren't detect properly. * Unpack grapheme clusters (i.e. cells with multiple codepoints), and iterate all their codepoints. Closes #1465
2026-04-01 07:15:32 -04:00 · 2023-08-21 16:26:18 +02:00 · 2023-08-21 16:26:18 +02:00 · 4f3f614457
commit 4f3f614457
parent 482a032d1a
2 changed files with 118 additions and 90 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -76,9 +76,12 @@
  ([#1436][1436]).
 * One frame being rendered at the wrong scale after being hidden by
  another opaque, maximized window ([#1464][1464]).
 * Double-width characters, and grapheme clusters breaking URL
  auto-detection ([#1465][1465]).
 [1436]: https://codeberg.org/dnkl/foot/issues/1436
 [1464]: https://codeberg.org/dnkl/foot/issues/1464
 [1465]: https://codeberg.org/dnkl/foot/issues/1465
 ### Security
--- a/url-mode.c
+++ b/url-mode.c
@ -326,121 +326,138 @@ auto_detected(const struct terminal *term, enum url_action action,
        for (int c = 0; c < term->cols; c++) {
            const struct cell *cell = &row->cells[c];
            char32_t wc = cell->wc;
-            switch (state) {
+            if (cell->wc >= CELL_SPACER)
-            case STATE_PROTOCOL:
+                continue;
-                for (size_t i = 0; i < max_prot_len - 1; i++) {
+
            const char32_t *wcs = NULL;
            size_t wc_count = 0;
            if (cell->wc >= CELL_COMB_CHARS_LO && cell->wc <= CELL_COMB_CHARS_HI) {
                struct composed *composed =
                    composed_lookup(term->composed, cell->wc - CELL_COMB_CHARS_LO);
                wcs = composed->chars;
                wc_count = composed->count;
            } else {
                wcs = &cell->wc;
                wc_count = 1;
            }
            for (size_t w_idx = 0; w_idx < wc_count; w_idx++) {
                char32_t wc = wcs[w_idx];
                switch (state) {
                case STATE_PROTOCOL:
                  for (size_t i = 0; i < max_prot_len - 1; i++) {
                    proto_chars[i] = proto_chars[i + 1];
                    proto_start[i] = proto_start[i + 1];
-                }
+                  }
-                if (proto_char_count >= max_prot_len)
+                  if (proto_char_count >= max_prot_len)
                    proto_char_count = max_prot_len - 1;
-                proto_chars[max_prot_len - 1] = wc;
+                  proto_chars[max_prot_len - 1] = wc;
-                proto_start[max_prot_len - 1] = (struct coord){c, r};
+                  proto_start[max_prot_len - 1] = (struct coord){c, r};
-                proto_char_count++;
+                  proto_char_count++;
-                for (size_t i = 0; i < conf->url.prot_count; i++) {
+                  for (size_t i = 0; i < conf->url.prot_count; i++) {
                    size_t prot_len = c32len(conf->url.protocols[i]);
                    if (proto_char_count < prot_len)
-                        continue;
+                      continue;
-                    const char32_t *proto = &proto_chars[max_prot_len - prot_len];
+                    const char32_t *proto =
                        &proto_chars[max_prot_len - prot_len];
-                    if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) == 0) {
+                    if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) ==
-                        state = STATE_URL;
+                        0) {
-                        start = proto_start[max_prot_len - prot_len];
+                      state = STATE_URL;
                      start = proto_start[max_prot_len - prot_len];
-                        c32ncpy(url, proto, prot_len);
+                      c32ncpy(url, proto, prot_len);
-                        len = prot_len;
+                      len = prot_len;
-                        parenthesis = brackets = ltgts = 0;
+                      parenthesis = brackets = ltgts = 0;
-                        break;
+                      break;
                    }
-                }
+                  }
-                break;
+                  break;
-            case STATE_URL: {
+                case STATE_URL: {
-                const char32_t *match = bsearch(
+                  const char32_t *match =
-                    &wc,
+                      bsearch(&wc, uri_characters, uri_characters_count,
-                    uri_characters,
+                              sizeof(uri_characters[0]), &c32cmp_single);
                    uri_characters_count,
                    sizeof(uri_characters[0]),
                    &c32cmp_single);
-                bool emit_url = false;
+                  bool emit_url = false;
-                if (match == NULL) {
+                  if (match == NULL) {
                    /*
                     * Character is not a valid URI character. Emit
                     * the URL we’ve collected so far, *without*
                     * including _this_ character.
                     */
                    emit_url = true;
-                } else {
+                  } else {
                    xassert(*match == wc);
                    switch (wc) {
                    default:
-                        url[len++] = wc;
+                      url[len++] = wc;
-                        break;
+                      break;
                    case U'(':
-                        parenthesis++;
+                      parenthesis++;
-                        url[len++] = wc;
+                      url[len++] = wc;
-                        break;
+                      break;
                    case U'[':
-                        brackets++;
+                      brackets++;
-                        url[len++] = wc;
+                      url[len++] = wc;
-                        break;
+                      break;
                    case U'<':
-                        ltgts++;
+                      ltgts++;
-                        url[len++] = wc;
+                      url[len++] = wc;
-                        break;
+                      break;
                    case U')':
-                        if (--parenthesis < 0)
+                      if (--parenthesis < 0)
-                            emit_url = true;
+                        emit_url = true;
-                        else
+                      else
-                            url[len++] = wc;
+                        url[len++] = wc;
-                        break;
+                      break;
                    case U']':
-                        if (--brackets < 0)
+                      if (--brackets < 0)
-                            emit_url = true;
+                        emit_url = true;
-                        else
+                      else
-                            url[len++] = wc;
+                        url[len++] = wc;
-                        break;
+                      break;
                    case U'>':
-                        if (--ltgts < 0)
+                      if (--ltgts < 0)
-                            emit_url = true;
+                        emit_url = true;
-                        else
+                      else
-                            url[len++] = wc;
+                        url[len++] = wc;
-                        break;
+                      break;
                    }
-                }
+                  }
-                if (c >= term->cols - 1 && row->linebreak) {
+                  if (c >= term->cols - 1 && row->linebreak) {
                    /*
                     * Endpoint is inclusive, and we’ll be subtracting
                     * 1 from the column when emitting the URL.
                     */
                    c++;
                    emit_url = true;
-                }
+                  }
-                if (emit_url) {
+                  if (emit_url) {
                    struct coord end = {c, r};
                    if (--end.col < 0) {
-                        end.row--;
+                      end.row--;
-                        end.col = term->cols - 1;
+                      end.col = term->cols - 1;
                    }
                    /* Heuristic to remove trailing characters that
@ -448,21 +465,28 @@ auto_detected(const struct terminal *term, enum url_action action,
                     * the end of the URL */
                    bool done = false;
                    do {
-                        switch (url[len - 1]) {
+                      switch (url[len - 1]) {
-                        case U'.': case U',': case U':': case U';': case U'?':
+                      case U'.':
-                        case U'!': case U'"': case U'\'': case U'%':
+                      case U',':
-                            len--;
+                      case U':':
-                            end.col--;
+                      case U';':
-                            if (end.col < 0) {
+                      case U'?':
-                                end.row--;
+                      case U'!':
-                                end.col = term->cols - 1;
+                      case U'"':
-                            }
+                      case U'\'':
-                            break;
+                      case U'%':
-
+                        len--;
-                        default:
+                        end.col--;
-                            done = true;
+                        if (end.col < 0) {
-                            break;
+                          end.row--;
                          end.col = term->cols - 1;
                        }
                        break;
                      default:
                        done = true;
                        break;
                      }
                    } while (!done);
                    url[len] = U'\0';
@ -472,25 +496,26 @@ auto_detected(const struct terminal *term, enum url_action action,
                    char *url_utf8 = ac32tombs(url);
                    if (url_utf8 != NULL) {
-                        tll_push_back(
+                      tll_push_back(
-                            *urls,
+                          *urls,
-                            ((struct url){
+                          ((struct url){.id = (uint64_t)rand() << 32 | rand(),
-                                .id = (uint64_t)rand() << 32 | rand(),
+                                        .url = url_utf8,
-                                .url = url_utf8,
+                                        .range =
-                                .range = {
+                                            {
-                                    .start = start,
+                                                .start = start,
-                                    .end = end,
+                                                .end = end,
-                                },
+                                            },
-                                .action = action,
+                                        .action = action,
-                                .osc8 = false}));
+                                        .osc8 = false}));
                    }
                    state = STATE_PROTOCOL;
                    len = 0;
                    parenthesis = brackets = ltgts = 0;
                  }
                  break;
                }
                }
                break;
            }
            }
        }
    }