From 859b4c8921f5cde8cfc1f358e865ba417fe68a12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= <daniel@ekloef.se>
Date: Thu, 30 Jan 2025 09:51:50 +0100
Subject: [PATCH] url-mode: wip: more work on regex matching

Remove the old auto-detection and instead use the regex matches.
---
 url-mode.c | 371 ++++++++++++-----------------------------------------
 1 file changed, 80 insertions(+), 291 deletions(-)

diff --git a/url-mode.c b/url-mode.c
index 0ed79a99..18085ab5 100644
--- a/url-mode.c
+++ b/url-mode.c
@@ -292,21 +292,31 @@ urls_input(struct seat *seat, struct terminal *term,
     }
 }
 
-struct charmap {
-    const struct row *row;
-    int col;
-};
-
 struct vline {
     char *utf8;
-    size_t len;
-    size_t sz;
-    struct charmap *map;
+    size_t len;          /* Length of utf8[] */
+    size_t sz;           /* utf8[] allocated size */
+    struct coord *map;   /* Maps utf8[ofs] to grid coordinates */
 };
 
 static void
 regex_detected(const struct terminal *term, enum url_action action, url_list_t *urls)
 {
+    /*
+     * Use regcomp()+regexec() to find patterns.
+     *
+     * Since we can't feed regexec() one character at a time, and
+     * since it doesn't accept wide characters, we need to build utf8
+     * strings.
+     *
+     * Each string represents a logical line (i.e. handle line-wrap).
+     * To be able to map regex matches back to the grid, we store the
+     * grid coordinates of *each* character, in the line struct as
+     * well. This is offset based; utf8[ofs] has its grid coordinates
+     * in map[ofs.
+     */
+
+    /* There is *at most* term->rows logical lines */
     struct vline vlines[term->rows];
     size_t vline_idx = 0;
 
@@ -315,14 +325,15 @@ regex_detected(const struct terminal *term, enum url_action action, url_list_t *
 
     mbstate_t ps = {0};
 
-    for (int row_no = 0; row_no < term->rows; row_no++) {
-        const struct row *row = grid_row_in_view(term->grid, row_no);
+    for (int r = 0; r < term->rows; r++) {
+        const struct row *row = grid_row_in_view(term->grid, r);
 
         for (int c = 0; c < term->cols; c++) {
             const struct cell *cell = &row->cells[c];
             const char32_t *wc = &cell->wc;
             size_t wc_count = 1;
 
+            /* Expand combining characters */
             if (wc[0] >= CELL_COMB_CHARS_LO && wc[0] <= CELL_COMB_CHARS_HI) {
                 const struct composed *composed =
                     composed_lookup(term->composed, wc[0] - CELL_COMB_CHARS_LO);
@@ -332,26 +343,26 @@ regex_detected(const struct terminal *term, enum url_action action, url_list_t *
                 wc_count = composed->count;
             }
 
+            /* Convert wide character to utf8 */
             for (size_t i = 0; i < wc_count; i++) {
                 char buf[16];
                 size_t char_len = c32rtomb(buf, wc[i], &ps);
+
                 if (char_len == (size_t)-1)
                     continue;
 
-                if (vline->len == 0 && char_len == 1 && buf[0] == 0)
-                    continue;
-
                 for (size_t j = 0; j < char_len; j++) {
-                    if (vline->len + char_len > vline->sz) {
-                        /* TODO: grow dynamically */
-                        size_t new_count = (vline->len + char_len) * 2;
-                        vline->utf8 = xreallocarray(vline->utf8, new_count, 1);
-                        vline->map = xreallocarray(vline->map, new_count, sizeof(vline->map[0]));
+                    const size_t requires_size = vline->len + char_len;
+
+                    if (requires_size > vline->sz) {
+                        const size_t new_size = requires_size * 2;
+                        vline->utf8 = xreallocarray(vline->utf8, new_size, 1);
+                        vline->map = xreallocarray(vline->map, new_size, sizeof(vline->map[0]));
+                        vline->sz = new_size;
                     }
 
                     vline->utf8[vline->len + j] = buf[j];
-                    vline->map[vline->len + j].col = c;
-                    vline->map[vline->len + j].row = row;
+                    vline->map[vline->len + j] = (struct coord){c, term->grid->view + r};
                 }
 
                 vline->len += char_len;
@@ -371,9 +382,9 @@ regex_detected(const struct terminal *term, enum url_action action, url_list_t *
 
     // https://gist.github.com/gruber/249502
     regex_t preg;
-    const char *foo =
+    const char *regex_string =
         "("
-            "[a-z][[:alpha:]-]+:"       // protocol
+            "[a-z][[:alnum:]-]+:"       // protocol
             "("
                 "/{1,3}|[a-z0-9%]"     // slashes (what's the OR part for?)
             ")"
@@ -390,291 +401,70 @@ regex_detected(const struct terminal *term, enum url_action action, url_list_t *
         "("
             "\\(([^[:space:]()<>]+|(\\([^[:space:]()<>]+\\)))*\\)"
             "|"
-        // TODO: figure out how to add \\] to the expression below...
-            "[^[:space:]`!()\\[{};:'\".,<>?«»“”‘’]"
+            "[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]"
         ")"
     ;
 
-    LOG_ERR("foo=%s", foo);
-
-    int r = regcomp(&preg, foo, REG_EXTENDED);
+    int r = regcomp(&preg, regex_string, REG_EXTENDED);
 
     if (r != 0) {
         char err_buf[1024];
         regerror(r, &preg, err_buf, sizeof(err_buf));
-        LOG_ERR("regcomp: %s", err_buf);
-    } else {
-        size_t i = 0;
-        while (true) {
-            const struct vline *v = &vlines[i++];
-            if (v->utf8 == NULL)
-                break;
+        LOG_ERR("failed to compile regular expression: %s", err_buf);
 
-
-            regmatch_t matches[preg.re_nsub + 1];
-            r = regexec(&preg, v->utf8, preg.re_nsub + 1, matches, 0);
-
-            if (r == REG_NOMATCH)
-                continue;
-
-            size_t mlen = matches[0].rm_eo - matches[0].rm_so;
-            LOG_WARN("MATCH at %d: %.*s (%zu)", matches[0].rm_so, (int)mlen, &v->utf8[matches[0].rm_so], mlen);
+        for (size_t i = 0; i < ALEN(vlines); i++) {
+            const struct vline *v = &vlines[i];
+            free(v->utf8);
+            free(v->map);
         }
-        regfree(&preg);
+
+        return;
     }
 
-    size_t i = 0;
-    while (true) {
-        const struct vline *v = &vlines[i++];
+    for (size_t i = 0; i < ALEN(vlines); i++) {
+        const struct vline *v = &vlines[i];
         if (v->utf8 == NULL)
-            break;
+            continue;;
+
+        const char *search_string = v->utf8;
+        while (true) {
+
+            regmatch_t matches[preg.re_nsub + 1];
+            r = regexec(&preg, search_string, preg.re_nsub + 1, matches, 0);
+
+            if (r == REG_NOMATCH)
+                break;
+
+            const size_t mlen = matches[0].rm_eo - matches[0].rm_so;
+            const size_t start = &search_string[matches[0].rm_so] - v->utf8;
+            const size_t end = start + mlen;
+
+            LOG_DBG(
+                "MATCH at %d: %.*s (%zu) row/col = %dx%d",
+                matches[0].rm_so, (int)mlen, &search_string[matches[0].rm_so],
+                mlen, v->map[start].row, v->map[start].col);
+
+            tll_push_back(
+                *urls,
+                ((struct url){
+                    .id = (uint64_t)rand() << 32 | rand(),
+                    .url = xstrndup(&v->utf8[start], mlen),
+                    .range = {
+                        .start = v->map[start],
+                        .end = v->map[end - 1], /* Inclusive */
+                    },
+                    .action = action,
+                    .osc8 = false}));
+
+            search_string += matches[0].rm_eo;
+        }
 
-        LOG_WARN("%.*s", (int)v->len, v->utf8);
         free(v->utf8);
         free(v->map);
     }
+
+    regfree(&preg);
 }
-
-static int
-c32cmp_single(const void *_a, const void *_b)
-{
-    const char32_t *a = _a;
-    const char32_t *b = _b;
-    return *a - *b;
-}
-
-static void
-auto_detected(const struct terminal *term, enum url_action action,
-              url_list_t *urls)
-{
-    const struct config *conf = term->conf;
-
-    const char32_t *uri_characters = conf->url.uri_characters;
-    if (uri_characters == NULL)
-        return;
-
-    const size_t uri_characters_count = c32len(uri_characters);
-    if (uri_characters_count == 0)
-        return;
-
-    size_t max_prot_len = conf->url.max_prot_len;
-    char32_t proto_chars[max_prot_len];
-    struct coord proto_start[max_prot_len];
-    size_t proto_char_count = 0;
-
-    enum {
-        STATE_PROTOCOL,
-        STATE_URL,
-    } state = STATE_PROTOCOL;
-
-    struct coord start = {-1, -1};
-    char32_t url[term->cols * term->rows + 1];
-    size_t len = 0;
-
-    ssize_t parenthesis = 0;
-    ssize_t brackets = 0;
-    ssize_t ltgts = 0;
-
-    for (int r = 0; r < term->rows; r++) {
-        const struct row *row = grid_row_in_view(term->grid, r);
-
-        for (int c = 0; c < term->cols; c++) {
-            const struct cell *cell = &row->cells[c];
-
-            if (cell->wc >= CELL_SPACER)
-                continue;
-
-            const char32_t *wcs = NULL;
-            size_t wc_count = 0;
-
-            if (cell->wc >= CELL_COMB_CHARS_LO && cell->wc <= CELL_COMB_CHARS_HI) {
-                struct composed *composed =
-                    composed_lookup(term->composed, cell->wc - CELL_COMB_CHARS_LO);
-                wcs = composed->chars;
-                wc_count = composed->count;
-            } else {
-                wcs = &cell->wc;
-                wc_count = 1;
-            }
-
-            for (size_t w_idx = 0; w_idx < wc_count; w_idx++) {
-                char32_t wc = wcs[w_idx];
-
-                switch (state) {
-                case STATE_PROTOCOL:
-                  for (size_t i = 0; i < max_prot_len - 1; i++) {
-                    proto_chars[i] = proto_chars[i + 1];
-                    proto_start[i] = proto_start[i + 1];
-                  }
-
-                  if (proto_char_count >= max_prot_len)
-                    proto_char_count = max_prot_len - 1;
-
-                  proto_chars[max_prot_len - 1] = wc;
-                  proto_start[max_prot_len - 1] = (struct coord){c, r};
-                  proto_char_count++;
-
-                  for (size_t i = 0; i < conf->url.prot_count; i++) {
-                    size_t prot_len = c32len(conf->url.protocols[i]);
-
-                    if (proto_char_count < prot_len)
-                      continue;
-
-                    const char32_t *proto =
-                        &proto_chars[max_prot_len - prot_len];
-
-                    if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) ==
-                        0) {
-                      state = STATE_URL;
-                      start = proto_start[max_prot_len - prot_len];
-
-                      c32ncpy(url, proto, prot_len);
-                      len = prot_len;
-
-                      parenthesis = brackets = ltgts = 0;
-                      break;
-                    }
-                  }
-                  break;
-
-                case STATE_URL: {
-                  const char32_t *match =
-                      bsearch(&wc, uri_characters, uri_characters_count,
-                              sizeof(uri_characters[0]), &c32cmp_single);
-
-                  bool emit_url = false;
-
-                  if (match == NULL) {
-                    /*
-                     * Character is not a valid URI character. Emit
-                     * the URL we've collected so far, *without*
-                     * including _this_ character.
-                     */
-                    emit_url = true;
-                  } else {
-                    xassert(*match == wc);
-
-                    switch (wc) {
-                    default:
-                      url[len++] = wc;
-                      break;
-
-                    case U'(':
-                      parenthesis++;
-                      url[len++] = wc;
-                      break;
-
-                    case U'[':
-                      brackets++;
-                      url[len++] = wc;
-                      break;
-
-                    case U'<':
-                      ltgts++;
-                      url[len++] = wc;
-                      break;
-
-                    case U')':
-                      if (--parenthesis < 0)
-                        emit_url = true;
-                      else
-                        url[len++] = wc;
-                      break;
-
-                    case U']':
-                      if (--brackets < 0)
-                        emit_url = true;
-                      else
-                        url[len++] = wc;
-                      break;
-
-                    case U'>':
-                      if (--ltgts < 0)
-                        emit_url = true;
-                      else
-                        url[len++] = wc;
-                      break;
-                    }
-                  }
-
-                  if (c >= term->cols - 1 && row->linebreak) {
-                    /*
-                     * Endpoint is inclusive, and we'll be subtracting
-                     * 1 from the column when emitting the URL.
-                     */
-                    c++;
-                    emit_url = true;
-                  }
-
-                  if (emit_url) {
-                    struct coord end = {c, r};
-
-                    if (--end.col < 0) {
-                      end.row--;
-                      end.col = term->cols - 1;
-                    }
-
-                    /* Heuristic to remove trailing characters that
-                     * are valid URL characters, but typically not at
-                     * the end of the URL */
-                    bool done = false;
-                    do {
-                      switch (url[len - 1]) {
-                      case U'.':
-                      case U',':
-                      case U':':
-                      case U';':
-                      case U'?':
-                      case U'!':
-                      case U'"':
-                      case U'\'':
-                      case U'%':
-                        len--;
-                        end.col--;
-                        if (end.col < 0) {
-                          end.row--;
-                          end.col = term->cols - 1;
-                        }
-                        break;
-
-                      default:
-                        done = true;
-                        break;
-                      }
-                    } while (!done);
-
-                    url[len] = U'\0';
-
-                    start.row += term->grid->view;
-                    end.row += term->grid->view;
-
-                    char *url_utf8 = ac32tombs(url);
-                    if (url_utf8 != NULL) {
-                      tll_push_back(
-                          *urls,
-                          ((struct url){.id = (uint64_t)rand() << 32 | rand(),
-                                        .url = url_utf8,
-                                        .range =
-                                            {
-                                                .start = start,
-                                                .end = end,
-                                            },
-                                        .action = action,
-                                        .osc8 = false}));
-                    }
-
-                    state = STATE_PROTOCOL;
-                    len = 0;
-                    parenthesis = brackets = ltgts = 0;
-                  }
-                  break;
-                }
-                }
-            }
-        }
-    }
-}
-
 static void
 osc8_uris(const struct terminal *term, enum url_action action, url_list_t *urls)
 {
@@ -777,7 +567,6 @@ urls_collect(const struct terminal *term, enum url_action action, url_list_t *ur
 {
     xassert(tll_length(term->urls) == 0);
     osc8_uris(term, action, urls);
-    auto_detected(term, action, urls);
     regex_detected(term, action, urls);
     remove_overlapping(urls, term->grid->num_cols);
 }