From 859b4c8921f5cde8cfc1f358e865ba417fe68a12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Thu, 30 Jan 2025 09:51:50 +0100 Subject: [PATCH] url-mode: wip: more work on regex matching Remove the old auto-detection and instead use the regex matches. --- url-mode.c | 371 ++++++++++++----------------------------------------- 1 file changed, 80 insertions(+), 291 deletions(-) diff --git a/url-mode.c b/url-mode.c index 0ed79a99..18085ab5 100644 --- a/url-mode.c +++ b/url-mode.c @@ -292,21 +292,31 @@ urls_input(struct seat *seat, struct terminal *term, } } -struct charmap { - const struct row *row; - int col; -}; - struct vline { char *utf8; - size_t len; - size_t sz; - struct charmap *map; + size_t len; /* Length of utf8[] */ + size_t sz; /* utf8[] allocated size */ + struct coord *map; /* Maps utf8[ofs] to grid coordinates */ }; static void regex_detected(const struct terminal *term, enum url_action action, url_list_t *urls) { + /* + * Use regcomp()+regexec() to find patterns. + * + * Since we can't feed regexec() one character at a time, and + * since it doesn't accept wide characters, we need to build utf8 + * strings. + * + * Each string represents a logical line (i.e. handle line-wrap). + * To be able to map regex matches back to the grid, we store the + * grid coordinates of *each* character, in the line struct as + * well. This is offset based; utf8[ofs] has its grid coordinates + * in map[ofs]. 
+ */ + + /* There are *at most* term->rows logical lines */ struct vline vlines[term->rows]; size_t vline_idx = 0; @@ -315,14 +325,15 @@ regex_detected(const struct terminal *term, enum url_action action, url_list_t * mbstate_t ps = {0}; - for (int row_no = 0; row_no < term->rows; row_no++) { - const struct row *row = grid_row_in_view(term->grid, row_no); + for (int r = 0; r < term->rows; r++) { + const struct row *row = grid_row_in_view(term->grid, r); for (int c = 0; c < term->cols; c++) { const struct cell *cell = &row->cells[c]; const char32_t *wc = &cell->wc; size_t wc_count = 1; + /* Expand combining characters */ if (wc[0] >= CELL_COMB_CHARS_LO && wc[0] <= CELL_COMB_CHARS_HI) { const struct composed *composed = composed_lookup(term->composed, wc[0] - CELL_COMB_CHARS_LO); @@ -332,26 +343,26 @@ regex_detected(const struct terminal *term, enum url_action action, url_list_t * wc_count = composed->count; } + /* Convert wide character to utf8 */ for (size_t i = 0; i < wc_count; i++) { char buf[16]; size_t char_len = c32rtomb(buf, wc[i], &ps); + if (char_len == (size_t)-1) continue; - if (vline->len == 0 && char_len == 1 && buf[0] == 0) - continue; - for (size_t j = 0; j < char_len; j++) { - if (vline->len + char_len > vline->sz) { - /* TODO: grow dynamically */ - size_t new_count = (vline->len + char_len) * 2; - vline->utf8 = xreallocarray(vline->utf8, new_count, 1); - vline->map = xreallocarray(vline->map, new_count, sizeof(vline->map[0])); + const size_t requires_size = vline->len + char_len; + + if (requires_size > vline->sz) { + const size_t new_size = requires_size * 2; + vline->utf8 = xreallocarray(vline->utf8, new_size, 1); + vline->map = xreallocarray(vline->map, new_size, sizeof(vline->map[0])); + vline->sz = new_size; } vline->utf8[vline->len + j] = buf[j]; - vline->map[vline->len + j].col = c; - vline->map[vline->len + j].row = row; + vline->map[vline->len + j] = (struct coord){c, term->grid->view + r}; } vline->len += char_len; @@ -371,9 +382,9 @@ 
regex_detected(const struct terminal *term, enum url_action action, url_list_t * // https://gist.github.com/gruber/249502 regex_t preg; - const char *foo = + const char *regex_string = "(" - "[a-z][[:alpha:]-]+:" // protocol + "[a-z][[:alnum:]-]+:" // protocol "(" "/{1,3}|[a-z0-9%]" // slashes (what's the OR part for?) ")" @@ -390,291 +401,70 @@ regex_detected(const struct terminal *term, enum url_action action, url_list_t * "(" "\\(([^[:space:]()<>]+|(\\([^[:space:]()<>]+\\)))*\\)" "|" - // TODO: figure out how to add \\] to the expression below... - "[^[:space:]`!()\\[{};:'\".,<>?«»“”‘’]" + "[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]" ")" ; - LOG_ERR("foo=%s", foo); - - int r = regcomp(&preg, foo, REG_EXTENDED); + int r = regcomp(&preg, regex_string, REG_EXTENDED); if (r != 0) { char err_buf[1024]; regerror(r, &preg, err_buf, sizeof(err_buf)); - LOG_ERR("regcomp: %s", err_buf); - } else { - size_t i = 0; - while (true) { - const struct vline *v = &vlines[i++]; - if (v->utf8 == NULL) - break; + LOG_ERR("failed to compile regular expression: %s", err_buf); - - regmatch_t matches[preg.re_nsub + 1]; - r = regexec(&preg, v->utf8, preg.re_nsub + 1, matches, 0); - - if (r == REG_NOMATCH) - continue; - - size_t mlen = matches[0].rm_eo - matches[0].rm_so; - LOG_WARN("MATCH at %d: %.*s (%zu)", matches[0].rm_so, (int)mlen, &v->utf8[matches[0].rm_so], mlen); + for (size_t i = 0; i < ALEN(vlines); i++) { + const struct vline *v = &vlines[i]; + free(v->utf8); + free(v->map); } - regfree(&preg); + + return; } - size_t i = 0; - while (true) { - const struct vline *v = &vlines[i++]; + for (size_t i = 0; i < ALEN(vlines); i++) { + const struct vline *v = &vlines[i]; if (v->utf8 == NULL) - break; + continue;; + + const char *search_string = v->utf8; + while (true) { + + regmatch_t matches[preg.re_nsub + 1]; + r = regexec(&preg, search_string, preg.re_nsub + 1, matches, 0); + + if (r == REG_NOMATCH) + break; + + const size_t mlen = matches[0].rm_eo - matches[0].rm_so; + const size_t 
start = &search_string[matches[0].rm_so] - v->utf8; + const size_t end = start + mlen; + + LOG_DBG( + "MATCH at %d: %.*s (%zu) row/col = %dx%d", + matches[0].rm_so, (int)mlen, &search_string[matches[0].rm_so], + mlen, v->map[start].row, v->map[start].col); + + tll_push_back( + *urls, + ((struct url){ + .id = (uint64_t)rand() << 32 | rand(), + .url = xstrndup(&v->utf8[start], mlen), + .range = { + .start = v->map[start], + .end = v->map[end - 1], /* Inclusive */ + }, + .action = action, + .osc8 = false})); + + search_string += matches[0].rm_eo; + } - LOG_WARN("%.*s", (int)v->len, v->utf8); free(v->utf8); free(v->map); } + + regfree(&preg); } - -static int -c32cmp_single(const void *_a, const void *_b) -{ - const char32_t *a = _a; - const char32_t *b = _b; - return *a - *b; -} - -static void -auto_detected(const struct terminal *term, enum url_action action, - url_list_t *urls) -{ - const struct config *conf = term->conf; - - const char32_t *uri_characters = conf->url.uri_characters; - if (uri_characters == NULL) - return; - - const size_t uri_characters_count = c32len(uri_characters); - if (uri_characters_count == 0) - return; - - size_t max_prot_len = conf->url.max_prot_len; - char32_t proto_chars[max_prot_len]; - struct coord proto_start[max_prot_len]; - size_t proto_char_count = 0; - - enum { - STATE_PROTOCOL, - STATE_URL, - } state = STATE_PROTOCOL; - - struct coord start = {-1, -1}; - char32_t url[term->cols * term->rows + 1]; - size_t len = 0; - - ssize_t parenthesis = 0; - ssize_t brackets = 0; - ssize_t ltgts = 0; - - for (int r = 0; r < term->rows; r++) { - const struct row *row = grid_row_in_view(term->grid, r); - - for (int c = 0; c < term->cols; c++) { - const struct cell *cell = &row->cells[c]; - - if (cell->wc >= CELL_SPACER) - continue; - - const char32_t *wcs = NULL; - size_t wc_count = 0; - - if (cell->wc >= CELL_COMB_CHARS_LO && cell->wc <= CELL_COMB_CHARS_HI) { - struct composed *composed = - composed_lookup(term->composed, cell->wc - 
CELL_COMB_CHARS_LO); - wcs = composed->chars; - wc_count = composed->count; - } else { - wcs = &cell->wc; - wc_count = 1; - } - - for (size_t w_idx = 0; w_idx < wc_count; w_idx++) { - char32_t wc = wcs[w_idx]; - - switch (state) { - case STATE_PROTOCOL: - for (size_t i = 0; i < max_prot_len - 1; i++) { - proto_chars[i] = proto_chars[i + 1]; - proto_start[i] = proto_start[i + 1]; - } - - if (proto_char_count >= max_prot_len) - proto_char_count = max_prot_len - 1; - - proto_chars[max_prot_len - 1] = wc; - proto_start[max_prot_len - 1] = (struct coord){c, r}; - proto_char_count++; - - for (size_t i = 0; i < conf->url.prot_count; i++) { - size_t prot_len = c32len(conf->url.protocols[i]); - - if (proto_char_count < prot_len) - continue; - - const char32_t *proto = - &proto_chars[max_prot_len - prot_len]; - - if (c32ncasecmp(conf->url.protocols[i], proto, prot_len) == - 0) { - state = STATE_URL; - start = proto_start[max_prot_len - prot_len]; - - c32ncpy(url, proto, prot_len); - len = prot_len; - - parenthesis = brackets = ltgts = 0; - break; - } - } - break; - - case STATE_URL: { - const char32_t *match = - bsearch(&wc, uri_characters, uri_characters_count, - sizeof(uri_characters[0]), &c32cmp_single); - - bool emit_url = false; - - if (match == NULL) { - /* - * Character is not a valid URI character. Emit - * the URL we've collected so far, *without* - * including _this_ character. 
- */ - emit_url = true; - } else { - xassert(*match == wc); - - switch (wc) { - default: - url[len++] = wc; - break; - - case U'(': - parenthesis++; - url[len++] = wc; - break; - - case U'[': - brackets++; - url[len++] = wc; - break; - - case U'<': - ltgts++; - url[len++] = wc; - break; - - case U')': - if (--parenthesis < 0) - emit_url = true; - else - url[len++] = wc; - break; - - case U']': - if (--brackets < 0) - emit_url = true; - else - url[len++] = wc; - break; - - case U'>': - if (--ltgts < 0) - emit_url = true; - else - url[len++] = wc; - break; - } - } - - if (c >= term->cols - 1 && row->linebreak) { - /* - * Endpoint is inclusive, and we'll be subtracting - * 1 from the column when emitting the URL. - */ - c++; - emit_url = true; - } - - if (emit_url) { - struct coord end = {c, r}; - - if (--end.col < 0) { - end.row--; - end.col = term->cols - 1; - } - - /* Heuristic to remove trailing characters that - * are valid URL characters, but typically not at - * the end of the URL */ - bool done = false; - do { - switch (url[len - 1]) { - case U'.': - case U',': - case U':': - case U';': - case U'?': - case U'!': - case U'"': - case U'\'': - case U'%': - len--; - end.col--; - if (end.col < 0) { - end.row--; - end.col = term->cols - 1; - } - break; - - default: - done = true; - break; - } - } while (!done); - - url[len] = U'\0'; - - start.row += term->grid->view; - end.row += term->grid->view; - - char *url_utf8 = ac32tombs(url); - if (url_utf8 != NULL) { - tll_push_back( - *urls, - ((struct url){.id = (uint64_t)rand() << 32 | rand(), - .url = url_utf8, - .range = - { - .start = start, - .end = end, - }, - .action = action, - .osc8 = false})); - } - - state = STATE_PROTOCOL; - len = 0; - parenthesis = brackets = ltgts = 0; - } - break; - } - } - } - } - } -} - static void osc8_uris(const struct terminal *term, enum url_action action, url_list_t *urls) { @@ -777,7 +567,6 @@ urls_collect(const struct terminal *term, enum url_action action, url_list_t *ur { 
xassert(tll_length(term->urls) == 0); osc8_uris(term, action, urls); - auto_detected(term, action, urls); regex_detected(term, action, urls); remove_overlapping(urls, term->grid->num_cols); }