url-mode: use the first *sub* expression as URL

When auto-matching URLs (or custom regular expressions), use the
first *subexpression* as URL, rather than the whole regex match.

This allows us to write custom regular expressions with prefix/suffix
strings that should not be included in the presented match.
This commit is contained in:
Daniel Eklöf 2025-02-03 13:56:57 +01:00
parent 31f536ff8c
commit a984531ce5
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
4 changed files with 51 additions and 28 deletions

View file

@ -1256,6 +1256,12 @@ parse_section_url(struct context *ctx)
return false; return false;
} }
if (preg.re_nsub == 0) {
LOG_CONTEXTUAL_ERR("invalid regex: no marked subexpression(s)");
regfree(&preg);
return false;
}
regfree(&conf->url.preg); regfree(&conf->url.preg);
free(conf->url.regex); free(conf->url.regex);
@ -1300,6 +1306,12 @@ parse_section_regex(struct context *ctx)
return false; return false;
} }
if (preg.re_nsub == 0) {
LOG_CONTEXTUAL_ERR("invalid regex: no marked subexpression(s)");
regfree(&preg);
return false;
}
if (regex == NULL) { if (regex == NULL) {
tll_push_back(conf->custom_regexes, tll_push_back(conf->custom_regexes,
((struct custom_regex){.name = xstrdup(regex_name)})); ((struct custom_regex){.name = xstrdup(regex_name)}));
@ -3426,33 +3438,37 @@ config_load(struct config *conf, const char *conf_path,
*/ */
const char *url_regex_string = const char *url_regex_string =
"(" "("
"[a-z][[:alnum:]-]+:" // protocol
"(" "("
"/{1,3}|[a-z0-9%]" // slashes (what's the OR part for?) "[a-z][[:alnum:]-]+:" // protocol
"("
"/{1,3}|[a-z0-9%]" // slashes (what's the OR part for?)
")"
"|"
"www[:digit:]{0,3}[.]"
//"|"
//"[a-z0-9.\\-]+[.][a-z]{2,4}/" /* "looks like domain name followed by a slash" - remove? */
")"
"("
"[^[:space:](){}<>]+"
"|"
"\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
"|"
"\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
")+"
"("
"\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
"|"
"\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
"|"
"[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]"
")" ")"
"|"
"www[:digit:]{0,3}[.]"
//"|"
//"[a-z0-9.\\-]+[.][a-z]{2,4}/" /* "looks like domain name followed by a slash" - remove? */
")"
"("
"[^[:space:](){}<>]+"
"|"
"\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
"|"
"\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
")+"
"("
"\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
"|"
"\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
"|"
"[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]"
")" ")"
; ;
int r = regcomp(&conf->url.preg, url_regex_string, REG_EXTENDED); int r = regcomp(&conf->url.preg, url_regex_string, REG_EXTENDED);
xassert(r == 0); xassert(r == 0);
conf->url.regex = xstrdup(url_regex_string); conf->url.regex = xstrdup(url_regex_string);
xassert(conf->url.preg.re_nsub >= 1);
} }
tll_foreach(*initial_user_notifications, it) { tll_foreach(*initial_user_notifications, it) {

View file

@ -786,11 +786,13 @@ section.
Default: _sadfjklewcmpgh_. Default: _sadfjklewcmpgh_.
*regex* *regex*
Regular expression to use when auto-detecting URLs. The format is Regular expression to use when auto-detecting URLs. The format is
"POSIX-Extended Regular Expressions". "POSIX-Extended Regular Expressions". Note that the first marked
subexpression is used as the URL. In other words, if you want the
whole regex match to be used as a URL, surround all of it with
parentheses: *(regex-pattern)*.
Default: _([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’])_ Default: _(([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’]))_
# SECTION: regex # SECTION: regex
@ -817,7 +819,12 @@ regex-copy=[foo] Control+Mod1+Shift+q
*regex* *regex*
Regular expression to use when matching text. The format is Regular expression to use when matching text. The format is
"POSIX-Extended Regular Expressions". Default: _not set_. "POSIX-Extended Regular Expressions". Note that the first marked
subexpression is used as the URL. In other words, if you want the
whole regex match to be used as a URL, surround all of it with
parentheses: *(regex-pattern)*.
Default: _not set_.
# SECTION: cursor # SECTION: cursor

View file

@ -69,7 +69,7 @@
# launch=xdg-open ${url} # launch=xdg-open ${url}
# label-letters=sadfjklewcmpgh # label-letters=sadfjklewcmpgh
# osc8-underline=url-mode # osc8-underline=url-mode
# regex=([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’]) # regex=(([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’]))
# You can define your own regex's, by adding a section called # You can define your own regex's, by adding a section called
# 'regex:<ID>' with a 'regex' and 'launch' key. These can then be tied # 'regex:<ID>' with a 'regex' and 'launch' key. These can then be tied

View file

@ -400,13 +400,13 @@ regex_detected(const struct terminal *term, enum url_action action,
if (r == REG_NOMATCH) if (r == REG_NOMATCH)
break; break;
const size_t mlen = matches[0].rm_eo - matches[0].rm_so; const size_t mlen = matches[1].rm_eo - matches[1].rm_so;
const size_t start = &search_string[matches[0].rm_so] - v->utf8; const size_t start = &search_string[matches[1].rm_so] - v->utf8;
const size_t end = start + mlen; const size_t end = start + mlen;
LOG_DBG( LOG_DBG(
"regex match at row %d: %.*srow/col = %dx%d", "regex match at row %d: %.*srow/col = %dx%d",
matches[0].rm_so, (int)mlen, &search_string[matches[0].rm_so], matches[1].rm_so, (int)mlen, &search_string[matches[1].rm_so],
v->map[start].row, v->map[start].col); v->map[start].row, v->map[start].col);
tll_push_back( tll_push_back(