url-mode: use the first *sub* expression as URL

When auto-matching URLs (or custom regular expressions), use the first *subexpression* as the URL, rather than the whole regex match. This allows us to write custom regular expressions with prefix/suffix strings that should not be included in the presented match.
2026-06-20 14:33:19 -04:00 · 2025-02-03 13:56:57 +01:00 · 2025-02-03 13:56:57 +01:00 · a984531ce5
commit a984531ce5
parent 31f536ff8c
4 changed files with 51 additions and 28 deletions
--- a/config.c
+++ b/config.c
@ -1256,6 +1256,12 @@ parse_section_url(struct context *ctx)
            return false;
        }

+        if (preg.re_nsub == 0) {
+            LOG_CONTEXTUAL_ERR("invalid regex: no marked subexpression(s)");
+            regfree(&preg);
+            return false;
+        }
+
        regfree(&conf->url.preg);
        free(conf->url.regex);

@ -1300,6 +1306,12 @@ parse_section_regex(struct context *ctx)
            return false;
        }

+        if (preg.re_nsub == 0) {
+            LOG_CONTEXTUAL_ERR("invalid regex: no marked subexpression(s)");
+            regfree(&preg);
+            return false;
+        }
+
        if (regex == NULL) {
            tll_push_back(conf->custom_regexes,
                          ((struct custom_regex){.name = xstrdup(regex_name)}));
@ -3426,33 +3438,37 @@ config_load(struct config *conf, const char *conf_path,
         */
        const char *url_regex_string =
            "("
-                "[a-z][[:alnum:]-]+:"       // protocol
                "("
-                    "/{1,3}|[a-z0-9%]"     // slashes (what's the OR part for?)
+                    "[a-z][[:alnum:]-]+:"       // protocol
+                    "("
+                        "/{1,3}|[a-z0-9%]"     // slashes (what's the OR part for?)
+                    ")"
+                    "|"
+                    "www[:digit:]{0,3}[.]"
+                    //"|"
+                    //"[a-z0-9.\\-]+[.][a-z]{2,4}/"  /* "looks like domain name followed by a slash" - remove? */
+                ")"
+                "("
+                    "[^[:space:](){}<>]+"
+                    "|"
+                    "\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
+                    "|"
+                    "\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
+                ")+"
+                "("
+                    "\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
+                    "|"
+                    "\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
+                    "|"
+                    "[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]"
                ")"
-                "|"
-                "www[:digit:]{0,3}[.]"
-                //"|"
-                //"[a-z0-9.\\-]+[.][a-z]{2,4}/"  /* "looks like domain name followed by a slash" - remove? */
-            ")"
-            "("
-                "[^[:space:](){}<>]+"
-                "|"
-                "\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
-                "|"
-                "\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
-            ")+"
-            "("
-                "\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
-                "|"
-                "\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
-                "|"
-                "[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]"
            ")"
        ;
+
        int r = regcomp(&conf->url.preg, url_regex_string, REG_EXTENDED);
        xassert(r == 0);
        conf->url.regex = xstrdup(url_regex_string);
+        xassert(conf->url.preg.re_nsub >= 1);
    }

    tll_foreach(*initial_user_notifications, it) {
--- a/doc/foot.ini.5.scd
+++ b/doc/foot.ini.5.scd
@ -786,11 +786,13 @@ section.
 	Default: _sadfjklewcmpgh_.

 *regex*
-
 	Regular expression to use when auto-detecting URLs. The format is
-	"POSIX-Extended Regular Expressions".
+	"POSIX-Extended Regular Expressions". Note that the first marked
+	subexpression is used as the URL. In other words, if you want the
+	whole regex match to be used as a URL, surround all of it with
+	parentheses: *(regex-pattern)*.
 	
-	Default: _([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’])_
+	Default: _(([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’]))_

 # SECTION: regex

@ -817,7 +819,12 @@ regex-copy=[foo] Control+Mod1+Shift+q

 *regex*
 	Regular expression to use when matching text. The format is
-	"POSIX-Extended Regular Expressions". Default: _not set_.
+	"POSIX-Extended Regular Expressions". Note that the first marked
+	subexpression is used as the URL. In other words, if you want the
+	whole regex match to be used as a URL, surround all of it with
+	parentheses: *(regex-pattern)*.
+	
+	Default: _not set_.


 # SECTION: cursor
--- a/foot.ini
+++ b/foot.ini
@ -69,7 +69,7 @@
 # launch=xdg-open ${url}
 # label-letters=sadfjklewcmpgh
 # osc8-underline=url-mode
-# regex=([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’])
+# regex=(([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’]))

 # You can define your own regex's, by adding a section called
 # 'regex:<ID>' with a 'regex' and 'launch' key. These can then be tied
--- a/url-mode.c
+++ b/url-mode.c
@ -400,13 +400,13 @@ regex_detected(const struct terminal *term, enum url_action action,
            if (r == REG_NOMATCH)
                break;

-            const size_t mlen = matches[0].rm_eo - matches[0].rm_so;
-            const size_t start = &search_string[matches[0].rm_so] - v->utf8;
+            const size_t mlen = matches[1].rm_eo - matches[1].rm_so;
+            const size_t start = &search_string[matches[1].rm_so] - v->utf8;
            const size_t end = start + mlen;

            LOG_DBG(
                "regex match at row %d: %.*srow/col = %dx%d",
-                matches[0].rm_so, (int)mlen, &search_string[matches[0].rm_so],
+                matches[1].rm_so, (int)mlen, &search_string[matches[1].rm_so],
                v->map[start].row, v->map[start].col);

            tll_push_back(