From 1760cb6ab82b355a0751f614f3b45c7446e23e95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Wed, 2 Apr 2025 08:41:46 +0200 Subject: [PATCH] config: update default URL regex The old one is in some cases too liberal. The new one is stricter in two ways: 1. The protocol list is now explicit, rather than matching anything:// 2. Allowed characters are now limited to the "safe character set", the "reserved character set", and some from the "unsafe character set" Furthermore, some of the characters are restricted in how/when they are allowed: 1. Periods, commas, question marks etc. are allowed inside a URL, but not at the end. 2. [ ], ( ), " " and ' ' are allowed but only when balanced. This allows us to match e.g. [http://foo.bar/foo[bar]] correctly. Closes #2016 --- CHANGELOG.md | 5 ++++ config.c | 69 +++++++++++++++++++++++++--------------------- doc/foot.ini.5.scd | 2 +- foot.ini | 2 +- 4 files changed, 45 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc297e6c..3e421014 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -79,9 +79,14 @@ `Mod4` etc) in key bindings are now recognized as being virtual, and are automatically mapped to the corresponding real modifier. This means you can use e.g. `Alt+b` instead of `Mod1+b`. +* Default URL regex changed to a much more strict variant + ([#2016][2016]). You can manually set the [old + one](https://codeberg.org/dnkl/foot/src/tag/1.21.0/foot.ini#L72), if + you prefer it over the new regex. 
[2006]: https://codeberg.org/dnkl/foot/issues/2006 [2009]: https://codeberg.org/dnkl/foot/issues/2009 +[2016]: https://codeberg.org/dnkl/foot/issues/2016 ### Deprecated diff --git a/config.c b/config.c index a8cdb34a..bfd3ffed 100644 --- a/config.c +++ b/config.c @@ -3446,39 +3446,46 @@ config_load(struct config *conf, const char *conf_path, tokenize_cmdline("xdg-open ${url}", &conf->url.launch.argv.args); { - /* - * Based on https://gist.github.com/gruber/249502, but modified: - * - Do not allow {} at all - * - Do allow matched [] - */ - const char *url_regex_string = + const char *url_regex_string = + "(" "(" - "(" - "[a-z][[:alnum:]-]+:" // protocol - "(" - "/{1,3}|[a-z0-9%]" // slashes (what's the OR part for?) - ")" - "|" - "www[:digit:]{0,3}[.]" - //"|" - //"[a-z0-9.\\-]+[.][a-z]{2,4}/" /* "looks like domain name followed by a slash" - remove? */ - ")" - "(" - "[^[:space:](){}<>]+" - "|" - "\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)" - "|" - "\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]" - ")+" - "(" - "\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)" - "|" - "\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]" - "|" - "[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]" - ")" + "(https?://|mailto:|ftp://|file:|ssh:|ssh://|git://|tel:|magnet:|ipfs://|ipns://|gemini://|gopher://|news:)" + "|" + "www\\." ")" - ; + "(" + /* Safe + reserved + some unsafe characters parenthesis and double quotes omitted (we only allow them when balanced) */ + "[0-9a-zA-Z:/?#@!$&*+,;=.~_%^\\-]+" + "|" + /* Balanced "(...)". Content is same as above, plus all _other_ characters we require to be balanced */ + "\\([]\\[\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\)" + "|" + /* Balanced "[...]". Content is same as above, plus all _other_ characters we require to be balanced */ + "\\[[\\(\\)\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\]" + "|" + /* Balanced '"..."'. 
Content is same as above, plus all _other_ characters we require to be balanced */ + "\"[]\\[\\(\\)0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\"" + "|" + /* Balanced "'...'". Content is same as above, plus all _other_ characters we require to be balanced */ + "'[]\\[\\(\\)0-9a-zA-Z:/?#@!$&*+,;=.~_%^\\-]*'" + ")+" + "(" + /* Same as above, except :?!,;. are excluded */ + "[0-9a-zA-Z/#@$&*+=~_%^\\-]" + "|" + /* Balanced "(...)". Content is same as above, plus all _other_ characters we require to be balanced */ + "\\([]\\[\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\)" + "|" + /* Balanced "[...]". Content is same as above, plus all _other_ characters we require to be balanced */ + "\\[[\\(\\)\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\]" + "|" + /* Balanced '"..."'. Content is same as above, plus all _other_ characters we require to be balanced */ + "\"[]\\[\\(\\)0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\"" + "|" + /* Balanced "'...'". Content is same as above, plus all _other_ characters we require to be balanced */ + "'[]\\[\\(\\)0-9a-zA-Z:/?#@!$&*+,;=.~_%^\\-]*'" + ")" + ")"; int r = regcomp(&conf->url.preg, url_regex_string, REG_EXTENDED); xassert(r == 0); diff --git a/doc/foot.ini.5.scd b/doc/foot.ini.5.scd index 043600d2..c32a8e06 100644 --- a/doc/foot.ini.5.scd +++ b/doc/foot.ini.5.scd @@ -828,7 +828,7 @@ section. whole regex match to be used as an URL, surround all of it with parenthesis: *(regex-pattern)*. 
- Default: _(([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))\*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))\*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’]))_ + Default: _(((https?://|mailto:|ftp://|file:|ssh:|ssh://|git://|tel:|magnet:|ipfs://|ipns://|gemini://|gopher://|news:)|www\.)([0-9a-zA-Z:/?#@!$&\*+,;=.~\_%^\-]+|\([]\["0-9a-zA-Z:/?#@!$&'\*+,;=.~\_%^\-]\*\)|\[[\(\)"0-9a-zA-Z:/?#@!$&'\*+,;=.~\_%^\-]\*\]|"[]\[\(\)0-9a-zA-Z:/?#@!$&'\*+,;=.~\_%^\-]\*"|'[]\[\(\)0-9a-zA-Z:/?#@!$&\*+,;=.~\_%^\-]\*')+([0-9a-zA-Z/#@$&\*+=~\_%^\-]|\([]\["0-9a-zA-Z:/?#@!$&'\*+,;=.~\_%^\-]\*\)|\[[\(\)"0-9a-zA-Z:/?#@!$&'\*+,;=.~\_%^\-]\*\]|"[]\[\(\)0-9a-zA-Z:/?#@!$&'\*+,;=.~\_%^\-]\*"|'[]\[\(\)0-9a-zA-Z:/?#@!$&\*+,;=.~\_%^\-]\*'))_ # SECTION: regex diff --git a/foot.ini b/foot.ini index b852da07..b170dc34 100644 --- a/foot.ini +++ b/foot.ini @@ -69,7 +69,7 @@ # launch=xdg-open ${url} # label-letters=sadfjklewcmpgh # osc8-underline=url-mode -# regex=(([a-z][[:alnum:]-]+:(/{1,3}|[a-z0-9%])|www[:digit:]{0,3}[.])([^[:space:](){}<>]+|\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\])+(\(([^[:space:](){}<>]+|(\([^[:space:](){}<>]+\)))*\)|\[([^]\[[:space:](){}<>]+|(\[[^]\[[:space:](){}<>]+\]))*\]|[^]\[[:space:]`!(){};:'".,<>?«»“”‘’])) +# 
regex=(((https?://|mailto:|ftp://|file:|ssh:|ssh://|git://|tel:|magnet:|ipfs://|ipns://|gemini://|gopher://|news:)|www\.)([0-9a-zA-Z:/?#@!$&*+,;=.~_%^\-]+|\([]\["0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\-]*\)|\[[\(\)"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\-]*\]|"[]\[\(\)0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\-]*"|'[]\[\(\)0-9a-zA-Z:/?#@!$&*+,;=.~_%^\-]*')+([0-9a-zA-Z/#@$&*+=~_%^\-]|\([]\["0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\-]*\)|\[[\(\)"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\-]*\]|"[]\[\(\)0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\-]*"|'[]\[\(\)0-9a-zA-Z:/?#@!$&*+,;=.~_%^\-]*')) # You can define your own regex's, by adding a section called # 'regex:' with a 'regex' and 'launch' key. These can then be tied