config: update default URL regex

The old one is in some cases too liberal. The new one is stricter in
two ways:

1. The protocol list is now explicit, rather than matching anything://
2. Allowed characters are now limited to the "safe character set", the
   "reserved character set", and some from the "unsafe character set"

Furthermore, some of the characters are restricted in how/when they
are allowed:

1. Periods, commas, question marks, etc. are allowed inside a URL, but
   not at the end.
2. [ ], ( ), " " and ' ' are allowed but only when balanced. This
   allows us to match e.g. [http://foo.bar/foo[bar]] correctly.

Closes #2016
This commit is contained in:
Daniel Eklöf 2025-04-02 08:41:46 +02:00
parent a50f78c599
commit 1760cb6ab8
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
4 changed files with 45 additions and 33 deletions

View file

@ -3446,39 +3446,46 @@ config_load(struct config *conf, const char *conf_path,
tokenize_cmdline("xdg-open ${url}", &conf->url.launch.argv.args);
{
/*
* Based on https://gist.github.com/gruber/249502, but modified:
* - Do not allow {} at all
* - Do allow matched []
*/
const char *url_regex_string =
const char *url_regex_string =
"("
"("
"("
"[a-z][[:alnum:]-]+:" // protocol
"("
"/{1,3}|[a-z0-9%]" // slashes (what's the OR part for?)
")"
"|"
"www[:digit:]{0,3}[.]"
//"|"
//"[a-z0-9.\\-]+[.][a-z]{2,4}/" /* "looks like domain name followed by a slash" - remove? */
")"
"("
"[^[:space:](){}<>]+"
"|"
"\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
"|"
"\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
")+"
"("
"\\(([^[:space:](){}<>]+|(\\([^[:space:](){}<>]+\\)))*\\)"
"|"
"\\[([^]\\[[:space:](){}<>]+|(\\[[^]\\[[:space:](){}<>]+\\]))*\\]"
"|"
"[^]\\[[:space:]`!(){};:'\".,<>?«»“”‘’]"
")"
"(https?://|mailto:|ftp://|file:|ssh:|ssh://|git://|tel:|magnet:|ipfs://|ipns://|gemini://|gopher://|news:)"
"|"
"www\\."
")"
;
"("
/* Safe + reserved + some unsafe characters. Brackets, parentheses, double quotes and single quotes are omitted (we only allow them when balanced) */
"[0-9a-zA-Z:/?#@!$&*+,;=.~_%^\\-]+"
"|"
/* Balanced "(...)". Content is same as above, plus all _other_ characters we require to be balanced */
"\\([]\\[\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\)"
"|"
/* Balanced "[...]". Content is same as above, plus all _other_ characters we require to be balanced */
"\\[[\\(\\)\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\]"
"|"
/* Balanced '"..."'. Content is same as above, plus all _other_ characters we require to be balanced */
"\"[]\\[\\(\\)0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\""
"|"
/* Balanced "'...'". Content is same as above, plus all _other_ characters we require to be balanced */
"'[]\\[\\(\\)0-9a-zA-Z:/?#@!$&*+,;=.~_%^\\-]*'"
")+"
"("
/* Same as above, except :?!,;. are excluded */
"[0-9a-zA-Z/#@$&*+=~_%^\\-]"
"|"
/* Balanced "(...)". Content is same as above, plus all _other_ characters we require to be balanced */
"\\([]\\[\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\)"
"|"
/* Balanced "[...]". Content is same as above, plus all _other_ characters we require to be balanced */
"\\[[\\(\\)\"0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\\]"
"|"
/* Balanced '"..."'. Content is same as above, plus all _other_ characters we require to be balanced */
"\"[]\\[\\(\\)0-9a-zA-Z:/?#@!$&'*+,;=.~_%^\\-]*\""
"|"
/* Balanced "'...'". Content is same as above, plus all _other_ characters we require to be balanced */
"'[]\\[\\(\\)0-9a-zA-Z:/?#@!$&*+,;=.~_%^\\-]*'"
")"
")";
int r = regcomp(&conf->url.preg, url_regex_string, REG_EXTENDED);
xassert(r == 0);