From b9517e588b365436ebab14458553b8c2f799636c Mon Sep 17 00:00:00 2001 From: Lukas Mai Date: Wed, 1 Feb 2023 21:14:14 +0100 Subject: [PATCH] Allow nested ()/[] brackets in URLs (fixes #1346) --- src/Config.h | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/src/Config.h b/src/Config.h index ba9564f0..6cf6687e 100644 --- a/src/Config.h +++ b/src/Config.h @@ -26,11 +26,51 @@ constexpr auto LABEL_MEDIUM_SIZE_RATIO = 1.3; namespace strings { const QString url_html = QStringLiteral("\\1"); const QRegularExpression url_regex( - // match an URL, that is not quoted, i.e. - // vvvvvv match quote via negative lookahead/lookbehind vv - // vvvv atomic match url -> fail if there is a " before or after vvv - QStringLiteral( - R"((?((www\.(?!\.)|[a-z][a-z0-9+.-]*://)[^\s<>'"]+[^!,\.\s<>'"\]\)\:]))(?!["']))")); + // match an unquoted URL + [](){ + const auto + general_unicode = QStringLiteral(R"((?:[^\x{0}-\x{7f}\p{Cc}\s\p{P}]|[\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}]))"), + protocol = QStringLiteral(R"((?:[Hh][Tt][Tt][Pp][Ss]?))"), + unreserved_subdelims_colon = QStringLiteral(R"([a-zA-Z0-9\-._~!$&'()*+,;=:])"), + pct_enc = QStringLiteral(R"((?:%[[:xdigit:]]{2}))"), + userinfo = "(?:" + unreserved_subdelims_colon + "*(?:" + pct_enc + unreserved_subdelims_colon + "*)*)", + dec_octet = QStringLiteral(R"((?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))"), + ipv4_addr = "(?:" + dec_octet + R"((?:\.)" + dec_octet + "){3})", + h16 = QStringLiteral(R"((?:[[:xdigit:]]{1,4}))"), + ls32 = "(?:" + h16 + ":" + h16 + "|" + ipv4_addr + ")", + ipv6_addr = "(?:" + "(?:" + h16 + ":){6}" + ls32 + + "|" "::(?:" + h16 + ":){5}" + ls32 + + "|" + h16 + "?::(?:" + h16 + ":){4}" + ls32 + + "|" "(?:" + h16 + "(?::" + h16 + "){0,1})?::(?:" + h16 + ":){3}" + ls32 + + "|" "(?:" + h16 + "(?::" + h16 + "){0,2})?::(?:" + h16 + ":){2}" + ls32 + + "|" "(?:" + h16 + "(?::" + h16 + "){0,3})?::" + h16 + ":" + ls32 + + "|" "(?:" + h16 + "(?::" + h16 + "){0,4})?::" + ls32 + + "|" "(?:" + h16 + "(?::" + h16 + "){0,5})?::" + h16 + + "|" "(?:" + h16 + "(?::" + h16 + "){0,6})?::" + ")", + ipvfuture = R"((?:v[[:xdigit:]]+\.)" + unreserved_subdelims_colon + "+)", + ip_literal = R"((?:\[(?:)" + ipv6_addr + "|" + ipvfuture + R"()\]))", + host_alnum = "(?:[a-zA-Z0-9]|" + general_unicode + ")", + host_label = "(?:" + host_alnum + "+(?:-+" + host_alnum + "+)*)", + hostname = "(?:" + host_label + R"((?:\.)" + host_label + R"()*\.?))", + host = "(?:" + hostname + "|" + ip_literal + ")", + path = R"((?:/((?:[a-zA-Z0-9\-._~!$&'*+,;=:@/]|)" + pct_enc + R"(|\((?-1)\)|)" + general_unicode + ")*))", + query = R"(((?:[a-zA-Z0-9\-._~!$&'*+,;=:@/?\\{}]|)" + pct_enc + R"(|\((?-1)\)|\[(?-1)\]|)" + general_unicode + ")*)", + fragment = query; + return + R"((?()" + + protocol + "://" + + "(?:" + userinfo + "@)?" + + host + "(?::[0-9]+)?" + + path + "?" + R"((?:\?)" + query + ")?" + R"((?:#)" + fragment + ")?" + "(?(.*?))"));