[Xfce4-commits] [apps/xfce4-terminal] 01/01: Prepare for upcoming vte regex API change: update regexes

noreply at xfce.org noreply at xfce.org
Wed Aug 24 16:19:22 CEST 2016


This is an automated email from the git hooks/post-receive script.

f2404 pushed a commit to branch master
in repository apps/xfce4-terminal.

commit cd52af40ad68609bd5b23d6e0e916e27a809d163
Author: Igor <f2404 at yandex.ru>
Date:   Wed Aug 24 17:17:02 2016 +0300

    Prepare for upcoming vte regex API change: update regexes
    
    See https://bugzilla.gnome.org/show_bug.cgi?id=770147
    terminal-regex.h taken from gnome-terminal: old regexes weren't working fine
    with the new API based on PCRE2.
---
 terminal/terminal-regex.h  | 145 +++++++++++++++++++++++++++++++++++++++++++++
 terminal/terminal-widget.c |  24 ++------
 2 files changed, 151 insertions(+), 18 deletions(-)

diff --git a/terminal/terminal-regex.h b/terminal/terminal-regex.h
new file mode 100644
index 0000000..f53bdce
--- /dev/null
+++ b/terminal/terminal-regex.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Mini style-guide:
+ *
+ * #define'd fragments should preferably have an outermost group, for the
+ * exact same reason as why usually in C/C++ #define's the values are enclosed
+ * in parentheses: that is, so that you don't get surprised when you use the
+ * macro and append a quantifier.
+ *
+ * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
+ * as (?&NAME), so that the regex string and the compiled regex object is
+ * smaller.
+ *
+ * Build small blocks, comment and unittest them heavily.
+ *
+ * Use free-spacing mode for improved readability. The hardest to read is
+ * which additional characters belong to a "(?" prefix. To improve
+ * readability, place a space after this, and for symmetry, before the closing
+ * parenthesis. Also place a space around "|" characters. No space before
+ * quantifiers. Try to be consistent with the existing style (yes I know the
+ * existing style is not consistent either, but please do your best).
+ *
+ * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
+ * syntaxes.
+ */
+
+#ifndef TERMINAL_REGEX_H
+#define TERMINAL_REGEX_H
+
+#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
+
+#define USERCHARS "-+.[:alnum:]"
+/* Nonempty username, e.g. "john.smith" */
+#define USER "[" USERCHARS "]+"
+
+#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
+/* Optional colon-prefixed password; an empty password is allowed. E.g. ":secret", ":", "" */
+#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
+
+/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
+#define USERPASS "(?:" USER PASS "@)?"
+
+/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the string "256".
+   The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
+#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9] ) )))"
+
+/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match "192.168.1.123" in the string "192.168.1.1234". */
+#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
+
+/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
+ * Lookahead for the next char not being a dot, colon or hex digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4".
+ * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting.
+ * TODO: more strict check (right number of colons, etc.)
+ * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
+
+/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
+#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
+
+/* No :: shorthand */
+#define IPV6_FULL  "(?x: (?&S6C){7} (?&S6) )"
+/* Begins with :: */
+#define IPV6_LEFT  "(?x: : (?&CS6){1,7} )"
+/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
+#define IPV6_MID   "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
+/* Ends with :: */
+#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
+/* Is "::" and nothing more */
+#define IPV6_NULL  "(?x: :: )"
+
+/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
+#define IPV6V4_FULL  "(?x: (?&S6C){6} )"
+#define IPV6V4_LEFT  "(?x: :: (?&S6C){0,5} )"  /* includes "::<ipv4>" */
+#define IPV6V4_MID   "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :"
+#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
+
+/* IPV6: An IPv6 address (possibly with an embedded IPv4).
+ * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
+#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) (?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
+
+/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
+ * then any graphical Unicode character.
+ * A segment can consist entirely of numbers.
+ * (Note: PCRE doesn't support character class subtraction/intersection.) */
+#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
+
+/* A hostname of at least 1 component. The last component cannot be entirely numbers.
+ * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
+#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9] ) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
+
+/* A hostname of at least 2 components. The last component cannot be entirely numbers.
+ * E.g. "example.com", "1234.com", but not "1234.56" */
+#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
+
+/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
+#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
+
+/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", "[1.2.3.4]", "[::1]".
+ * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid tons of false positives (e.g. in a typical shell prompt). */
+#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
+
+/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
+   and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
+#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
+
+/* Optional colon-prefixed port, e.g. ":1080", "" */
+#define PORT "(?x: \\:" N_1_65535 " )?"
+
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%\\E]"
+/* Chars not to end a URL */
+#define PATHNONTERM_CLASS "[\\Q.!,?\\E]"
+
+/* Lookbehind at the end, so that the last character (if we matched a character at all) is not from PATHNONTERM_CLASS */
+#define URLPATH "(?x: /" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+#define VOIP_PATH "(?x: [;?]" PATHCHARS_CLASS "* (?<! " PATHNONTERM_CLASS " ) )?"
+
+/* Now let's put these fragments together */
+
+#define DEFS IP_DEF
+
+#define REGEX_URL_AS_IS  DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
+/* TODO: also support file:/etc/passwd */
+#define REGEX_URL_FILE   DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?x: " PATHCHARS_CLASS "+ (?<! " PATHNONTERM_CLASS " ) )?" 
+/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */
+#define REGEX_URL_HTTP   DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT URLPATH
+#define REGEX_URL_VOIP   DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
+#define REGEX_EMAIL      DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
+#define REGEX_NEWS_MAN   "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
+
+#endif /* !TERMINAL_REGEX_H */
diff --git a/terminal/terminal-widget.c b/terminal/terminal-widget.c
index 068fe2f..afab87c 100644
--- a/terminal/terminal-widget.c
+++ b/terminal/terminal-widget.c
@@ -36,23 +36,11 @@
 #include <terminal/terminal-preferences.h>
 #include <terminal/terminal-widget.h>
 #include <terminal/terminal-private.h>
+#include <terminal/terminal-regex.h>
 
 
 
 #define MAILTO          "mailto:"
-#define USERCHARS       "-[:alnum:]\\Q_.+\\E"
-#define USERCHARS_CLASS "[" USERCHARS "]"
-#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
-#define HOSTCHARS_CLASS "[-[:alnum:]]"
-#define HOSTNAME        HOSTCHARS_CLASS "+(?:\\." HOSTCHARS_CLASS "+)*"
-#define IPV6ADDRESS     "\\[(?:[[:xdigit:]]{0,4}:){2,7}[[:xdigit:]]{0,4}\\]"
-#define HOST            "(?:" HOSTNAME "|" IPV6ADDRESS ")"
-#define PORT            "(?:\\:[[:digit:]]{1,5})?"
-#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,;@&=?/:~#'%\\E]"
-#define PATHTERM_CLASS  "[[:alnum:]\\Q_$+*@&=/~#%\\E]"
-#define SCHEME          "(?:news:|telnet:|nntp:|file:\\/|https?:|ftps?:|sftp:|webcal:|magnet:)"
-#define USERPASS        USERCHARS_CLASS "+(?:" PASSCHARS_CLASS "+)?"
-#define URLPATH         "(?:(?:\\(" PATHCHARS_CLASS "*\\)|" PATHCHARS_CLASS ")*(?:\\(" PATHCHARS_CLASS "*\\)|" PATHTERM_CLASS "))?"
 
 
 
@@ -78,11 +66,11 @@ typedef struct
 
 static const TerminalRegexPattern regex_patterns[] =
 {
-  { SCHEME "//(?:" USERPASS "\\@)?" HOST PORT URLPATH, PATTERN_TYPE_FULL_HTTP },
-  { "(?:www[[:digit:]]{0,3}|ftp)" HOSTCHARS_CLASS "*\\." HOST PORT URLPATH, PATTERN_TYPE_HTTP },
-  { "(?:" MAILTO ")?" USERCHARS_CLASS "[" USERCHARS ".]*\\@" HOSTCHARS_CLASS "+\\." HOST, PATTERN_TYPE_EMAIL },
-  { "news:[[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+", PATTERN_TYPE_FULL_HTTP },
-  { "magnet:[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+", PATTERN_TYPE_FULL_HTTP }
+  { REGEX_URL_AS_IS, PATTERN_TYPE_FULL_HTTP },
+  { REGEX_URL_HTTP,  PATTERN_TYPE_HTTP },
+  { REGEX_URL_FILE,  PATTERN_TYPE_FULL_HTTP },
+  { REGEX_EMAIL,     PATTERN_TYPE_EMAIL },
+  { REGEX_NEWS_MAN,  PATTERN_TYPE_FULL_HTTP },
 };
 
 

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.


More information about the Xfce4-commits mailing list