[Xfce4-commits] <midori:master> Refactor adblock to simplify parsing and to honor matching options
Christian Dywan
noreply at xfce.org
Fri Nov 20 01:42:01 CET 2009
Updating branch refs/heads/master
to 2d514bfb6cb3fba7850cb22f6c0278c147505b48 (commit)
from 5580b62e00d1969342cbdecf53472a8a76e2ea14 (commit)
commit 2d514bfb6cb3fba7850cb22f6c0278c147505b48
Author: Alexander Butenko <a.butenka at gmail.com>
Date: Fri Nov 20 01:00:48 2009 +0100
Refactor adblock to simplify parsing and to honor matching options
There is a known flaw in that URL rules may erroneously match domains.
The code path for WebKitGTK+ < 1.1.14 is now crashing.
extensions/adblock.c | 270 ++++++++++++++++++++++++++++++++++++++------------
1 files changed, 205 insertions(+), 65 deletions(-)
diff --git a/extensions/adblock.c b/extensions/adblock.c
index 58eac99..83f5b8d 100644
--- a/extensions/adblock.c
+++ b/extensions/adblock.c
@@ -26,6 +26,13 @@
(__filter && (g_str_has_prefix (__filter, "http") \
|| g_str_has_prefix (__filter, "file")))
+typedef struct
+{
+ const gchar* page_uri;
+ const gchar* uri;
+ const gchar* query;
+} Matcher;
+
static GHashTable* pattern;
static gchar* blockcss = NULL;
static gchar* blockcssprivate = NULL;
@@ -41,21 +48,20 @@ adblock_build_js (const gchar* style,
return g_strdup_printf (
"window.addEventListener ('DOMContentLoaded',"
"function () {"
- "var URL = location.href;"
- "var sites = new Array(); %s;"
- "var public = '%s';"
- "for (var i in sites) {"
- "if (URL.indexOf(i) != -1) {"
- "public += sites[i];"
- "break;"
- "}}"
- "public += ' {display: none !important;}';"
- "var mystyle = document.createElement(\"style\");"
- "mystyle.setAttribute(\"type\", \"text/css\");"
- "mystyle.appendChild(document.createTextNode(public));"
- "var head = document.getElementsByTagName(\"head\")[0];"
- "if (head) head.appendChild(mystyle);"
- "else document.documentElement.insertBefore(mystyle, document.documentElement.firstChild);"
+ " var URL = location.href;"
+ " var sites = new Array(); %s;"
+ " var public = '%s';"
+ " for (var i in sites) {"
+ " if (URL.indexOf(i) != -1) {"
+ " public += sites[i];"
+ " break;"
+ " }}"
+ " public += ' {display: none !important;}';"
+ " var mystyle = document.createElement('style');"
+ " mystyle.setAttribute('type', 'text/css');"
+ " mystyle.appendChild(document.createTextNode(public));"
+ " var head = document.getElementsByTagName('head')[0];"
+ " if (head) head.appendChild(mystyle);"
"}, true);",
private,
style);
@@ -73,10 +79,6 @@ adblock_fixup_regexp (gchar* src)
/* FIXME: Avoid always allocating twice the string */
s = dst = g_malloc (strlen (src) * 2);
- /* |http:// means ^http:// */
- if (src[0] == '|')
- src[0] = '^';
-
while (*src)
{
switch (*src)
@@ -93,6 +95,15 @@ adblock_fixup_regexp (gchar* src)
case '|':
*s++ = '\\';
break;
+ case '/':
+ *s++ = '\\';
+ break;
+ /* FIXME: We actually need to match :[0-9]+ or '/'. Sign means
+ "here could be port number or nothing". So bla.com^ will match
+ bla.com/ or bla.com:8080/ but not bla.com.au/ */
+ case '^':
+ *src = '?';
+ break;
}
*s++ = *src;
src++;
@@ -518,11 +529,37 @@ adblock_browser_populate_tool_menu_cb (MidoriBrowser* browser,
}
static gboolean
-adblock_is_matched (const gchar* patt,
+adblock_is_matched (const gchar* opts,
const GRegex* regex,
- const gchar* uri)
+ Matcher* data)
{
- return g_regex_match_full (regex, uri, -1, 0, 0, NULL, NULL);
+ gchar* patt;
+
+ if (g_regex_match_simple ("type=fulluri,", opts, G_REGEX_UNGREEDY, G_REGEX_MATCH_NOTEMPTY))
+ patt = g_strdup (data->uri);
+ else
+ patt = g_strdup (data->query);
+
+ if (g_regex_match_full (regex, patt, -1, 0, 0, NULL, NULL))
+ {
+ if (g_regex_match_simple (",third-party", opts,
+ G_REGEX_CASELESS, G_REGEX_MATCH_NOTEMPTY))
+ {
+ if (data->page_uri && g_regex_match_full (regex, data->page_uri, -1, 0, 0, NULL, NULL))
+ {
+ g_free (patt);
+ return FALSE;
+ }
+ }
+ /* TODO: Domain opt check */
+ g_free (patt);
+ return TRUE;
+ }
+ else
+ {
+ g_free (patt);
+ return FALSE;
+ }
}
#if HAVE_WEBKIT_RESOURCE_REQUEST
@@ -534,10 +571,38 @@ adblock_resource_request_starting_cb (WebKitWebView* web_view,
WebKitNetworkResponse* response,
GtkWidget* image)
{
- const gchar* uri = webkit_network_request_get_uri (request);
- if (!strncmp (uri, "data", 4))
+ Matcher data;
+ const char *page_uri;
+ const gchar* uri;
+ SoupMessage* msg;
+ SoupURI* soup_uri;
+
+ uri = webkit_network_request_get_uri (request);
+ if (!strncmp (uri, "data", 4) || !strncmp (uri, "file", 4))
return;
- if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, (char*)uri))
+
+ msg = webkit_network_request_get_message (request);
+ if (!msg)
+ return;
+
+ if (msg->method && !strncmp (msg->method, "POST", 4))
+ return;
+
+ soup_uri = soup_uri_new (uri);
+ if (soup_uri->query)
+ data.query = g_strdup_printf ("%s?%s", soup_uri->path, soup_uri->query);
+ else
+ data.query = g_strdup (soup_uri->path);
+ soup_uri_free (soup_uri);
+
+ data.uri = uri;
+ page_uri = webkit_web_view_get_uri (web_view);
+
+ if (!page_uri || !strcmp (page_uri, "about:blank"))
+ page_uri = uri;
+ data.page_uri = page_uri;
+
+ if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, &data))
{
#if 0
gchar* text;
@@ -558,9 +623,32 @@ static void
adblock_session_request_queued_cb (SoupSession* session,
SoupMessage* msg)
{
- SoupURI* soup_uri = soup_message_get_uri (msg);
- gchar* uri = soup_uri ? soup_uri_to_string (soup_uri, FALSE) : g_strdup ("");
- if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, uri))
+ Matcher data;
+ SoupURI* soup_uri;
+ gchar* uri;
+ gchar* page_uri;
+
+ if (msg->method && !strncmp (msg->method, "POST", 4))
+ return;
+
+ /* FIXME: There is a crasher somewhere introduced with the refactoring */
+
+ soup_uri = soup_message_get_uri (msg);
+ uri = soup_uri_to_string (soup_uri, FALSE);
+ if (soup_uri->query)
+ data.query = g_strdup_printf ("%s?%s", soup_uri->path, soup_uri->query);
+ else
+ data.query = g_strdup (soup_uri->path);
+ soup_uri_free (soup_uri);
+
+ data.uri = uri;
+ page_uri = NULL; /* FIXME */
+
+ if (!page_uri || !strcmp (page_uri, "about:blank"))
+ page_uri = uri;
+ data.page_uri = page_uri;
+
+ if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, &data))
{
/* FIXME: Update image tooltip */
@@ -636,6 +724,27 @@ adblock_app_add_browser_cb (MidoriApp* app,
}
static void
+adblock_compile_regexp (GHashTable* tbl,
+ gchar* patt,
+ gchar* opts)
+{
+ GRegex* regex;
+ GError* error = NULL;
+
+ /* TODO: Play with optimization flags */
+ regex = g_regex_new (patt, G_REGEX_OPTIMIZE,
+ G_REGEX_MATCH_NOTEMPTY, &error);
+
+ if (!error)
+ g_hash_table_insert (tbl, opts, regex);
+ else
+ {
+ g_warning ("%s: %s", G_STRFUNC, error->message);
+ g_error_free (error);
+ }
+}
+
+static void
adblock_frame_add (gchar* line)
{
gchar* new_blockcss;
@@ -657,8 +766,9 @@ adblock_frame_add_private (gchar* line)
{
gchar** domains;
gint max, i;
+
domains = g_strsplit (data[0], ",", -1);
- for (max = i = 0; domains [i]; i++)
+ for (max = i = 0; domains[i]; i++)
{
new_blockcss = g_strdup_printf ("%s;\nsites['%s']+=',%s'",
blockcssprivate, g_strstrip (domains[i]), data[1]);
@@ -675,6 +785,56 @@ adblock_frame_add_private (gchar* line)
g_strfreev (data);
}
+static void
+adblock_add_url_pattern (gchar* line)
+{
+ gchar* opts;
+ gchar** data;
+ gchar* patt;
+ gchar* parsed;
+
+ if (line[0] == '|' && line[1] == '|' )
+ {
+ (void)*line++;
+ (void)*line++;
+
+ data = g_strsplit (line, "$", 2);
+ parsed = adblock_fixup_regexp (data[0]);
+ patt = g_strdup_printf ("^https?://([a-z0-9\\.]+)?%s", parsed);
+ if (data[1])
+ opts = g_strdup_printf ("type=fulluri,regexp=%s,%s", patt, data[1]);
+ else
+ opts = g_strdup_printf ("type=fulluri,regexp=%s", patt);
+
+ g_strfreev (data);
+ g_free (parsed);
+ }
+ else if (line[0] == '|')
+ {
+ (void)*line++;
+
+ data = g_strsplit (line, "$", 2);
+ parsed = adblock_fixup_regexp (data[0]);
+ patt = g_strdup_printf ("^%s", parsed);
+ if (data[1])
+ opts = g_strdup_printf ("type=fulluri,regexp=%s,%s", patt, data[1]);
+ else
+ opts = g_strdup_printf ("type=fulluri,regexp=%s", patt);
+
+ g_strfreev (data);
+ g_free (parsed);
+ }
+ else
+ {
+ patt = adblock_fixup_regexp (line);
+ opts = g_strdup_printf ("regexp=%s", patt);
+ }
+
+ /* g_debug ("got: %s opts %s", patt, opts); */
+ adblock_compile_regexp (pattern, patt, opts);
+ g_free (patt);
+}
+
static gchar*
adblock_parse_line (gchar* line)
{
@@ -687,63 +847,42 @@ adblock_parse_line (gchar* line)
/* FIXME: No support for whitelisting */
if (line[0] == '@' && line[1] == '@')
return NULL;
- /* FIXME: What is this? */
- if (line[0] == '|' && line[1] == '|')
- return NULL;
- /* ditto */
- if (strstr (line,"$"))
+ /* FIXME: No support for [include] and [exclude] tags */
+ if (line[0] == '[')
return NULL;
- /* Got block hider */
- if (line[0] == '#' && line[1] == '#' && (line[2] == '.' || line[2] == '#'
- || line[2] == 'A' || line[2] == 'a' || line[2] == 'D' || line[2] == 'U'))
+
+ /* Got CSS block hider */
+ if (line[0] == '#' && line[1] == '#' )
{
adblock_frame_add (line);
return NULL;
}
- /* FIXME: Do we have smth else starting with ##? */
- if (line[0] == '#' && line[1] == '#')
+ /* Some crazy lists do this */
+ if (line[0] == '#')
return NULL;
+ /* Got per domain CSS hider rule */
if (strstr (line,"##"))
{
adblock_frame_add_private (line);
return NULL;
}
- /* FIXME: No support for [include] and [exclude] tags */
- if (line[0] == '[')
- return NULL;
- return adblock_fixup_regexp (line);
+
+ /* Got URL blocker rule */
+ adblock_add_url_pattern (line);
+ return line;
}
static void
adblock_parse_file (gchar* path)
{
FILE* file;
+ gchar line[500];
+
if ((file = g_fopen (path, "r")))
{
- gchar line[500];
- GRegex* regex;
-
while (fgets (line, 500, file))
- {
- GError* error = NULL;
- gchar* parsed;
-
- parsed = adblock_parse_line (line);
- if (!parsed)
- continue;
-
- regex = g_regex_new (parsed, G_REGEX_OPTIMIZE,
- G_REGEX_MATCH_NOTEMPTY, &error);
- if (error)
- {
- g_warning ("%s: %s", G_STRFUNC, error->message);
- g_error_free (error);
- g_free (parsed);
- }
- else
- g_hash_table_insert (pattern, parsed, regex);
- }
+ adblock_parse_line (line);
fclose (file);
}
}
@@ -846,6 +985,7 @@ test_adblock_pattern (void)
temp = g_file_open_tmp ("midori_adblock_match_test_XXXXXX", &filename, NULL);
+ /* TODO: Update some tests and add new ones. */
g_file_set_contents (filename,
"*ads.foo.bar*\n"
"*ads.bogus.name*\n"
More information about the Xfce4-commits
mailing list