SoupContentSniffer: don't use gio anymore
author arno <arno@renevier.net>
Tue, 3 May 2011 05:46:07 +0000 (07:46 +0200)
committer Dan Winship <danw@gnome.org>
Mon, 8 Aug 2011 22:02:32 +0000 (18:02 -0400)
This brings the content sniffing algorithm closer to the HTML5
specification.

https://bugzilla.gnome.org/show_bug.cgi?id=648846

libsoup/soup-content-sniffer.c
tests/resources/html_binary.html [new file with mode: 0644]
tests/resources/ps_binary.ps [new file with mode: 0644]
tests/resources/text_binary.txt [new file with mode: 0644]
tests/sniffing-test.c

index 8d54771..4b96735 100644 (file)
@@ -10,7 +10,6 @@
 #endif
 
 #include <string.h>
-#include <gio/gio.h>
 
 #include "soup-content-sniffer.h"
 #include "soup-enum-types.h"
@@ -250,27 +249,6 @@ static char byte_looks_binary[] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xF0 - 0xFF */
 };
 
-static char *
-sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
-{
-       SoupURI *uri;
-       char *uri_path;
-       char *content_type;
-       char *mime_type;
-       gboolean uncertain;
-
-       uri = soup_message_get_uri (msg);
-       uri_path = soup_uri_to_string (uri, TRUE);
-
-       content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, &uncertain);
-       mime_type = g_content_type_get_mime_type (content_type);
-
-       g_free (uri_path);
-       g_free (content_type);
-
-       return mime_type;
-}
-
 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
 static char*
 sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
@@ -278,7 +256,6 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
 {
        const guchar *resource = (const guchar *)buffer->data;
        int resource_length = MIN (512, buffer->length);
-       char *gio_guess;
        int i;
 
        for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
@@ -338,29 +315,15 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
                }
        }
 
-       /* The spec allows us to use platform sniffing to find out
-        * about other types that are not covered, but we need to be
-        * careful to not escalate privileges, if on text or binary.
-        */
-       gio_guess = sniff_gio (sniffer, msg, buffer);
-
-       if (for_text_or_binary) {
-               for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
-                       SoupContentSnifferPattern *type_row = &(types_table[i]);
+       if (for_text_or_binary)
+               return g_strdup ("application/octet-stream");
 
-                       if (!g_ascii_strcasecmp (type_row->sniffed_type, gio_guess) &&
-                           type_row->scriptable) {
-                               g_free (gio_guess);
-                               gio_guess = NULL;
-                               break;
-                       }
-               }
+       for (i = 0; i < resource_length; i++) {
+               if (byte_looks_binary[resource[i]])
+                       return g_strdup ("application/octet-stream");
        }
 
-       if (gio_guess)
-               return gio_guess;
-
-       return g_strdup ("application/octet-stream");
+       return g_strdup ("text/plain");
 }
 
 /* HTML5: 2.7.3 Content-Type sniffing: text or binary */
diff --git a/tests/resources/html_binary.html b/tests/resources/html_binary.html
new file mode 100644 (file)
index 0000000..9200dd4
--- /dev/null
@@ -0,0 +1 @@
+<HTML \1c
diff --git a/tests/resources/ps_binary.ps b/tests/resources/ps_binary.ps
new file mode 100644 (file)
index 0000000..3d210ed
--- /dev/null
@@ -0,0 +1 @@
+%!PS-Adobe-" \16
diff --git a/tests/resources/text_binary.txt b/tests/resources/text_binary.txt
new file mode 100644 (file)
index 0000000..113bfdd
--- /dev/null
@@ -0,0 +1 @@
+abc\1c
\ No newline at end of file
index 60ca389..828f1d5 100644 (file)
@@ -445,24 +445,6 @@ test_disabled (const char *path)
        g_main_loop_unref (loop);
 }
 
-/* Fix up XDG_DATA_DIRS for jhbuild runs so that it still works even
- * if you didn't install shared-mime-info.
- */
-static void
-fixup_xdg_dirs (void)
-{
-       const char *xdg_data_dirs = g_getenv ("XDG_DATA_DIRS");
-       char *new_data_dirs;
-
-       if (xdg_data_dirs &&
-           !g_str_has_prefix (xdg_data_dirs, "/usr/share") &&
-           !strstr (xdg_data_dirs, ":/usr/share")) {
-               new_data_dirs = g_strdup_printf ("%s:/usr/share", xdg_data_dirs);
-               g_setenv ("XDG_DATA_DIRS", new_data_dirs, TRUE);
-               g_free (new_data_dirs);
-       }
-}
-
 int
 main (int argc, char **argv)
 {
@@ -470,8 +452,6 @@ main (int argc, char **argv)
 
        test_init (argc, argv, NULL);
 
-       fixup_xdg_dirs ();
-
        server = soup_test_server_new (TRUE);
        soup_server_add_handler (server, NULL, server_callback, NULL, NULL);
        base_uri = soup_uri_new ("http://127.0.0.1/");
@@ -533,11 +513,24 @@ main (int argc, char **argv)
         */
        test_sniffing ("/text_or_binary/test.html", "text/plain");
 
+       /* text/plain with binary content and unknown pattern should be
+        * application/octet-stream */
+       test_sniffing ("/text_or_binary/text_binary.txt", "application/octet-stream");
+
+       /* text/plain with binary content and scriptable pattern should be
+        * application/octet-stream to avoid 'privilege escalation' */
+       test_sniffing ("/text_or_binary/html_binary.html", "application/octet-stream");
+
+       /* text/plain with binary content and non scriptable known pattern should
+        * be the given type */
+       test_sniffing ("/text_or_binary/ps_binary.ps", "application/postscript");
+
        /* Test the unknown sniffing path */
 
        test_sniffing ("/unknown/test.html", "text/html");
        test_sniffing ("/unknown/home.gif", "image/gif");
-       test_sniffing ("/unknown/mbox", "application/mbox");
+       test_sniffing ("/unknown/mbox", "text/plain");
+       test_sniffing ("/unknown/text_binary.txt", "application/octet-stream");
 
        /* Test the XML sniffing path */