From 3c9f3cdffc32126700f25d8a0c55f68b6f587bde Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Wed, 17 Jun 2009 20:53:17 -0300 Subject: [PATCH] Implement content sniffing The implementation is based on the draft spec on Content-Type Processing Model (draft-abarth-mime-sniff-01). It is a spinoff from the HTML5 spec. Soup now provides a SoupContentSniffer session feature, which hooks into the message I/O, and delays emissions of the got-chunk signal to be able to figure out the Content-Type of messages from the actual content received, in some cases. GIO is also used to sniff content, whenever the spec allows further sniffing. http://bugzilla.gnome.org/show_bug.cgi?id=572589 --- .gitignore | 1 + libsoup/Makefile.am | 2 + libsoup/soup-content-sniffer.c | 570 +++++++++++++++++++++++++++++++++++++++++ libsoup/soup-content-sniffer.h | 57 +++++ libsoup/soup-marshal.list | 1 + libsoup/soup-message-headers.c | 19 +- libsoup/soup-message-io.c | 128 ++++++++- libsoup/soup-message-private.h | 5 + libsoup/soup-message.c | 57 +++++ libsoup/soup-message.h | 1 + libsoup/soup.h | 1 + tests/Makefile.am | 3 + tests/resources/atom.xml | 35 +++ tests/resources/home.gif | Bin 0 -> 995 bytes tests/resources/mbox | 16 ++ tests/resources/rss20.xml | 26 ++ tests/resources/test.html | 10 + tests/sniffing-test.c | 429 +++++++++++++++++++++++++++++++ 18 files changed, 1356 insertions(+), 5 deletions(-) create mode 100644 libsoup/soup-content-sniffer.c create mode 100644 libsoup/soup-content-sniffer.h create mode 100644 tests/resources/atom.xml create mode 100644 tests/resources/home.gif create mode 100644 tests/resources/mbox create mode 100644 tests/resources/rss20.xml create mode 100644 tests/resources/test.html create mode 100644 tests/sniffing-test.c diff --git a/.gitignore b/.gitignore index b0cd3a4..1bb227d 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,7 @@ tests/redirect-test tests/server-auth-test tests/simple-httpd tests/simple-proxy +tests/sniffing-test 
tests/ssl-test tests/streaming-test tests/timeout-test diff --git a/libsoup/Makefile.am b/libsoup/Makefile.am index 949f243..2d3a6ea 100644 --- a/libsoup/Makefile.am +++ b/libsoup/Makefile.am @@ -55,6 +55,7 @@ soup_headers = \ soup-auth-domain.h \ soup-auth-domain-basic.h \ soup-auth-domain-digest.h \ + soup-content-sniffer.h \ soup-cookie.h \ soup-cookie-jar.h \ soup-cookie-jar-text.h \ @@ -119,6 +120,7 @@ libsoup_2_4_la_SOURCES = \ soup-auth-manager-ntlm.c \ soup-connection.h \ soup-connection.c \ + soup-content-sniffer.c \ soup-cookie.c \ soup-cookie-jar.c \ soup-cookie-jar-text.c \ diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c new file mode 100644 index 0000000..5fdee5c --- /dev/null +++ b/libsoup/soup-content-sniffer.c @@ -0,0 +1,570 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * soup-content-sniffer.c + * + * Copyright (C) 2009 Gustavo Noronha Silva. + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include + +#include "soup-content-sniffer.h" +#include "soup-enum-types.h" +#include "soup-message.h" +#include "soup-message-private.h" +#include "soup-session-feature.h" +#include "soup-uri.h" + +/** + * SECTION:soup-content-sniffer + * @short_description: Content sniffing for #SoupSession + * + * A #SoupContentSniffer tries to detect the actual content type of + * the files that are being downloaded by looking at some of the data + * before the #SoupMessage emits its #SoupMessage::got-headers signal. + * #SoupContentSniffer implements #SoupSessionFeature, so you can add + * content sniffing to a session with soup_session_add_feature() or + * soup_session_add_feature_by_type(). 
+ * + * Since: 2.27.3 + **/ + +static char *sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params); +static gsize get_buffer_size (SoupContentSniffer *sniffer); + +static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data); + +static void request_queued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg); +static void request_unqueued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg); + +G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT, + G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE, + soup_content_sniffer_session_feature_init)) + +static void +soup_content_sniffer_init (SoupContentSniffer *content_sniffer) +{ +} + +static void +soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class) +{ + content_sniffer_class->sniff = sniff; + content_sniffer_class->get_buffer_size = get_buffer_size; +} + +static void +soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, + gpointer interface_data) +{ + feature_interface->request_queued = request_queued; + feature_interface->request_unqueued = request_unqueued; +} + +/** + * soup_content_sniffer_new: + * + * Creates a new #SoupContentSniffer. 
+ * + * Returns: a new #SoupContentSniffer + * + * Since: 2.27.3 + **/ +SoupContentSniffer * +soup_content_sniffer_new () +{ + return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL); +} + +char * +soup_content_sniffer_sniff (SoupContentSniffer *sniffer, + SoupMessage *msg, SoupBuffer *buffer, + GHashTable **params) +{ + g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL); + g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL); + g_return_val_if_fail (buffer != NULL, NULL); + + return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params); +} + +/* This table is based on the HTML5 spec; + * See 2.7.4 Content-Type sniffing: unknown type + */ +typedef struct { + /* @has_ws is TRUE if @pattern contains "generic" whitespace */ + gboolean has_ws; + const char *mask; + const char *pattern; + guint pattern_length; + const char *sniffed_type; + gboolean scriptable; +} SoupContentSnifferPattern; + +static SoupContentSnifferPattern types_table[] = { + { FALSE, + "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", + "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C", + 14, + "text/html", + TRUE }, + + { TRUE, + "\xFF\xFF\xDF\xDF\xDF\xDF", + " \x3C\x48\x54\x4D\x4C", + 5, + "text/html", + TRUE }, + + { TRUE, + "\xFF\xFF\xDF\xDF\xDF\xDF", + " \x3C\x48\x45\x41\x44", + 5, + "text/html", + TRUE }, + + { TRUE, + "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF", + " \x3C\x53\x43\x52\x49\x50\x54", + 7, + "text/html", + TRUE }, + + { FALSE, + "\xFF\xFF\xFF\xFF\xFF", + "\x25\x50\x44\x46\x2D", + 5, + "application/pdf", + TRUE }, + + { FALSE, + "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D", + 11, + "application/postscript", + FALSE }, + + { FALSE, + "\xFF\xFF\x00\x00", + "\xFE\xFF\x00\x00", + 4, + "text/plain", + FALSE }, + + { FALSE, + "\xFF\xFF\x00\x00", + "\xFF\xFF\x00\x00", + 4, + "text/plain", + FALSE }, + + { FALSE, + "\xFF\xFF\xFF\x00", + "\xEF\xBB\xBF\x00", + 4, + "text/plain", + FALSE }, + + { 
FALSE, + "\xFF\xFF\xFF\xFF\xFF\xFF", + "\x47\x49\x46\x38\x37\x61", + 6, + "image/gif", + FALSE }, + + { FALSE, + "\xFF\xFF\xFF\xFF\xFF\xFF", + "\x47\x49\x46\x38\x39\x61", + 6, + "image/gif", + FALSE }, + + { FALSE, + "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", + 8, + "image/png", + FALSE }, + + { FALSE, + "\xFF\xFF\xFF", + "\xFF\xD8\xFF", + 3, + "image/jpeg", + FALSE }, + + { FALSE, + "\xFF\xFF", + "\x42\x4D", + 2, + "image/bmp", + FALSE }, + + { FALSE, + "\xFF\xFF\xFF\xFF", + "\x00\x00\x01\x00", + 4, + "image/vnd.microsoft.icon", + FALSE } +}; + +/* Whether a given byte looks like it might be part of binary content. + * Source: HTML5 spec; borrowed from the Chromium mime sniffer code, + * which is BSD-licensed + */ +static char byte_looks_binary[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, /* 0x00 - 0x0F */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0 - 0xFF */ +}; + +static char * +sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer) +{ + SoupURI *uri; 
+ char *uri_path; + char *content_type; + char *mime_type; + gboolean uncertain; + + uri = soup_message_get_uri (msg); + uri_path = soup_uri_to_string (uri, TRUE); + + content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, &uncertain); + mime_type = g_content_type_get_mime_type (content_type); + + g_free (uri_path); + g_free (content_type); + + return mime_type; +} + +/* HTML5: 2.7.4 Content-Type sniffing: unknown type */ +static char* +sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg, + SoupBuffer *buffer, gboolean for_text_or_binary) +{ + const char *resource = buffer->data; + int resource_length = MIN (512, buffer->length); + char *gio_guess; + int i; + + for (i = 0; i < G_N_ELEMENTS (types_table); i++) { + SoupContentSnifferPattern *type_row = &(types_table[i]); + + /* The scriptable types should be skiped for the text + * or binary path, but considered for other paths */ + if (for_text_or_binary && type_row->scriptable) + continue; + + if (type_row->has_ws) { + int index_stream = 0; + int index_pattern = 0; + gboolean skip_row = FALSE; + + while (index_stream < resource_length) { + /* Skip insignificant white space ("WS" in the spec) */ + if (type_row->pattern[index_pattern] == ' ') { + if (resource[index_stream] == '\x09' || + resource[index_stream] == '\x0a' || + resource[index_stream] == '\x0c' || + resource[index_stream] == '\x0d' || + resource[index_stream] == '\x20') + index_stream++; + else + index_pattern++; + } else { + if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) { + skip_row = TRUE; + break; + } + index_pattern++; + index_stream++; + } + } + + if (skip_row) + continue; + + if (index_pattern > type_row->pattern_length) + return g_strdup (type_row->sniffed_type); + } else { + int j; + + if (resource_length < type_row->pattern_length) + continue; + + for (j = 0; j < type_row->pattern_length; j++) { + if ((type_row->mask[j] & resource[j]) != 
type_row->pattern[j]) + break; + } + + /* This means our comparison above matched completely */ + if (j == type_row->pattern_length) + return g_strdup (type_row->sniffed_type); + } + } + + /* The spec allows us to use platform sniffing to find out + * about other types that are not covered, but we need to be + * careful to not escalate privileges, if on text or binary. + */ + gio_guess = sniff_gio (sniffer, msg, buffer); + + if (for_text_or_binary) { + for (i = 0; i < G_N_ELEMENTS (types_table); i++) { + SoupContentSnifferPattern *type_row = &(types_table[i]); + + if (!g_ascii_strcasecmp (type_row->sniffed_type, gio_guess) && + type_row->scriptable) { + g_free (gio_guess); + gio_guess = NULL; + break; + } + } + } + + if (gio_guess) + return gio_guess; + + return g_strdup ("application/octet-stream"); +} + +/* HTML5: 2.7.3 Content-Type sniffing: text or binary */ +static char* +sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg, + SoupBuffer *buffer) +{ + const guchar *resource = (const guchar *)buffer->data; /* unsigned bytes, so the 0xFE/0xFF/0xEF BOM comparisons below can match even where plain char is signed */ + int resource_length = MIN (512, buffer->length); + gboolean looks_binary = FALSE; + int i; + + /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */ + if (resource_length >= 4) { + if ((resource[0] == 0xFE && resource[1] == 0xFF) || + (resource[0] == 0xFF && resource[1] == 0xFE) || + (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)) + return g_strdup ("text/plain"); + } + + /* Look to see if any of the first n bytes looks binary */ + for (i = 0; i < resource_length; i++) { + if (byte_looks_binary[(unsigned char)resource[i]]) { + looks_binary = TRUE; + break; + } + } + + if (!looks_binary) + return g_strdup ("text/plain"); + + return sniff_unknown (sniffer, msg, buffer, TRUE); +} + +static char* +sniff_images (SoupContentSniffer *sniffer, SoupMessage *msg, + SoupBuffer *buffer, const char *content_type) +{ + const char *resource = buffer->data; + int resource_length = MIN (512, buffer->length); + int i; + + for (i = 0; i < 
G_N_ELEMENTS (types_table); i++) { + SoupContentSnifferPattern *type_row = &(types_table[i]); + + if (resource_length < type_row->pattern_length) + continue; + + if (!g_str_has_prefix (type_row->sniffed_type, "image/")) + continue; + + /* All of the image types use all-\xFF for the mask, + * so we can just memcmp. + */ + if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0) + return g_strdup (type_row->sniffed_type); + } + + return g_strdup (content_type); +} + +static char* +sniff_feed_or_html (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer) +{ + const guchar *resource = (const guchar *)buffer->data; /* unsigned bytes, so the 0xEF/0xBB/0xBF comparisons can match even where plain char is signed */ + int resource_length = MIN (512, buffer->length); + int pos = 0; + + /* Skip a leading UTF-8 BOM (only if the buffer actually holds three bytes) */ + if (resource_length >= 3 && resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF) + pos = 3; + + look_for_tag: + /* Skip insignificant white space */ + while ((resource[pos] == '\x09') || + (resource[pos] == '\x20') || + (resource[pos] == '\x0A') || + (resource[pos] == '\x0D')) + pos++; + + /* != < */ + if (resource[pos] != '\x3C') + return g_strdup ("text/html"); + + pos++; + + /* Skipping comments */ + if ((resource[pos] == '\x2D') || + (resource[pos+1] == '\x2D') || + (resource[pos+2] == '\x3E')) { + pos = pos + 3; + + while ((resource[pos] != '\x2D') && + (resource[pos+1] != '\x2D') && + (resource[pos+2] != '\x3E')) + pos++; + + goto look_for_tag; + } + + /* == ! */ + if (resource[pos] == '\x21') { + do { + pos++; + } while (resource[pos] != '\x3E'); + + pos++; + + goto look_for_tag; + } else if (resource[pos] == '\x3F') { /* ? 
*/ + do { + pos++; + } while ((resource[pos] != '\x3F') && + (resource[pos+1] != '\x3E')); + + pos = pos + 2; + + goto look_for_tag; + } + + if ((resource[pos] == '\x72') && + (resource[pos+1] == '\x73') && + (resource[pos+2] == '\x73')) + return g_strdup ("application/rss+xml"); + + if ((resource[pos] == '\x66') && + (resource[pos+1] == '\x65') && + (resource[pos+2] == '\x65') && + (resource[pos+3] == '\x64')) + return g_strdup ("application/atom+xml"); + + return g_strdup ("text/html"); +} + +static char* +sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params) +{ + const char *content_type_with_params; + const char *content_type; + + content_type = soup_message_headers_get_content_type (msg->response_headers, params); + content_type_with_params = soup_message_headers_get_one (msg->response_headers, "Content-Type"); + + + /* These comparisons are done in an ASCII-case-insensitive + * manner because the spec requires it */ + if ((content_type == NULL) || + !g_ascii_strcasecmp (content_type, "unknown/unknown") || + !g_ascii_strcasecmp (content_type, "application/unknown") || + !g_ascii_strcasecmp (content_type, "*/*")) + return sniff_unknown (sniffer, msg, buffer, FALSE); + + if (g_str_has_suffix (content_type, "+xml") || + !g_ascii_strcasecmp (content_type, "text/xml") || + !g_ascii_strcasecmp (content_type, "application/xml")) + return g_strdup (content_type); + + /* 2.7.5 Content-Type sniffing: image + * The spec says: + * + * If the resource's official type is "image/svg+xml", then + * the sniffed type of the resource is its official type (an + * XML type) + * + * The XML case is handled by the if above; if you refactor + * this code, keep this in mind. 
+ */ + if (!g_ascii_strncasecmp (content_type, "image/", 6)) + return sniff_images (sniffer, msg, buffer, content_type); + + /* If we got text/plain, use text_or_binary */ + if (g_str_equal (content_type_with_params, "text/plain") || + g_str_equal (content_type_with_params, "text/plain; charset=ISO-8859-1") || + g_str_equal (content_type_with_params, "text/plain; charset=iso-8859-1") || + g_str_equal (content_type_with_params, "text/plain; charset=UTF-8")) { + return sniff_text_or_binary (sniffer, msg, buffer); + } + + if (!g_ascii_strcasecmp (content_type, "text/html")) + return sniff_feed_or_html (sniffer, msg, buffer); + + return g_strdup (content_type); +} + +static gsize +get_buffer_size (SoupContentSniffer *sniffer) +{ + return 512; +} + +static void +soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer) +{ + SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg); + SoupContentSnifferClass *content_sniffer_class = SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer); + + priv->should_sniff_content = TRUE; + priv->bytes_for_sniffing = content_sniffer_class->get_buffer_size (sniffer); +} + +static void +request_queued (SoupSessionFeature *feature, SoupSession *session, + SoupMessage *msg) +{ + SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg); + + priv->sniffer = g_object_ref (feature); + g_signal_connect (msg, "got-headers", + G_CALLBACK (soup_content_sniffer_got_headers_cb), + feature); +} + +static void +request_unqueued (SoupSessionFeature *feature, SoupSession *session, + SoupMessage *msg) +{ + SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg); + + g_object_unref (priv->sniffer); + priv->sniffer = NULL; + + g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature); +} diff --git a/libsoup/soup-content-sniffer.h b/libsoup/soup-content-sniffer.h new file mode 100644 index 0000000..a8aa915 --- /dev/null +++ b/libsoup/soup-content-sniffer.h @@ -0,0 +1,57 @@ +/* -*- Mode: C; tab-width: 8; 
indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * Copyright (C) 2009 Gustavo Noronha Silva. + */ + +#ifndef SOUP_CONTENT_SNIFFER_H +#define SOUP_CONTENT_SNIFFER_H 1 + +#include +#include + +G_BEGIN_DECLS + +#define SOUP_TYPE_CONTENT_SNIFFER (soup_content_sniffer_get_type ()) +#define SOUP_CONTENT_SNIFFER(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSniffer)) +#define SOUP_CONTENT_SNIFFER_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSnifferClass)) +#define SOUP_IS_CONTENT_SNIFFER(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), SOUP_TYPE_CONTENT_SNIFFER)) +#define SOUP_IS_CONTENT_SNIFFER_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), SOUP_TYPE_CONTENT_SNIFFER)) +#define SOUP_CONTENT_SNIFFER_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSnifferClass)) + +typedef struct _SoupContentSnifferPrivate SoupContentSnifferPrivate; + +typedef struct { + GObject parent; + + SoupContentSnifferPrivate *priv; +} SoupContentSniffer; + +typedef struct { + GObjectClass parent_class; + + char* (*sniff) (SoupContentSniffer *sniffer, + SoupMessage *msg, + SoupBuffer *buffer, + GHashTable **params); + gsize (*get_buffer_size) (SoupContentSniffer *sniffer); + + /* Padding for future expansion */ + void (*_libsoup_reserved1) (void); + void (*_libsoup_reserved2) (void); + void (*_libsoup_reserved3) (void); + void (*_libsoup_reserved4) (void); + void (*_libsoup_reserved5) (void); +} SoupContentSnifferClass; + +GType soup_content_sniffer_get_type (void); + +SoupContentSniffer *soup_content_sniffer_new (void); + +char *soup_content_sniffer_sniff (SoupContentSniffer *sniffer, + SoupMessage *msg, + SoupBuffer *buffer, + GHashTable **params); + +G_END_DECLS + +#endif /* SOUP_CONTENT_SNIFFER_H */ diff --git a/libsoup/soup-marshal.list b/libsoup/soup-marshal.list index 1a43570..d0c53ef 100644 --- a/libsoup/soup-marshal.list +++ b/libsoup/soup-marshal.list @@ -6,3 +6,4 @@ 
NONE:OBJECT,OBJECT NONE:OBJECT,POINTER NONE:BOXED,BOXED NONE:OBJECT,OBJECT,BOOLEAN +NONE:STRING,BOXED diff --git a/libsoup/soup-message-headers.c b/libsoup/soup-message-headers.c index f0abb78..185346e 100644 --- a/libsoup/soup-message-headers.c +++ b/libsoup/soup-message-headers.c @@ -226,6 +226,20 @@ find_header (SoupHeader *hdr_array, const char *interned_name, int nth) return -1; } +static int +find_last_header (SoupHeader *hdr_array, guint length, const char *interned_name, int nth) +{ + int i; + + for (i = (int) length - 1; i >= 0; i--) { /* start at the last valid index; hdr_array[length] is one past the end */ + if (hdr_array[i].name == interned_name) { + if (nth-- == 0) + return i; + } + } + return -1; +} + /** * soup_message_headers_remove: * @hdrs: a #SoupMessageHeaders @@ -277,12 +291,15 @@ const char * soup_message_headers_get_one (SoupMessageHeaders *hdrs, const char *name) { SoupHeader *hdr_array = (SoupHeader *)(hdrs->array->data); + guint hdr_length = hdrs->array->len; int index; g_return_val_if_fail (name != NULL, NULL); name = intern_header_name (name, NULL); - index = find_header (hdr_array, name, 0); + + index = find_last_header (hdr_array, hdr_length, name, 0); + + return (index == -1) ? 
NULL : hdr_array[index].value; } diff --git a/libsoup/soup-message-io.c b/libsoup/soup-message-io.c index 8e04b66..10657b7 100644 --- a/libsoup/soup-message-io.c +++ b/libsoup/soup-message-io.c @@ -18,6 +18,7 @@ #include "soup-misc.h" #include "soup-socket.h" #include "soup-ssl.h" +#include "soup-uri.h" typedef enum { SOUP_MESSAGE_IO_CLIENT, @@ -53,6 +54,11 @@ typedef struct { SoupMessageBody *read_body; goffset read_length; + gboolean acked_content_sniff_decision; + gboolean delay_got_chunks; + SoupMessageBody *delayed_chunk_data; + gsize delayed_chunk_length; + SoupMessageIOState write_state; SoupEncoding write_encoding; GString *write_buf; @@ -105,6 +111,9 @@ soup_message_io_cleanup (SoupMessage *msg) if (io->write_chunk) soup_buffer_free (io->write_chunk); + if (io->delayed_chunk_data) + soup_message_body_free (io->delayed_chunk_data); + g_slice_free (SoupMessageIOData, io); } @@ -207,6 +216,35 @@ io_disconnected (SoupSocket *sock, SoupMessage *msg) io_error (sock, msg, NULL); } +static gboolean +io_sniff_content (SoupMessage *msg) +{ + SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg); + SoupMessageIOData *io = priv->io_data; + SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data); + char *sniffed_mime_type; + GHashTable *params = NULL; + + io->delay_got_chunks = FALSE; + + sniffed_mime_type = soup_content_sniffer_sniff (priv->sniffer, msg, sniffed_buffer, &params); + SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK; + soup_message_content_sniffed (msg, sniffed_mime_type, params); + g_free (sniffed_mime_type); + if (params) + g_hash_table_destroy (params); + SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE); + + SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK; + soup_message_got_chunk (msg, sniffed_buffer); + soup_buffer_free (sniffed_buffer); + soup_message_body_free (io->delayed_chunk_data); + io->delayed_chunk_data = NULL; + SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE); + + return TRUE; +} + /* Reads data from io->sock into 
io->read_meta_buf. If @to_blank is * %TRUE, it reads up until a blank line ("CRLF CRLF" or "LF LF"). * Otherwise, it reads up until a single CRLF or LF. @@ -294,6 +332,21 @@ read_body_chunk (SoupMessage *msg) GError *error = NULL; SoupBuffer *buffer; + if (!io->acked_content_sniff_decision) { + /* The content sniffer feature decides whether a + * message needs to be sniffed while handling + * got-headers, but the message may be paused in a + * user handler, so we need to make sure the signal is + * emitted, or delay_got_chunks is correctly setup + * here. + */ + if (priv->should_sniff_content) + io->delay_got_chunks = TRUE; + else if (priv->sniffer) + soup_message_content_sniffed (msg, NULL, NULL); + io->acked_content_sniff_decision = TRUE; + } + while (read_to_eof || io->read_length > 0) { if (priv->chunk_allocator) { buffer = priv->chunk_allocator (msg, io->read_length, priv->chunk_allocator_data); @@ -324,10 +377,24 @@ read_body_chunk (SoupMessage *msg) io->read_length -= nread; - SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK; - soup_message_got_chunk (msg, buffer); - soup_buffer_free (buffer); - SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE); + if (io->delay_got_chunks) { + if (!io->delayed_chunk_data) + io->delayed_chunk_data = soup_message_body_new (); + + soup_message_body_append_buffer (io->delayed_chunk_data, buffer); + io->delayed_chunk_length += buffer->length; + + /* We already have enough data to perform sniffing, so do it */ + if (io->delayed_chunk_length > priv->bytes_for_sniffing) { + if (!io_sniff_content (msg)) + return FALSE; + } + } else { + SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK; + soup_message_got_chunk (msg, buffer); + soup_buffer_free (buffer); + SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE); + } continue; } @@ -675,6 +742,23 @@ io_read (SoupSocket *sock, SoupMessage *msg) guint status; read_more: + /* We have delayed chunks, but are no longer delaying, so this + * means we already sniffed but the message got paused while + * 
content-sniffed was being handled, in which case we did not + * emit the necessary got-chunk; See also the handling for + * state SOUP_MESSAGE_IO_STATE_BODY in the switch below. + */ + if (io->delayed_chunk_data && !io->delay_got_chunks) { + SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data); + + SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK; + soup_message_got_chunk (msg, sniffed_buffer); + soup_buffer_free (sniffed_buffer); + soup_message_body_free (io->delayed_chunk_data); + io->delayed_chunk_data = NULL; + SOUP_MESSAGE_IO_RETURN_IF_CANCELLED_OR_PAUSED; + } + switch (io->read_state) { case SOUP_MESSAGE_IO_STATE_NOT_STARTED: return; @@ -782,6 +866,39 @@ io_read (SoupSocket *sock, SoupMessage *msg) return; got_body: + /* A chunk of data may have been read and the emission + * of got_chunk delayed because we wanted to wait for + * more chunks to arrive, for doing content sniffing, + * but the body was too small, so we need to check if + * an emission is in order here, along with the + * sniffing, if we haven't done it yet, of course. + */ + if (io->delayed_chunk_data) { + if (io->delay_got_chunks) { + if (!io_sniff_content (msg)) + return; + } else { + SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data); + + SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK; + soup_message_got_chunk (msg, sniffed_buffer); + soup_buffer_free (sniffed_buffer); + soup_message_body_free (io->delayed_chunk_data); + io->delayed_chunk_data = NULL; + + /* If we end up returning, read_state + * needs to be set to IO_STATE_BODY, + * and read_length must be 0; since we + * may be coming from STATE_TRAILERS, + * or may be doing a read-to-eof, we + * sanitize these here. 
+ */ + io->read_state = SOUP_MESSAGE_IO_STATE_BODY; + io->read_length = 0; + SOUP_MESSAGE_IO_RETURN_IF_CANCELLED_OR_PAUSED; + } + } + io->read_state = SOUP_MESSAGE_IO_STATE_FINISHING; SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK; @@ -885,6 +1002,9 @@ new_iostate (SoupMessage *msg, SoupSocket *sock, SoupMessageIOMode mode, io->read_state = SOUP_MESSAGE_IO_STATE_NOT_STARTED; io->write_state = SOUP_MESSAGE_IO_STATE_NOT_STARTED; + if (priv->should_sniff_content) + io->delay_got_chunks = TRUE; + if (priv->io_data) soup_message_io_cleanup (msg); priv->io_data = io; diff --git a/libsoup/soup-message-private.h b/libsoup/soup-message-private.h index f47251a..999c335 100644 --- a/libsoup/soup-message-private.h +++ b/libsoup/soup-message-private.h @@ -9,6 +9,7 @@ #include "soup-message.h" #include "soup-auth.h" #include "soup-connection.h" +#include "soup-content-sniffer.h" typedef enum { SOUP_MESSAGE_IO_STATUS_IDLE, @@ -29,6 +30,10 @@ typedef struct { guint msg_flags; gboolean server_side; + SoupContentSniffer *sniffer; + gboolean should_sniff_content; + gsize bytes_for_sniffing; + SoupHTTPVersion http_version, orig_http_version; SoupURI *uri; diff --git a/libsoup/soup-message.c b/libsoup/soup-message.c index 5475bb7..f614946 100644 --- a/libsoup/soup-message.c +++ b/libsoup/soup-message.c @@ -99,6 +99,7 @@ enum { GOT_HEADERS, GOT_CHUNK, GOT_BODY, + CONTENT_SNIFFED, RESTARTED, FINISHED, @@ -402,6 +403,44 @@ soup_message_class_init (SoupMessageClass *message_class) G_TYPE_NONE, 0); /** + * SoupMessage::content-sniffed: + * @msg: the message + * @type: the content type that we got from sniffing + * @params: a #GHashTable with the parameters + * + * This signal is emitted after %got-headers, and before the + * first %got-chunk. If content sniffing is disabled, or no + * content sniffing will be performed, due to the sniffer + * deciding to trust the Content-Type sent by the server, this + * signal is emitted immediately after %got_headers, and @type + * is %NULL. 
+ * + * If the #SoupContentSniffer feature is enabled, and the + * sniffer decided to perform sniffing, the first %got_chunk + * emission may be delayed, so that the sniffer has enough + * data to correctly sniff the content. It notifies the + * library user that the content has been sniffed, and allows + * it to change the header contents in the message, if + * desired. + * + * After this signal is emitted, the data that was spooled so + * that sniffing could be done is delivered on the first + * emission of %got_chunk. + * + * Since: 2.27.3 + **/ + signals[CONTENT_SNIFFED] = + g_signal_new ("content_sniffed", + G_OBJECT_CLASS_TYPE (object_class), + G_SIGNAL_RUN_FIRST, + 0, + NULL, NULL, + soup_marshal_NONE__STRING_BOXED, + G_TYPE_NONE, 2, + G_TYPE_STRING, + G_TYPE_HASH_TABLE); + + /** * SoupMessage::restarted: * @msg: the message * @@ -858,6 +897,24 @@ soup_message_got_body (SoupMessage *msg) g_signal_emit (msg, signals[GOT_BODY], 0); } +/** + * soup_message_content_sniffed: + * @msg: a #SoupMessage + * @content_type: a string with the sniffed content type + * @params: a #GHashTable with the parameters + * + * Emits the %content_sniffed signal, indicating that the IO layer + * finished sniffing the content type for @msg. If content sniffing + * will not be performed, due to the sniffer deciding to trust the + * Content-Type sent by the server, this signal is emitted immediately + * after %got_headers, with %NULL as @content_type. 
+ **/ +void +soup_message_content_sniffed (SoupMessage *msg, const char *content_type, GHashTable *params) +{ + g_signal_emit (msg, signals[CONTENT_SNIFFED], 0, content_type, params); +} + static void restarted (SoupMessage *req) { diff --git a/libsoup/soup-message.h b/libsoup/soup-message.h index 1b850be..b940ac6 100644 --- a/libsoup/soup-message.h +++ b/libsoup/soup-message.h @@ -155,6 +155,7 @@ void soup_message_got_informational (SoupMessage *msg); void soup_message_got_headers (SoupMessage *msg); void soup_message_got_chunk (SoupMessage *msg, SoupBuffer *chunk); void soup_message_got_body (SoupMessage *msg); +void soup_message_content_sniffed (SoupMessage *msg, const char *content_type, GHashTable *params); void soup_message_restarted (SoupMessage *msg); void soup_message_finished (SoupMessage *msg); diff --git a/libsoup/soup.h b/libsoup/soup.h index 496a4c1..ddb73f7 100644 --- a/libsoup/soup.h +++ b/libsoup/soup.h @@ -15,6 +15,7 @@ extern "C" { #include #include #include +#include #include #include #include diff --git a/tests/Makefile.am b/tests/Makefile.am index 0d46df5..ca8158d 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -26,6 +26,7 @@ noinst_PROGRAMS = \ redirect-test \ simple-httpd \ simple-proxy \ + sniffing-test \ streaming-test \ timeout-test \ uri-parsing \ @@ -58,6 +59,7 @@ redirect_test_SOURCES = redirect-test.c $(TEST_SRCS) server_auth_test_SOURCES = server-auth-test.c $(TEST_SRCS) simple_httpd_SOURCES = simple-httpd.c simple_proxy_SOURCES = simple-proxy.c +sniffing_test_SOURCES = sniffing-test.c $(TEST_SRCS) ssl_test_SOURCES = ssl-test.c $(TEST_SRCS) streaming_test_SOURCES = streaming-test.c $(TEST_SRCS) timeout_test_SOURCES = timeout-test.c $(TEST_SRCS) @@ -87,6 +89,7 @@ TESTS = \ misc-test \ ntlm-test \ redirect-test \ + sniffing-test \ streaming-test \ timeout-test \ uri-parsing \ diff --git a/tests/resources/atom.xml b/tests/resources/atom.xml new file mode 100644 index 0000000..962ecf4 --- /dev/null +++ 
b/tests/resources/atom.xml @@ -0,0 +1,35 @@ + + + A small ATOM feed + 2009-07-02T10:27:44Z + kov + + Anonymous Coward + + http://libsoup.rocks/atom.xml + + + + + http://libsoup.rocks/so/much/ + + One post too many + woo [...] +

woohoo

+
+ 2009-07-02T10:38:28Z + + + + kov + + + http://libsoup.rocks/blog + + + Just stuff to test libsoup + Random stuff to test libsoup + 2009-07-02T00:38:29Z + +
+
diff --git a/tests/resources/home.gif b/tests/resources/home.gif new file mode 100644 index 0000000000000000000000000000000000000000..55e1d5993fc8ba20b1febb5a395a0cc5c80879a1 GIT binary patch literal 995 zcmZ?wbh9u|RA%63_|Cu}&?BJOqo_GW({PTV(;O$SrC!NvlZ*Ejx9n}{Io31f+?1u~ zmTtYb_1L{*=boRt_Ws)Q_s`#dfB%0JjE2Cl2mu`iAOPhB296U9svI&N8x|aF<`CA3 zIk91(bEANSNrA#aM`ktwIU^GVhXfWzb`v8PhXe&i7G~K1krxV0UYvsLZV3Vp6+9Xy}<-Pfit2&px)3vi@}Ecoi+!6qxs r(ZS&8c(6lQJMIjJqGMAJzuvhVhQ!S+Y#bZ9V$SZ``nrvgk--`Oh`uWh literal 0 HcmV?d00001 diff --git a/tests/resources/mbox b/tests/resources/mbox new file mode 100644 index 0000000..929ad2b --- /dev/null +++ b/tests/resources/mbox @@ -0,0 +1,16 @@ +From email@here Wed Jun 17 21:20:48 2009 +Return-path: +Envelope-to: email@here +Delivery-date: Wed, 17 Jun 2009 21:20:48 -0300 +Received: from email by here.domain with local (Exim 4.69) + (envelope-from ) + id 1MH5N2-0008Lq-7c + for email@here; Wed, 17 Jun 2009 21:20:48 -0300 +To: email@here +Subject: This is just so that I have a mailbox +Message-Id: +From: A Nice User +Date: Wed, 17 Jun 2009 21:20:48 -0300 + +This is a dumb email. + diff --git a/tests/resources/rss20.xml b/tests/resources/rss20.xml new file mode 100644 index 0000000..d64bdda --- /dev/null +++ b/tests/resources/rss20.xml @@ -0,0 +1,26 @@ + + + + + A small RSS + http://libsoup.rocks/ + en + A small RSS to test libsoup + + + One post too many + http://libsoup.rocks/so/much/ + http://libsoup.rocks/so/much/ + <p>woohoo</p> + Wed, 02 Jul 2009 10:26:28 +0000 + + + GCDS will rock + http://libsoup.rocks/so/much/again/ + http://libsoup.rocks/so/much/again/ + <p>I mean, really.</p> + Wed, 02 Jul 2009 10:26:28 +0000 + + + + diff --git a/tests/resources/test.html b/tests/resources/test.html new file mode 100644 index 0000000..5a6cc0c --- /dev/null +++ b/tests/resources/test.html @@ -0,0 +1,10 @@ + + + + + + + +

GNOME!

+
+
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
new file mode 100644
index 0000000..ad2690f
--- /dev/null
+++ b/tests/sniffing-test.c
@@ -0,0 +1,458 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009 Gustavo Noronha Silva.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <libsoup/soup.h>
+
+#include "test-utils.h"
+
+SoupSession *session;
+SoupURI *base_uri;
+SoupMessageBody *chunk_data;
+
+/* Test server handler: serves files from resources/ with a
+ * Content-Type chosen by the request path, optionally using chunked
+ * encoding when the query contains chunked=yes.
+ */
+static void
+server_callback (SoupServer *server, SoupMessage *msg,
+		 const char *path, GHashTable *query,
+		 SoupClientContext *context, gpointer data)
+{
+	GError *error = NULL;
+	char *chunked;
+	char *contents;
+	gsize length;
+
+	if (msg->method != SOUP_METHOD_GET) {
+		soup_message_set_status (msg, SOUP_STATUS_NOT_IMPLEMENTED);
+		return;
+	}
+
+	soup_message_set_status (msg, SOUP_STATUS_OK);
+
+	if (query) {
+		chunked = g_hash_table_lookup (query, "chunked");
+		if (chunked && g_str_equal (chunked, "yes"))
+			soup_message_headers_set_encoding (msg->response_headers,
+							   SOUP_ENCODING_CHUNKED);
+	}
+
+	if (!strcmp (path, "/mbox")) {
+		g_file_get_contents ("resources/mbox",
+				     &contents, &length,
+				     &error);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "text/plain",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
+
+	if (g_str_has_prefix (path, "/text_or_binary/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "text/plain",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
+
+	if (g_str_has_prefix (path, "/unknown/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "UNKNOWN/unknown",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	}
+
+	if (g_str_has_prefix (path, "/type/")) {
+		char **components = g_strsplit (path, "/", 4);
+		char *ptr;
+
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		/* Hack to allow passing type in the URI */
+		ptr = g_strrstr (components[2], "_");
+		if (ptr)
+			*ptr = '/';
+
+		soup_message_set_response (msg, components[2],
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+
+		g_strfreev (components);
+	}
+
+	if (g_str_has_prefix (path, "/multiple_headers/")) {
+		char *base_name = g_path_get_basename (path);
+		char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+		g_file_get_contents (file_name,
+				     &contents, &length,
+				     &error);
+
+		g_free (base_name);
+		g_free (file_name);
+
+		if (error) {
+			g_error ("%s", error->message);
+			g_error_free (error);
+			exit (1);
+		}
+
+		soup_message_set_response (msg, "text/xml",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+
+		soup_message_headers_append (msg->response_headers,
+					     "Content-Type", "text/plain");
+	}
+
+}
+
+/* Idle callback that resumes a message paused by a signal handler. */
+static gboolean
+unpause_msg (gpointer data)
+{
+	SoupMessage *msg = (SoupMessage*)data;
+	soup_session_unpause_message (session, msg);
+	return FALSE;
+}
+
+
+/* content-sniffed handler: flags got-chunk having fired first, records
+ * the emission on the message, and optionally pauses the message.
+ */
+static void
+content_sniffed (SoupMessage *msg, char *content_type, GHashTable *params, gpointer data)
+{
+	gboolean should_pause = GPOINTER_TO_INT (data);
+
+	if (g_object_get_data (G_OBJECT (msg), "got-chunk")) {
+		debug_printf (1, " got-chunk got emitted before content-sniffed\n");
+		errors++;
+	}
+
+	g_object_set_data (G_OBJECT (msg), "content-sniffed", GINT_TO_POINTER (TRUE));
+
+	if (should_pause) {
+		soup_session_pause_message (session, msg);
+		g_idle_add (unpause_msg, msg);
+	}
+}
+
+/* got-headers handler: flags content-sniffed having fired first,
+ * records the emission, and optionally pauses the message.
+ */
+static void
+got_headers (SoupMessage *msg, gpointer data)
+{
+	gboolean should_pause = GPOINTER_TO_INT (data);
+
+	if (g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+		debug_printf (1, " content-sniffed got emitted before got-headers\n");
+		errors++;
+	}
+
+	g_object_set_data (G_OBJECT (msg), "got-headers", GINT_TO_POINTER (TRUE));
+
+	if (should_pause) {
+		soup_session_pause_message (session, msg);
+		g_idle_add (unpause_msg, msg);
+	}
+}
+
+/* got-chunk handler: records the emission and, when the message is not
+ * accumulating its body, collects the chunks in chunk_data.
+ */
+static void
+got_chunk (SoupMessage *msg, SoupBuffer *chunk, gpointer data)
+{
+	gboolean should_accumulate = GPOINTER_TO_INT (data);
+
+	g_object_set_data (G_OBJECT (msg), "got-chunk", GINT_TO_POINTER (TRUE));
+
+	if (!should_accumulate) {
+		if (!chunk_data)
+			chunk_data = soup_message_body_new ();
+		soup_message_body_append_buffer (chunk_data, chunk);
+	}
+}
+
+/* Session callback: stops the main loop once the message is done. */
+static void
+finished (SoupSession *session, SoupMessage *msg, gpointer data)
+{
+	GMainLoop *loop = (GMainLoop*)data;
+	g_main_loop_quit (loop);
+}
+
+/* Fetches /mbox and checks signal ordering, whether content-sniffed was
+ * emitted as expected, and that the received body matches the file.
+ */
+static void
+do_signals_test (gboolean should_content_sniff,
+		 gboolean should_pause,
+		 gboolean should_accumulate,
+		 gboolean chunked_encoding)
+{
+	SoupURI *uri = soup_uri_new_with_base (base_uri, "/mbox");
+	SoupMessage *msg;
+	GMainLoop *loop = g_main_loop_new (NULL, TRUE);
+	char *contents;
+	gsize length;
+	GError *error = NULL;
+	SoupBuffer *body;
+
+	/* The message keeps its own copy of the URI, so the query has
+	 * to be in place before the message is created.
+	 */
+	if (chunked_encoding)
+		soup_uri_set_query (uri, "chunked=yes");
+
+	msg = soup_message_new_from_uri ("GET", uri);
+	soup_message_body_set_accumulate (msg->response_body, should_accumulate);
+
+	g_object_connect (msg,
+			  "signal::got-headers", got_headers, GINT_TO_POINTER (should_pause),
+			  "signal::got-chunk", got_chunk, GINT_TO_POINTER (should_accumulate),
+			  "signal::content_sniffed", content_sniffed, GINT_TO_POINTER (should_pause),
+			  NULL);
+
+	g_object_ref (msg);
+
+	soup_session_queue_message (session, msg, finished, loop);
+
+	g_main_loop_run (loop);
+
+	if (!should_content_sniff &&
+	    g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+		debug_printf (1, " content-sniffed got emitted without a sniffer\n");
+		errors++;
+	} else if (should_content_sniff &&
+		   !g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+		debug_printf (1, " content-sniffed did not get emitted\n");
+		errors++;
+	}
+
+	g_file_get_contents ("resources/mbox",
+			     &contents, &length,
+			     &error);
+
+	if (error) {
+		g_error ("%s", error->message);
+		g_error_free (error);
+		exit (1);
+	}
+
+	if (!should_accumulate) {
+		body = soup_message_body_flatten (chunk_data);
+		soup_message_body_free (chunk_data);
+		chunk_data = NULL;
+	} else
+		body = soup_message_body_flatten (msg->response_body);
+
+	/* Only compare contents of equal length, to keep memcmp in bounds */
+	if (body->length != length) {
+		debug_printf (1, " lengths do not match\n");
+		errors++;
+	} else if (memcmp (body->data, contents, length)) {
+		debug_printf (1, " downloaded data does not match\n");
+		errors++;
+	}
+
+	g_free (contents);
+	soup_buffer_free (body);
+
+	soup_uri_free (uri);
+	g_object_unref (msg);
+	g_main_loop_unref (loop);
+}
+
+/* content-sniffed handler: checks the sniffed type against the expected
+ * type passed as user data.
+ */
+static void
+sniffing_content_sniffed (SoupMessage *msg, char *content_type, GHashTable *params, gpointer data)
+{
+	char *expected_type = (char*)data;
+
+	/* content_type is NULL when no sniffing was performed */
+	if (!content_type || strcmp (content_type, expected_type)) {
+		debug_printf (1, " sniffing failed! expected %s, got %s\n",
+			      expected_type,
+			      content_type ? content_type : "(null)");
+		errors++;
+	}
+}
+
+/* Fetches path and verifies the type reported by content-sniffed. */
+static void
+test_sniffing (const char *path, const char *expected_type)
+{
+	SoupURI *uri = soup_uri_new_with_base (base_uri, path);
+	SoupMessage *msg = soup_message_new_from_uri ("GET", uri);
+	GMainLoop *loop = g_main_loop_new (NULL, TRUE);
+
+	g_object_connect (msg,
+			  "signal::content_sniffed", sniffing_content_sniffed, expected_type,
+			  NULL);
+
+	g_object_ref (msg);
+
+	soup_session_queue_message (session, msg, finished, loop);
+
+	g_main_loop_run (loop);
+
+	soup_uri_free (uri);
+	g_object_unref (msg);
+	g_main_loop_unref (loop);
+}
+
+int
+main (int argc, char **argv)
+{
+	SoupServer *server;
+	SoupContentSniffer *sniffer;
+
+	test_init (argc, argv, NULL);
+
+	server = soup_test_server_new (TRUE);
+	soup_server_add_handler (server, NULL, server_callback, NULL, NULL);
+	base_uri = soup_uri_new ("http://127.0.0.1/");
+	soup_uri_set_port (base_uri, soup_server_get_port (server));
+
+	session = soup_session_async_new ();
+
+	/* No sniffer, no content_sniffed should be emitted */
+	do_signals_test (FALSE, FALSE, FALSE, FALSE);
+	do_signals_test (FALSE, FALSE, FALSE, TRUE);
+	do_signals_test (FALSE, FALSE, TRUE, FALSE);
+	do_signals_test (FALSE, FALSE, TRUE, TRUE);
+
+	do_signals_test (FALSE, TRUE, TRUE, FALSE);
+	do_signals_test (FALSE, TRUE, TRUE, TRUE);
+	do_signals_test (FALSE, TRUE, FALSE, FALSE);
+	do_signals_test (FALSE, TRUE, FALSE, TRUE);
+
+	sniffer = soup_content_sniffer_new ();
+	soup_session_add_feature (session, (SoupSessionFeature*)sniffer);
+
+	/* Now, with a sniffer, content_sniffed must be emitted after
+	 * got-headers, and before got-chunk.
+	 */
+	do_signals_test (TRUE, FALSE, FALSE, FALSE);
+	do_signals_test (TRUE, FALSE, FALSE, TRUE);
+	do_signals_test (TRUE, FALSE, TRUE, FALSE);
+	do_signals_test (TRUE, FALSE, TRUE, TRUE);
+
+	do_signals_test (TRUE, TRUE, TRUE, FALSE);
+	do_signals_test (TRUE, TRUE, TRUE, TRUE);
+	do_signals_test (TRUE, TRUE, FALSE, FALSE);
+	do_signals_test (TRUE, TRUE, FALSE, TRUE);
+
+	/* Test the text_or_binary sniffing path */
+
+	/* GIF is a 'safe' type */
+	test_sniffing ("/text_or_binary/home.gif", "image/gif");
+
+	/* With our current code, no sniffing is done using GIO, so
+	 * the mbox will be identified as text/plain; should we change
+	 * this?
+	 */
+	test_sniffing ("/text_or_binary/mbox", "text/plain");
+
+	/* HTML is considered unsafe for this algorithm, since it is
+	 * scriptable, so going from text/plain to text/html is
+	 * considered 'privilege escalation'
+	 */
+	test_sniffing ("/text_or_binary/test.html", "text/plain");
+
+	/* Test the unknown sniffing path */
+
+	test_sniffing ("/unknown/test.html", "text/html");
+	test_sniffing ("/unknown/home.gif", "image/gif");
+	test_sniffing ("/unknown/mbox", "application/mbox");
+
+	/* Test the XML sniffing path */
+
+	test_sniffing ("/type/text_xml/home.gif", "text/xml");
+	test_sniffing ("/type/anice_type+xml/home.gif", "anice/type+xml");
+	test_sniffing ("/type/application_xml/home.gif", "application/xml");
+
+	/* Test the image sniffing path */
+
+	test_sniffing ("/type/image_png/home.gif", "image/gif");
+
+	/* Test the feed or html path */
+
+	test_sniffing ("/type/text_html/test.html", "text/html");
+	test_sniffing ("/type/text_html/rss20.xml", "application/rss+xml");
+	test_sniffing ("/type/text_html/atom.xml", "application/atom+xml");
+
+	/* The spec tells us to only use the last Content-Type header */
+
+	test_sniffing ("/multiple_headers/home.gif", "image/gif");
+
+	soup_uri_free (base_uri);
+
+	test_cleanup ();
+	return errors != 0;
+}
-- 
2.7.4