1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * soup-content-sniffer.c
5 * Copyright (C) 2009 Gustavo Noronha Silva.
15 #include "soup-content-sniffer.h"
16 #include "soup-enum-types.h"
17 #include "soup-message.h"
18 #include "soup-message-private.h"
19 #include "soup-session-feature.h"
23 * SECTION:soup-content-sniffer
24 * @short_description: Content sniffing for #SoupSession
26 * A #SoupContentSniffer tries to detect the actual content type of
27 * the files that are being downloaded by looking at some of the data
28 * before the #SoupMessage emits its #SoupMessage::got-headers signal.
29 * #SoupContentSniffer implements #SoupSessionFeature, so you can add
30 * content sniffing to a session with soup_session_add_feature() or
31 * soup_session_add_feature_by_type().
/* Forward declarations for this file's private functions: the default
 * implementations of the class virtual methods (sniff, get_buffer_size)
 * and the SoupSessionFeature interface hooks (interface initialiser,
 * request_queued, request_unqueued).
 */
36 static char *sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params);
37 static gsize get_buffer_size (SoupContentSniffer *sniffer);
39 static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
41 static void request_queued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
42 static void request_unqueued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
/* Defines the SoupContentSniffer type (function prefix
 * soup_content_sniffer, parent type G_TYPE_OBJECT) and declares that it
 * implements the SOUP_TYPE_SESSION_FEATURE interface, whose vtable is
 * filled in by soup_content_sniffer_session_feature_init().
 */
44 G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
45 G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
46 soup_content_sniffer_session_feature_init))
/* Instance initialiser that G_DEFINE_TYPE_WITH_CODE expects for the
 * soup_content_sniffer prefix.
 * NOTE(review): only the signature line is visible here; the return
 * type and the body cannot be confirmed from this view.
 */
49 soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
/* Class initialiser: installs this file's sniff() and get_buffer_size()
 * as the implementations stored in the class's sniff and
 * get_buffer_size virtual-method slots.
 */
54 soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
56 content_sniffer_class->sniff = sniff;
57 content_sniffer_class->get_buffer_size = get_buffer_size;
/* Interface initialiser named in G_IMPLEMENT_INTERFACE: points the
 * SoupSessionFeature interface's request_queued and request_unqueued
 * slots at this file's handlers.  @interface_data is not used by the
 * visible lines.
 */
61 soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
62 gpointer interface_data)
64 feature_interface->request_queued = request_queued;
65 feature_interface->request_unqueued = request_unqueued;
/* Public constructor: instantiates SOUP_TYPE_CONTENT_SNIFFER through
 * g_object_new() without setting any properties (the property list is
 * just the terminating NULL).
 */
69 * soup_content_sniffer_new:
71 * Creates a new #SoupContentSniffer.
73 * Returns: a new #SoupContentSniffer
78 soup_content_sniffer_new ()
80 return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
/* Public entry point for sniffing.
 *
 * Validates the arguments and then dispatches to the sniff virtual
 * method of @sniffer's class, forwarding @msg, @buffer and @params
 * unchanged and returning whatever that method returns.
 *
 * Returns NULL (via g_return_val_if_fail) when @sniffer is not a
 * SoupContentSniffer, @msg is not a SoupMessage, or @buffer is NULL.
 */
84 soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
85 SoupMessage *msg, SoupBuffer *buffer,
/* Precondition checks; each failure makes the function return NULL. */
88 g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
89 g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
90 g_return_val_if_fail (buffer != NULL, NULL);
/* Virtual dispatch, so a subclass can supply its own sniff(). */
92 return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
/* One row of the signature table used by the sniffing functions.
 * NOTE(review): besides the members visible below, code in this file
 * also reads has_ws, mask and scriptable from each row; their
 * declarations are not visible in this view.
 */
95 /* This table is based on the HTML5 spec;
96 * See 2.7.4 Content-Type sniffing: unknown type
99 /* @has_ws is TRUE if @pattern contains "generic" whitespace */
/* Bytes that the masked input bytes are compared against. */
102 const guchar *pattern;
/* Number of bytes of @pattern (and of the mask) taking part in a match. */
103 guint pattern_length;
/* MIME type string that is duplicated and returned when the row matches. */
104 const char *sniffed_type;
106 } SoupContentSnifferPattern;
/* Signature rows consulted by sniff_unknown() and sniff_images().
 *
 * Each visible pair of byte strings appears to be a mask followed by the
 * pattern it is compared with: sniff_unknown() treats a row as matching
 * when (mask[i] & data[i]) == pattern[i] for every counted byte.  A mask
 * byte of 0xDF clears bit 0x20, so ASCII letters match in either case;
 * a mask byte of 0x00 ignores the input byte.  A leading ' ' in a
 * pattern is the whitespace placeholder handled by sniff_unknown().
 *
 * NOTE(review): only some members of each row are visible here; the
 * has_ws, pattern_length and scriptable values (and most sniffed types)
 * cannot be checked from this view.
 */
108 static SoupContentSnifferPattern types_table[] = {
/* "<!DOCTYPE HTML", letters matched case-insensitively */
110 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
111 (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
/* whitespace placeholder + "<HTML", letters matched case-insensitively */
117 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
118 (const guchar *)" \x3C\x48\x54\x4D\x4C",
/* whitespace placeholder + "<HEAD", letters matched case-insensitively */
124 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
125 (const guchar *)" \x3C\x48\x45\x41\x44",
/* whitespace placeholder + "<SCRIPT", letters matched case-insensitively */
131 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
132 (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
/* "%PDF-" */
138 (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
139 (const guchar *)"\x25\x50\x44\x46\x2D",
/* "%!PS-Adobe-" */
145 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
146 (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
148 "application/postscript",
/* FE FF followed by two ignored bytes (the UTF-16BE byte-order mark) */
152 (const guchar *)"\xFF\xFF\x00\x00",
153 (const guchar *)"\xFE\xFF\x00\x00",
/* FF FF followed by two ignored bytes.
 * NOTE(review): the UTF-16LE byte-order mark is FF FE, which is what
 * sniff_text_or_binary() tests for; this pattern starts FF FF instead.
 * Confirm whether "\xFF\xFE\x00\x00" was intended.
 */
159 (const guchar *)"\xFF\xFF\x00\x00",
160 (const guchar *)"\xFF\xFF\x00\x00",
/* EF BB BF followed by one ignored byte (the UTF-8 byte-order mark) */
166 (const guchar *)"\xFF\xFF\xFF\x00",
167 (const guchar *)"\xEF\xBB\xBF\x00",
/* "GIF87a" */
173 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
174 (const guchar *)"\x47\x49\x46\x38\x37\x61",
/* "GIF89a" */
180 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
181 (const guchar *)"\x47\x49\x46\x38\x39\x61",
/* 0x89 "PNG" CR LF 0x1A LF (the PNG file signature) */
187 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
188 (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
/* FF D8 FF (the bytes a JPEG stream starts with) */
194 (const guchar *)"\xFF\xFF\xFF",
195 (const guchar *)"\xFF\xD8\xFF",
/* "BM" */
201 (const guchar *)"\xFF\xFF",
202 (const guchar *)"\x42\x4D",
/* 00 00 01 00 */
208 (const guchar *)"\xFF\xFF\xFF\xFF",
209 (const guchar *)"\x00\x00\x01\x00",
211 "image/vnd.microsoft.icon",
/* 256-entry lookup table indexed by byte value; a non-zero entry marks
 * the byte as binary-looking (see the loop in sniff_text_or_binary()).
 * The only flagged values are control characters in 0x00-0x1F, with
 * 0x09, 0x0A, 0x0C, 0x0D and 0x1B left unflagged; every value from 0x20
 * upwards is 0.
 */
215 /* Whether a given byte looks like it might be part of binary content.
216 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
217 * which is BSD-licensed
219 static char byte_looks_binary[] = {
220 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, /* 0x00 - 0x0F */
221 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, /* 0x10 - 0x1F */
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */
223 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */
224 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4F */
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x5F */
226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 - 0x7F */
228 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */
232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xCF */
233 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */
234 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xEF */
235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0 - 0xFF */
/* Platform fallback: asks GIO to guess a content type from the message's
 * URI string together with the bytes in @buffer, then converts GIO's
 * content type into a MIME type string.
 * NOTE(review): the declarations, the return statement and any handling
 * of uri_path / uncertain are not visible here; presumably mime_type is
 * what gets returned - confirm.
 */
239 sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
247 uri = soup_message_get_uri (msg);
/* The URI string is only used as the file-name hint for GIO's guess. */
248 uri_path = soup_uri_to_string (uri, TRUE);
/* uncertain receives GIO's confidence flag for the guess. */
250 content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, &uncertain);
251 mime_type = g_content_type_get_mime_type (content_type);
/* GIO's content type string is no longer needed once converted. */
254 g_free (content_type);
/* Sniffs a type when the declared type is missing or unusable.
 *
 * Examines at most the first 512 bytes of @buffer and tries the rows of
 * types_table in order, returning a newly allocated copy of the
 * sniffed_type of a row that matches.  When @for_text_or_binary is TRUE,
 * rows marked scriptable are not considered.
 *
 * After the table, sniff_gio() supplies a platform guess; in the
 * text-or-binary case that guess is compared (ASCII case-insensitively)
 * with the sniffed_type of the scriptable rows.  The last visible
 * fallback is a copy of "application/octet-stream".
 * NOTE(review): what happens when the GIO guess equals a scriptable
 * type, and how gio_guess is returned or freed, is not visible here.
 */
259 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
261 sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
262 SoupBuffer *buffer, gboolean for_text_or_binary)
264 const guchar *resource = (const guchar *)buffer->data;
/* Never look past the first 512 bytes, or past the end of the buffer. */
265 int resource_length = MIN (512, buffer->length);
269 for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
270 SoupContentSnifferPattern *type_row = &(types_table[i]);
272 /* The scriptable types should be skipped for the text
273 * or binary path, but considered for other paths */
274 if (for_text_or_binary && type_row->scriptable)
/* Rows with has_ws set use a two-cursor match: one cursor walks the
 * input, the other walks the pattern, so that a ' ' in the pattern can
 * stand for whitespace in the input. */
277 if (type_row->has_ws) {
278 int index_stream = 0;
279 int index_pattern = 0;
280 gboolean skip_row = FALSE;
/* NOTE(review): the bound is <=, so index_pattern may equal
 * pattern_length, and pattern[] / mask[] are then read at that index.
 * For the string literals in types_table that index is the terminating
 * NUL, and the success test below relies on index_pattern exceeding
 * pattern_length - confirm this off-by-one convention is intended. */
282 while ((index_stream < resource_length) &&
283 (index_pattern <= type_row->pattern_length)) {
284 /* Skip insignificant white space ("WS" in the spec) */
285 if (type_row->pattern[index_pattern] == ' ') {
/* TAB, LF, FF, CR and SPACE count as whitespace here. */
286 if (resource[index_stream] == '\x09' ||
287 resource[index_stream] == '\x0a' ||
288 resource[index_stream] == '\x0c' ||
289 resource[index_stream] == '\x0d' ||
290 resource[index_stream] == '\x20')
/* A masked input byte that differs from the pattern byte is a mismatch. */
295 if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
/* The pattern cursor ran past pattern_length: report this row's type. */
307 if (index_pattern > type_row->pattern_length)
308 return g_strdup (type_row->sniffed_type);
/* Rows without has_ws: fixed-position comparison from offset 0, only
 * attempted when the input holds at least pattern_length bytes. */
312 if (resource_length < type_row->pattern_length)
315 for (j = 0; j < type_row->pattern_length; j++) {
316 if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
320 /* This means our comparison above matched completely */
321 if (j == type_row->pattern_length)
322 return g_strdup (type_row->sniffed_type);
326 /* The spec allows us to use platform sniffing to find out
327 * about other types that are not covered, but we need to be
328 * careful to not escalate privileges, if on text or binary.
330 gio_guess = sniff_gio (sniffer, msg, buffer);
332 if (for_text_or_binary) {
333 for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
334 SoupContentSnifferPattern *type_row = &(types_table[i]);
336 if (!g_ascii_strcasecmp (type_row->sniffed_type, gio_guess) &&
337 type_row->scriptable) {
348 return g_strdup ("application/octet-stream");
351 /* HTML5: 2.7.3 Content-Type sniffing: text or binary
 *
 * Used by sniff() when the declared type is exactly "text/plain".
 * Examines at most the first 512 bytes of the buffer: a UTF-16 or UTF-8
 * byte-order mark yields a copy of "text/plain"; otherwise the bytes
 * are checked against byte_looks_binary.  The two visible outcomes after
 * that scan are a copy of "text/plain" and the result of
 * sniff_unknown() called in text-or-binary mode (TRUE).
 * NOTE(review): the statements that set looks_binary and choose between
 * those two returns are not visible here; presumably "text/plain" is
 * returned when no byte looked binary - confirm.
 */
353 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg,
356 const guchar *resource = (const guchar *)buffer->data;
357 int resource_length = MIN (512, buffer->length);
358 gboolean looks_binary = FALSE;
361 /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */
/* NOTE(review): the BOMs tested below are 2 or 3 bytes long, but the
 * test only runs when at least 4 bytes are available, so a shorter
 * buffer that starts with a BOM skips it - confirm that is intended. */
362 if (resource_length >= 4) {
363 if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
364 (resource[0] == 0xFF && resource[1] == 0xFE) ||
365 (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
366 return g_strdup ("text/plain");
369 /* Look to see if any of the first n bytes looks binary */
370 for (i = 0; i < resource_length; i++) {
/* resource[] is unsigned, so the byte value indexes the table directly. */
371 if (byte_looks_binary[resource[i]]) {
378 return g_strdup ("text/plain");
/* TRUE selects text-or-binary mode, which makes sniff_unknown() leave
 * out the scriptable rows. */
380 return sniff_unknown (sniffer, msg, buffer, TRUE);
/* Sniffing for resources whose declared type starts with "image/".
 *
 * Examines at most the first 512 bytes of @buffer against the rows of
 * types_table whose sniffed_type begins with "image/", comparing the
 * pattern bytes directly with memcmp().  Returns a newly allocated copy
 * of the matching row's sniffed_type, or a copy of @content_type when no
 * image row matches.
 * NOTE(review): the statements guarded by the two filtering tests are
 * not visible here; presumably they move on to the next row.
 */
384 sniff_images (SoupContentSniffer *sniffer, SoupMessage *msg,
385 SoupBuffer *buffer, const char *content_type)
387 const guchar *resource = (const guchar *)buffer->data;
388 int resource_length = MIN (512, buffer->length);
391 for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
392 SoupContentSnifferPattern *type_row = &(types_table[i]);
/* Not enough input bytes to hold this row's pattern. */
394 if (resource_length < type_row->pattern_length)
/* Only rows that describe image types are of interest here. */
397 if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
400 /* All of the image types use all-\xFF for the mask,
401 * so we can just memcmp.
403 if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
404 return g_strdup (type_row->sniffed_type);
407 return g_strdup (content_type);
// Decides whether a resource declared as text/html is really a feed.
//
// Examines at most the first 512 bytes of @buffer.  After an optional
// UTF-8 byte-order mark and leading whitespace it expects a '<'; markup
// that looks like a comment, a "<!...>" declaration or a "<?...?>"
// processing instruction is stepped over, and the following bytes are
// compared with "rss" and "feed".  Returns a newly allocated copy of
// "application/rss+xml", "application/atom+xml" or "text/html".
//
// NOTE(review): the statements guarded by the bounds tests, and the
// cursor updates and jumps between the visible lines, are not visible
// here; presumably the guards bail out to the final "text/html" return.
//
// NOTE(review): every guard of the form (pos > resource_length) still
// lets pos == resource_length through, and the next visible access reads
// resource[pos], i.e. one byte past the range counted by
// resource_length.  The (pos + 2) and (pos + 3) guards have the same
// shape with respect to resource[pos+2] and resource[pos+3].  A >= test
// looks like what was intended - confirm.
411 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
413 const guchar *resource = (const guchar *)buffer->data;
414 int resource_length = MIN (512, buffer->length);
// The BOM test below reads three bytes, hence the minimum length.
417 if (resource_length < 3)
420 /* Skip a leading UTF-8 BOM */
421 if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
425 if (pos > resource_length)
428 /* Skip insignificant white space */
// TAB, SPACE, LF and CR.
429 while ((resource[pos] == '\x09') ||
430 (resource[pos] == '\x20') ||
431 (resource[pos] == '\x0A') ||
432 (resource[pos] == '\x0D')) {
435 if (pos > resource_length)
// 0x3C is '<'; anything else at this point is reported as text/html.
440 if (resource[pos] != '\x3C')
441 return g_strdup ("text/html");
445 if ((pos + 2) > resource_length)
448 /* Skipping comments */
// 0x2D is '-' and 0x3E is '>'.
// NOTE(review): the three tests are joined with ||, so any single
// matching byte enters this branch, and the loop below stops as soon as
// any one of its three tests fails.  Neither reads like a test for a
// complete comment delimiter - confirm against the spec.
449 if ((resource[pos] == '\x2D') ||
450 (resource[pos+1] == '\x2D') ||
451 (resource[pos+2] == '\x3E')) {
454 if ((pos + 2) > resource_length)
457 while ((resource[pos] != '\x2D') &&
458 (resource[pos+1] != '\x2D') &&
459 (resource[pos+2] != '\x3E')) {
462 if ((pos + 2) > resource_length)
469 if (pos > resource_length)
// 0x21 is '!': scan forward to the closing '>'.
473 if (resource[pos] == '\x21') {
477 if (pos > resource_length)
479 } while (resource[pos] != '\x3E');
// 0x3F is '?': scan forward towards "?>".
// NOTE(review): the loop condition joins the two tests with &&, so it
// ends when either byte matches rather than when both do - confirm.
484 } else if (resource[pos] == '\x3F') { /* ? */
488 if ((pos + 1) > resource_length)
490 } while ((resource[pos] != '\x3F') &&
491 (resource[pos+1] != '\x3E'));
498 if ((pos + 2) > resource_length)
// 0x72 0x73 0x73 spells "rss".
501 if ((resource[pos] == '\x72') &&
502 (resource[pos+1] == '\x73') &&
503 (resource[pos+2] == '\x73'))
504 return g_strdup ("application/rss+xml");
506 if ((pos + 3) > resource_length)
// 0x66 0x65 0x65 0x64 spells "feed".
509 if ((resource[pos] == '\x66') &&
510 (resource[pos+1] == '\x65') &&
511 (resource[pos+2] == '\x65') &&
512 (resource[pos+3] == '\x64'))
513 return g_strdup ("application/atom+xml");
516 return g_strdup ("text/html");
/* Default implementation of the sniff virtual method.
 *
 * Reads the declared Content-Type (and its parameters, into @params)
 * from the response headers of @msg and picks a strategy:
 *  - no type, "unknown/unknown", "application/unknown" or the
 *    star-slash-star wildcard: sniff_unknown() in normal mode;
 *  - a type ending in "+xml", "text/xml" or "application/xml": a copy of
 *    the declared type;
 *  - a type starting with "image/": sniff_images();
 *  - exactly "text/plain": sniff_text_or_binary();
 *  - "text/html": sniff_feed_or_html();
 *  - anything else: a copy of the declared type.
 * Every visible return is either a g_strdup() copy or the result of one
 * of the helpers above.
 *
 * NOTE(review): the "+xml" test uses g_str_has_suffix() and the
 * "text/plain" test uses g_str_equal(), both case-sensitive, while the
 * other comparisons are ASCII-case-insensitive - confirm this difference
 * is intended.
 */
520 sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params)
522 const char *content_type;
524 content_type = soup_message_headers_get_content_type (msg->response_headers, params);
526 /* These comparisons are done in an ASCII-case-insensitive
527 * manner because the spec requires it */
/* No declared type, or one of the placeholder types: sniff from scratch. */
528 if ((content_type == NULL) ||
529 !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
530 !g_ascii_strcasecmp (content_type, "application/unknown") ||
531 !g_ascii_strcasecmp (content_type, "*/*"))
532 return sniff_unknown (sniffer, msg, buffer, FALSE);
534 if (g_str_has_suffix (content_type, "+xml") ||
535 !g_ascii_strcasecmp (content_type, "text/xml") ||
536 !g_ascii_strcasecmp (content_type, "application/xml"))
537 return g_strdup (content_type);
539 /* 2.7.5 Content-Type sniffing: image
542 * If the resource's official type is "image/svg+xml", then
543 * the sniffed type of the resource is its official type (an
546 * The XML case is handled by the if above; if you refactor
547 * this code, keep this in mind.
549 if (!g_ascii_strncasecmp (content_type, "image/", 6))
550 return sniff_images (sniffer, msg, buffer, content_type);
552 /* If we got text/plain, use text_or_binary */
/* Exact, case-sensitive comparison with "text/plain". */
553 if (g_str_equal (content_type, "text/plain")) {
554 return sniff_text_or_binary (sniffer, msg, buffer);
557 if (!g_ascii_strcasecmp (content_type, "text/html"))
558 return sniff_feed_or_html (sniffer, msg, buffer);
/* No sniffing rule applies: report the declared type unchanged. */
560 return g_strdup (content_type);
/* Default implementation of the get_buffer_size virtual method
 * (installed by soup_content_sniffer_class_init).  The got-headers
 * callback stores its result in the message's bytes_for_sniffing field.
 * NOTE(review): the body is not visible here, so the value returned
 * cannot be confirmed from this view.
 */
564 get_buffer_size (SoupContentSniffer *sniffer)
/* Handler for a message's "got-headers" signal (connected in
 * request_queued): asks the sniffer's class, through its
 * get_buffer_size virtual method, how many bytes it wants and records
 * that number in the message's private bytes_for_sniffing field.
 */
570 soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
572 SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
573 SoupContentSnifferClass *content_sniffer_class = SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer);
/* Virtual call, so a subclass can ask for a different amount. */
575 priv->bytes_for_sniffing = content_sniffer_class->get_buffer_size (sniffer);
/* SoupSessionFeature request_queued hook: stores a new reference to the
 * feature in the message's private sniffer field and connects
 * soup_content_sniffer_got_headers_cb to the message's "got-headers"
 * signal.
 * NOTE(review): the final argument of g_signal_connect is not visible
 * here; request_unqueued disconnects with @feature as the user data, so
 * presumably @feature is passed here too - confirm.
 */
579 request_queued (SoupSessionFeature *feature, SoupSession *session,
582 SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
/* The message keeps its own reference; request_unqueued drops it. */
584 priv->sniffer = g_object_ref (feature);
585 g_signal_connect (msg, "got-headers",
586 G_CALLBACK (soup_content_sniffer_got_headers_cb),
/* SoupSessionFeature request_unqueued hook: releases the reference held
 * in the message's private sniffer field, clears that field, and
 * disconnects the got-headers handler that was registered with @feature
 * as its user data.
 */
591 request_unqueued (SoupSessionFeature *feature, SoupSession *session,
594 SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
/* Drop the reference taken in request_queued and clear the pointer so
 * it cannot be used afterwards. */
596 g_object_unref (priv->sniffer);
597 priv->sniffer = NULL;
599 g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);