1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * soup-content-sniffer.c
5 * Copyright (C) 2009 Gustavo Noronha Silva.
14 #include "soup-content-sniffer.h"
16 #include "soup-content-processor.h"
17 #include "soup-content-sniffer-stream.h"
18 #include "soup-message-private.h"
21 * SECTION:soup-content-sniffer
22 * @short_description: Content sniffing for #SoupSession
24 * A #SoupContentSniffer tries to detect the actual content type of
25 * the files that are being downloaded by looking at some of the data
26 * before the #SoupMessage emits its #SoupMessage::got-headers signal.
27 * #SoupContentSniffer implements #SoupSessionFeature, so you can add
28 * content sniffing to a session with soup_session_add_feature() or
29 * soup_session_add_feature_by_type().
/* Forward declaration: initializer for the SoupSessionFeature interface
 * vtable (defined near the end of this file). */
static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);

/* Default SoupContentProcessor interface, assigned in
 * soup_content_sniffer_content_processor_init(). */
static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface;
/* Forward declaration: initializer for the SoupContentProcessor interface
 * vtable. */
static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data);

/* Registers SoupContentSniffer as a G_TYPE_OBJECT subclass that implements
 * both SOUP_TYPE_SESSION_FEATURE and SOUP_TYPE_CONTENT_PROCESSOR, using the
 * two interface initializers declared above. */
G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
			 G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
						soup_content_sniffer_session_feature_init)
			 G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,
						soup_content_sniffer_content_processor_init))
/* SoupContentProcessor wrap_input implementation (installed in
 * soup_content_sniffer_content_processor_init()).
 *
 * Creates a SOUP_TYPE_CONTENT_SNIFFER_STREAM object on top of @base_stream,
 * passing @processor (cast to SoupContentSniffer) as its "sniffer" property.
 *
 * NOTE(review): the remaining parameters and construction properties of
 * this definition are not visible in this excerpt.
 */
soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor,
						   GInputStream *base_stream,
	return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM,
			     /* the stream being wrapped */
			     "base-stream", base_stream,
			     /* the sniffer that will inspect the data */
			     "sniffer", SOUP_CONTENT_SNIFFER (processor),
/* Interface initializer for SoupContentProcessor.
 *
 * Caches the default SoupContentProcessor interface in
 * soup_content_sniffer_default_content_processor_interface, then fills in
 * @processor_interface: the processing stage is SOUP_STAGE_BODY_DATA and
 * wrap_input is soup_content_sniffer_content_processor_wrap_input().
 * @interface_data is not referenced by the visible statements.
 */
soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface,
					     gpointer interface_data)
	soup_content_sniffer_default_content_processor_interface =
		g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR);
	processor_interface->processing_stage = SOUP_STAGE_BODY_DATA;
	processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input;
/* GObject instance initializer for SoupContentSniffer (the function name
 * G_DEFINE_TYPE_WITH_CODE expects for the "soup_content_sniffer" prefix).
 * NOTE(review): no body statements are visible in this excerpt.
 */
soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
/* This table is based on the HTML5 spec;
 * see 2.7.4 Content-Type sniffing: unknown type.
 *
 * SoupContentSnifferPattern is the row type of types_table. The sniffing
 * functions in this file read the fields has_ws, mask, pattern,
 * pattern_length, sniffed_type and scriptable from each row.
 * NOTE(review): only some of those field declarations are visible in this
 * excerpt.
 */
	/* @has_ws is TRUE if @pattern contains "generic" whitespace */
	/* Signature bytes; a resource byte matches when
	 * (mask byte & resource byte) equals the pattern byte. */
	const guchar *pattern;
	/* Content type string that is duplicated and returned on a match. */
	const char *sniffed_type;
} SoupContentSnifferPattern;
/* Signature table walked by sniff_unknown() and sniff_images().
 *
 * Judging by their contents, the first string of each pair below is the
 * row's mask and the second is its pattern. A mask byte of 0xDF clears
 * bit 0x20 of the resource byte, so ASCII letters compare without regard
 * to case; 0xFF requires an exact byte; 0x00 ignores the byte entirely.
 *
 * NOTE(review): the other fields of each row (has_ws, pattern_length, most
 * sniffed types, scriptable) are not visible in this excerpt.
 */
static SoupContentSnifferPattern types_table[] = {
	/* "<!DOCTYPE HTML", letters case-insensitive */
	(const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
	(const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
	/* " <HTML"; sniff_unknown() treats a ' ' pattern byte as a run of
	 * whitespace when the row has has_ws set */
	(const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
	(const guchar *)" \x3C\x48\x54\x4D\x4C",
	/* " <HEAD" */
	(const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
	(const guchar *)" \x3C\x48\x45\x41\x44",
	/* " <SCRIPT" */
	(const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
	(const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
	/* "%PDF-" */
	(const guchar *)"\xFF\xFF\xFF\xFF\xFF",
	(const guchar *)"\x25\x50\x44\x46\x2D",
	/* "%!PS-Adobe-" */
	(const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
	(const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
	"application/postscript",
	/* FE FF followed by two ignored bytes (UTF-16BE BOM) */
	(const guchar *)"\xFF\xFF\x00\x00",
	(const guchar *)"\xFE\xFF\x00\x00",
	/* FF FF followed by two ignored bytes.
	 * NOTE(review): mask and pattern are identical here. The UTF-16LE
	 * BOM tested in sniff_text_or_binary() is FF FE, so this pattern
	 * looks like it should start with "\xFF\xFE" -- confirm. */
	(const guchar *)"\xFF\xFF\x00\x00",
	(const guchar *)"\xFF\xFF\x00\x00",
	/* EF BB BF followed by one ignored byte (UTF-8 BOM) */
	(const guchar *)"\xFF\xFF\xFF\x00",
	(const guchar *)"\xEF\xBB\xBF\x00",
	/* "GIF87a" */
	(const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
	(const guchar *)"\x47\x49\x46\x38\x37\x61",
	/* "GIF89a" */
	(const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
	(const guchar *)"\x47\x49\x46\x38\x39\x61",
	/* 0x89 "PNG" CR LF 0x1A LF */
	(const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
	(const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
	/* FF D8 FF */
	(const guchar *)"\xFF\xFF\xFF",
	(const guchar *)"\xFF\xD8\xFF",
	/* "BM" */
	(const guchar *)"\xFF\xFF",
	(const guchar *)"\x42\x4D",
	/* 00 00 01 00 */
	(const guchar *)"\xFF\xFF\xFF\xFF",
	(const guchar *)"\x00\x00\x01\x00",
	"image/vnd.microsoft.icon",
/* Whether a given byte looks like it might be part of binary content.
 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
 * which is BSD-licensed.
 *
 * Indexed by byte value, 16 entries per row. A nonzero entry marks the
 * byte as binary: these are 0x00-0x08, 0x0B, 0x0E-0x1A and 0x1C-0x1F.
 * Every other value, including 0x09, 0x0A, 0x0C, 0x0D, 0x1B and all
 * bytes from 0x20 up, is 0.
 */
static char byte_looks_binary[] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, /* 0x00 - 0x0F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, /* 0x10 - 0x1F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 - 0x7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xCF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xEF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0 - 0xFF */
/* HTML5: 2.7.4 Content-Type sniffing: unknown type
 *
 * Compares the first bytes of @buffer (at most 512) against every row of
 * types_table and returns a g_strdup()ed copy of the first matching row's
 * sniffed_type. When no row matches, the result is
 * "application/octet-stream" if @for_text_or_binary is set or if any
 * examined byte is flagged in byte_looks_binary, and "text/plain"
 * otherwise. @sniffer is not referenced by the visible statements.
 *
 * NOTE(review): several short statements of this function (the loop
 * index declarations and the bodies of some branches) are not visible
 * in this excerpt.
 */
sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
	       gboolean for_text_or_binary)
	const guchar *resource = (const guchar *)buffer->data;
	/* Never look at more than the first 512 bytes */
	int resource_length = MIN (512, buffer->length);
	for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
		SoupContentSnifferPattern *type_row = &(types_table[i]);
		/* The scriptable types should be skipped for the text
		 * or binary path, but considered for other paths */
		if (for_text_or_binary && type_row->scriptable)
		if (type_row->has_ws) {
			/* Separate cursors: a single ' ' in the pattern can
			 * correspond to any number of resource bytes */
			int index_stream = 0;
			int index_pattern = 0;
			gboolean skip_row = FALSE;
			/* Note the "<=": on this path index_pattern runs up to
			 * and including pattern_length */
			while ((index_stream < resource_length) &&
			       (index_pattern <= type_row->pattern_length)) {
				/* Skip insignificant white space ("WS" in the spec) */
				if (type_row->pattern[index_pattern] == ' ') {
					/* TAB, LF, FF, CR or SPACE */
					if (resource[index_stream] == '\x09' ||
					    resource[index_stream] == '\x0a' ||
					    resource[index_stream] == '\x0c' ||
					    resource[index_stream] == '\x0d' ||
					    resource[index_stream] == '\x20')
					/* Masked resource byte differs from the pattern byte */
					if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
			/* The pattern cursor moved past the last pattern byte */
			if (index_pattern > type_row->pattern_length)
				return g_strdup (type_row->sniffed_type);
			/* Fixed-position comparison for rows without whitespace */
			if (resource_length < type_row->pattern_length)
			for (j = 0; j < type_row->pattern_length; j++) {
				if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
			/* This means our comparison above matched completely */
			if (j == type_row->pattern_length)
				return g_strdup (type_row->sniffed_type);
	/* No signature matched: the text-or-binary caller gets the binary type */
	if (for_text_or_binary)
		return g_strdup ("application/octet-stream");
	/* Otherwise decide between binary and text from the byte classes */
	for (i = 0; i < resource_length; i++) {
		if (byte_looks_binary[resource[i]])
			return g_strdup ("application/octet-stream");
	return g_strdup ("text/plain");
/* HTML5: 2.7.3 Content-Type sniffing: text or binary
 *
 * Called by soup_content_sniffer_real_sniff() when the declared type is
 * "text/plain". Examines at most the first 512 bytes of @buffer and
 * returns a g_strdup()ed "text/plain" when the data starts with a BOM or
 * when the binary scan leaves looks_binary unset; otherwise it defers to
 * sniff_unknown() with for_text_or_binary set to TRUE.
 *
 * NOTE(review): some short statements of this function are not visible
 * in this excerpt.
 */
sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
	const guchar *resource = (const guchar *)buffer->data;
	int resource_length = MIN (512, buffer->length);
	gboolean looks_binary = FALSE;
	/* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain.
	 * The BOM test is only made when at least 4 bytes are available. */
	if (resource_length >= 4) {
		if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
		    (resource[0] == 0xFF && resource[1] == 0xFE) ||
		    (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
			return g_strdup ("text/plain");
	/* Look to see if any of the first n bytes looks binary */
	for (i = 0; i < resource_length; i++) {
		if (byte_looks_binary[resource[i]]) {
		return g_strdup ("text/plain");
	/* Reached on the binary path; scriptable table rows are skipped there */
	return sniff_unknown (sniffer, buffer, TRUE);
/* Image sniffing, used by soup_content_sniffer_real_sniff() when the
 * declared type starts with "image/".
 *
 * Walks types_table looking at rows whose sniffed_type starts with
 * "image/" and whose pattern fits in the available data (at most 512
 * bytes). Returns a g_strdup()ed copy of the first such row whose pattern
 * equals the start of @buffer, or a copy of @content_type when none does.
 *
 * NOTE(review): some short statements of this function are not visible
 * in this excerpt.
 */
sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
	      const char *content_type)
	const guchar *resource = (const guchar *)buffer->data;
	int resource_length = MIN (512, buffer->length);
	for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
		SoupContentSnifferPattern *type_row = &(types_table[i]);
		/* Not enough data to hold this row's pattern */
		if (resource_length < type_row->pattern_length)
		/* Only image rows are of interest here */
		if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
		/* All of the image types use all-\xFF for the mask,
		 * so we can just memcmp.
		 */
		if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
			return g_strdup (type_row->sniffed_type);
	/* No image signature recognised: keep the declared type */
	return g_strdup (content_type);
355 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
357 const guchar *resource = (const guchar *)buffer->data;
358 int resource_length = MIN (512, buffer->length);
361 if (resource_length < 3)
364 /* Skip a leading UTF-8 BOM */
365 if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
369 if (pos > resource_length)
372 /* Skip insignificant white space */
373 while ((resource[pos] == '\x09') ||
374 (resource[pos] == '\x20') ||
375 (resource[pos] == '\x0A') ||
376 (resource[pos] == '\x0D')) {
379 if (pos > resource_length)
384 if (resource[pos] != '\x3C')
385 return g_strdup ("text/html");
389 if ((pos + 2) > resource_length)
392 /* Skipping comments */
393 if ((resource[pos] == '\x2D') ||
394 (resource[pos+1] == '\x2D') ||
395 (resource[pos+2] == '\x3E')) {
398 if ((pos + 2) > resource_length)
401 while ((resource[pos] != '\x2D') &&
402 (resource[pos+1] != '\x2D') &&
403 (resource[pos+2] != '\x3E')) {
406 if ((pos + 2) > resource_length)
413 if (pos > resource_length)
417 if (resource[pos] == '\x21') {
421 if (pos > resource_length)
423 } while (resource[pos] != '\x3E');
428 } else if (resource[pos] == '\x3F') { /* ? */
432 if ((pos + 1) > resource_length)
434 } while ((resource[pos] != '\x3F') &&
435 (resource[pos+1] != '\x3E'));
442 if ((pos + 2) > resource_length)
445 if ((resource[pos] == '\x72') &&
446 (resource[pos+1] == '\x73') &&
447 (resource[pos+2] == '\x73'))
448 return g_strdup ("application/rss+xml");
450 if ((pos + 3) > resource_length)
453 if ((resource[pos] == '\x66') &&
454 (resource[pos+1] == '\x65') &&
455 (resource[pos+2] == '\x65') &&
456 (resource[pos+3] == '\x64'))
457 return g_strdup ("application/atom+xml");
460 return g_strdup ("text/html");
464 soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
465 SoupBuffer *buffer, GHashTable **params)
467 const char *content_type;
469 content_type = soup_message_headers_get_content_type (msg->response_headers, params);
471 /* These comparisons are done in an ASCII-case-insensitive
472 * manner because the spec requires it */
473 if ((content_type == NULL) ||
474 !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
475 !g_ascii_strcasecmp (content_type, "application/unknown") ||
476 !g_ascii_strcasecmp (content_type, "*/*"))
477 return sniff_unknown (sniffer, buffer, FALSE);
479 if (g_str_has_suffix (content_type, "+xml") ||
480 !g_ascii_strcasecmp (content_type, "text/xml") ||
481 !g_ascii_strcasecmp (content_type, "application/xml"))
482 return g_strdup (content_type);
484 /* 2.7.5 Content-Type sniffing: image
487 * If the resource's official type is "image/svg+xml", then
488 * the sniffed type of the resource is its official type (an
491 * The XML case is handled by the if above; if you refactor
492 * this code, keep this in mind.
494 if (!g_ascii_strncasecmp (content_type, "image/", 6))
495 return sniff_images (sniffer, buffer, content_type);
497 /* If we got text/plain, use text_or_binary */
498 if (g_str_equal (content_type, "text/plain")) {
499 return sniff_text_or_binary (sniffer, buffer);
502 if (!g_ascii_strcasecmp (content_type, "text/html"))
503 return sniff_feed_or_html (sniffer, buffer);
505 return g_strdup (content_type);
/* Default implementation of the SoupContentSniffer "get_buffer_size" class
 * method (installed in soup_content_sniffer_class_init()).
 * NOTE(review): the body is not visible in this excerpt; the sniffing
 * functions in this file examine at most 512 bytes -- confirm that the
 * value returned here agrees with that limit.
 */
soup_content_sniffer_real_get_buffer_size (SoupContentSniffer *sniffer)
/* "got-headers" handler, connected to each message in
 * soup_content_sniffer_request_queued().
 *
 * Stores the buffer size reported by @sniffer in the bytes_for_sniffing
 * field of @msg's private data.
 */
soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
	SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
	priv->bytes_for_sniffing = soup_content_sniffer_get_buffer_size (sniffer);
/* SoupSessionFeature request_queued implementation.
 *
 * Stores a new reference to @feature in the message's private sniffer
 * field and connects soup_content_sniffer_got_headers_cb() to the
 * message's "got-headers" signal.
 *
 * NOTE(review): the message parameter and the end of the
 * g_signal_connect() call are not visible in this excerpt.
 */
soup_content_sniffer_request_queued (SoupSessionFeature *feature,
				     SoupSession *session,
	SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
	/* Reference released again in soup_content_sniffer_request_unqueued() */
	priv->sniffer = g_object_ref (feature);
	g_signal_connect (msg, "got-headers",
			  G_CALLBACK (soup_content_sniffer_got_headers_cb),
/* SoupSessionFeature request_unqueued implementation.
 *
 * Undoes soup_content_sniffer_request_queued(): drops the reference held
 * in the message's private sniffer field, clears that field, and
 * disconnects the "got-headers" handler that was registered with
 * @feature as its user data.
 *
 * NOTE(review): the message parameter is not visible in this excerpt.
 */
soup_content_sniffer_request_unqueued (SoupSessionFeature *feature,
				       SoupSession *session,
	SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
	g_object_unref (priv->sniffer);
	/* Do not leave a dangling pointer behind */
	priv->sniffer = NULL;
	g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
/* GObject class initializer: installs the default implementations of the
 * "sniff" and "get_buffer_size" class methods.
 */
soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
	content_sniffer_class->sniff = soup_content_sniffer_real_sniff;
	content_sniffer_class->get_buffer_size = soup_content_sniffer_real_get_buffer_size;
/* Interface initializer for SoupSessionFeature: registers this file's
 * request_queued and request_unqueued handlers in @feature_interface.
 * @interface_data is not referenced by the visible statements.
 */
soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
					   gpointer interface_data)
	feature_interface->request_queued = soup_content_sniffer_request_queued;
	feature_interface->request_unqueued = soup_content_sniffer_request_unqueued;
564 * soup_content_sniffer_new:
566 * Creates a new #SoupContentSniffer.
568 * Returns: a new #SoupContentSniffer
573 soup_content_sniffer_new ()
575 return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
/**
 * soup_content_sniffer_sniff:
 * @sniffer: a #SoupContentSniffer
 * @msg: the message to sniff
 * @buffer: a buffer containing the start of @msg's response body
 * @params: (element-type utf8 utf8) (out) (transfer full) (allow-none): return
 *   location for Content-Type parameters (eg, "charset"), or %NULL
 *
 * Sniffs @buffer to determine its Content-Type. The result may also
 * be influenced by the Content-Type declared in @msg's response
 * headers.
 *
 * The work is done by the sniff class method of @sniffer's class.
 *
 * Return value: the sniffed Content-Type of @buffer; this will never be %NULL
 *   for valid arguments, but may be "application/octet-stream". %NULL is
 *   returned if @sniffer or @msg is not of the expected type or @buffer
 *   is %NULL.
 **/
soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
			    SoupMessage *msg, SoupBuffer *buffer,
	/* Argument preconditions; each failure returns NULL */
	g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
	g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
	g_return_val_if_fail (buffer != NULL, NULL);
	return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
/**
 * soup_content_sniffer_get_buffer_size:
 * @sniffer: a #SoupContentSniffer
 *
 * Gets the number of bytes @sniffer needs in order to properly sniff
 * a buffer.
 *
 * The value comes from the get_buffer_size class method of @sniffer's
 * class.
 *
 * Return value: the number of bytes to sniff, or 0 if @sniffer is not a
 *   #SoupContentSniffer
 **/
soup_content_sniffer_get_buffer_size (SoupContentSniffer *sniffer)
	g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), 0);
	return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->get_buffer_size (sniffer);