soup-auth-manager: add soup_auth_manager_use_auth()
[platform/upstream/libsoup.git] / libsoup / soup-content-sniffer.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * soup-content-sniffer.c
4  *
5  * Copyright (C) 2009 Gustavo Noronha Silva.
6  */
7
8 #ifdef HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #include <string.h>
13
14 #include "soup-content-sniffer.h"
15 #include "soup.h"
16 #include "soup-content-processor.h"
17 #include "soup-content-sniffer-stream.h"
18 #include "soup-message-private.h"
19
20 /**
21  * SECTION:soup-content-sniffer
22  * @short_description: Content sniffing for #SoupSession
23  *
24  * A #SoupContentSniffer tries to detect the actual content type of
25  * the files that are being downloaded by looking at some of the data
26  * before the #SoupMessage emits its #SoupMessage::got-headers signal.
27  * #SoupContentSniffer implements #SoupSessionFeature, so you can add
28  * content sniffing to a session with soup_session_add_feature() or
29  * soup_session_add_feature_by_type().
30  *
31  * Since: 2.28
32  **/
33
/* Interface initializers are declared ahead of the type definition
 * below, which registers them.
 */
static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);

/* Default SoupContentProcessor interface; assigned in
 * soup_content_sniffer_content_processor_init().
 */
static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface;
static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data);


/* SoupContentSniffer derives from GObject and implements both the
 * SoupSessionFeature and the SoupContentProcessor interfaces.
 */
G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
                         G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
                                                soup_content_sniffer_session_feature_init)
                         G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,
                                                soup_content_sniffer_content_processor_init))
45
46
47 static GInputStream *
48 soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor,
49                                                    GInputStream *base_stream,
50                                                    SoupMessage *msg,
51                                                    GError **error)
52 {
53         return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM,
54                              "base-stream", base_stream,
55                              "message", msg,
56                              "sniffer", SOUP_CONTENT_SNIFFER (processor),
57                              NULL);
58 }
59
60 static void
61 soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface,
62                                             gpointer interface_data)
63 {
64         soup_content_sniffer_default_content_processor_interface =
65                 g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR);
66
67         processor_interface->processing_stage = SOUP_STAGE_BODY_DATA;
68         processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input;
69 }
70
/* Instance initializer; intentionally empty, nothing is set up per
 * instance here.
 */
static void
soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
{
}
75
/* This table is based on the HTML5 spec;
 * See 2.7.4 Content-Type sniffing: unknown type
 *
 * Each row describes one byte signature: a data byte matches when
 * (mask[i] & data_byte) == pattern[i].
 */
typedef struct {
        /* @has_ws is TRUE if @pattern contains "generic" whitespace */
        gboolean      has_ws;
        /* Per-byte mask ANDed with the data before comparing */
        const guchar *mask;
        /* Expected bytes; in @has_ws rows a ' ' stands for any run of
         * whitespace (possibly empty) */
        const guchar *pattern;
        /* For rows without whitespace: number of bytes in @pattern.
         * For @has_ws rows: index of the LAST byte of @pattern (the
         * matcher in sniff_unknown() iterates up to and including it). */
        guint         pattern_length;
        /* MIME type reported when the row matches */
        const char   *sniffed_type;
        /* Rows marked scriptable are skipped on the text-or-binary path */
        gboolean      scriptable;
} SoupContentSnifferPattern;
88
89 static SoupContentSnifferPattern types_table[] = {
90         { FALSE,
91           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
92           (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
93           14,
94           "text/html",
95           TRUE },
96
97         { TRUE,
98           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
99           (const guchar *)" \x3C\x48\x54\x4D\x4C",
100           5,
101           "text/html",
102           TRUE },
103
104         { TRUE,
105           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
106           (const guchar *)" \x3C\x48\x45\x41\x44",
107           5,
108           "text/html",
109           TRUE },
110
111         { TRUE,
112           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
113           (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
114           7,
115           "text/html",
116           TRUE },
117
118         { FALSE,
119           (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
120           (const guchar *)"\x25\x50\x44\x46\x2D",
121           5,
122           "application/pdf",
123           TRUE },
124
125         { FALSE,
126           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
127           (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
128           11,
129           "application/postscript",
130           FALSE },
131
132         { FALSE,
133           (const guchar *)"\xFF\xFF\x00\x00",
134           (const guchar *)"\xFE\xFF\x00\x00",
135           4,
136           "text/plain",
137           FALSE },
138
139         { FALSE,
140           (const guchar *)"\xFF\xFF\x00\x00",
141           (const guchar *)"\xFF\xFF\x00\x00",
142           4,
143           "text/plain",
144           FALSE },
145
146         { FALSE,
147           (const guchar *)"\xFF\xFF\xFF\x00",
148           (const guchar *)"\xEF\xBB\xBF\x00",
149           4,
150           "text/plain",
151           FALSE },
152
153         { FALSE,
154           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
155           (const guchar *)"\x47\x49\x46\x38\x37\x61",
156           6,
157           "image/gif",
158           FALSE },
159
160         { FALSE,
161           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
162           (const guchar *)"\x47\x49\x46\x38\x39\x61",
163           6,
164           "image/gif",
165           FALSE },
166
167         { FALSE,
168           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
169           (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
170           8,
171           "image/png",
172           FALSE },
173
174         { FALSE,
175           (const guchar *)"\xFF\xFF\xFF",
176           (const guchar *)"\xFF\xD8\xFF",
177           3,
178           "image/jpeg",
179           FALSE },
180
181         { FALSE,
182           (const guchar *)"\xFF\xFF",
183           (const guchar *)"\x42\x4D",
184           2,
185           "image/bmp",
186           FALSE },
187
188         { FALSE,
189           (const guchar *)"\xFF\xFF\xFF\xFF",
190           (const guchar *)"\x00\x00\x01\x00",
191           4,
192           "image/vnd.microsoft.icon",
193           FALSE }
194 };
195
/* Lookup table, indexed by byte value, telling whether that byte looks
 * like it might be part of binary content (non-zero) or not (zero).
 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
 * which is BSD-licensed.
 *
 * Only control characters below 0x20 are flagged; TAB, LF, FF, CR and
 * ESC are not. Every entry from 0x20 to 0xFF is left to the implicit
 * zero initialization of the remaining array elements.
 */
static char byte_looks_binary[256] = {
        /* 0x00 - 0x0F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
        /* 0x10 - 0x1F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1
        /* 0x20 - 0xFF: all zero */
};
218
219 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
220 static char*
221 sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
222                gboolean for_text_or_binary)
223 {
224         const guchar *resource = (const guchar *)buffer->data;
225         int resource_length = MIN (512, buffer->length);
226         int i;
227
228         for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
229                 SoupContentSnifferPattern *type_row = &(types_table[i]);
230
231                 /* The scriptable types should be skiped for the text
232                  * or binary path, but considered for other paths */
233                 if (for_text_or_binary && type_row->scriptable)
234                         continue;
235
236                 if (type_row->has_ws) {
237                         int index_stream = 0;
238                         int index_pattern = 0;
239                         gboolean skip_row = FALSE;
240
241                         while ((index_stream < resource_length) &&
242                                (index_pattern <= type_row->pattern_length)) {
243                                 /* Skip insignificant white space ("WS" in the spec) */
244                                 if (type_row->pattern[index_pattern] == ' ') {
245                                         if (resource[index_stream] == '\x09' ||
246                                             resource[index_stream] == '\x0a' ||
247                                             resource[index_stream] == '\x0c' ||
248                                             resource[index_stream] == '\x0d' ||
249                                             resource[index_stream] == '\x20')
250                                                 index_stream++;
251                                         else
252                                                 index_pattern++;
253                                 } else {
254                                         if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
255                                                 skip_row = TRUE;
256                                                 break;
257                                         }
258                                         index_pattern++;
259                                         index_stream++;
260                                 }
261                         }
262
263                         if (skip_row)
264                                 continue;
265
266                         if (index_pattern > type_row->pattern_length)
267                                 return g_strdup (type_row->sniffed_type);
268                 } else {
269                         int j;
270
271                         if (resource_length < type_row->pattern_length)
272                                 continue;
273
274                         for (j = 0; j < type_row->pattern_length; j++) {
275                                 if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
276                                         break;
277                         }
278
279                         /* This means our comparison above matched completely */
280                         if (j == type_row->pattern_length)
281                                 return g_strdup (type_row->sniffed_type);
282                 }
283         }
284
285         if (for_text_or_binary)
286                 return g_strdup ("application/octet-stream");
287
288         for (i = 0; i < resource_length; i++) {
289                 if (byte_looks_binary[resource[i]])
290                         return g_strdup ("application/octet-stream");
291         }
292
293         return g_strdup ("text/plain");
294 }
295
296 /* HTML5: 2.7.3 Content-Type sniffing: text or binary */
297 static char*
298 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
299 {
300         const guchar *resource = (const guchar *)buffer->data;
301         int resource_length = MIN (512, buffer->length);
302         gboolean looks_binary = FALSE;
303         int i;
304
305         /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */
306         if (resource_length >= 4) {
307                 if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
308                     (resource[0] == 0xFF && resource[1] == 0xFE) ||
309                     (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
310                         return g_strdup ("text/plain");
311         }
312
313         /* Look to see if any of the first n bytes looks binary */
314         for (i = 0; i < resource_length; i++) {
315                 if (byte_looks_binary[resource[i]]) {
316                         looks_binary = TRUE;
317                         break;
318                 }
319         }
320
321         if (!looks_binary)
322                 return g_strdup ("text/plain");
323
324         return sniff_unknown (sniffer, buffer, TRUE);
325 }
326
327 static char*
328 sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
329               const char *content_type)
330 {
331         const guchar *resource = (const guchar *)buffer->data;
332         int resource_length = MIN (512, buffer->length);
333         int i;
334
335         for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
336                 SoupContentSnifferPattern *type_row = &(types_table[i]);
337
338                 if (resource_length < type_row->pattern_length)
339                         continue;
340
341                 if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
342                         continue;
343
344                 /* All of the image types use all-\xFF for the mask,
345                  * so we can just memcmp.
346                  */
347                 if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
348                         return g_strdup (type_row->sniffed_type);
349         }
350
351         return g_strdup (content_type);
352 }
353
354 static char*
355 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
356 {
357         const guchar *resource = (const guchar *)buffer->data;
358         int resource_length = MIN (512, buffer->length);
359         int pos = 0;
360
361         if (resource_length < 3)
362                 goto text_html;
363
364         /* Skip a leading UTF-8 BOM */
365         if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
366                 pos = 3;
367
368  look_for_tag:
369         if (pos > resource_length)
370                 goto text_html;
371
372         /* Skip insignificant white space */
373         while ((resource[pos] == '\x09') ||
374                (resource[pos] == '\x20') ||
375                (resource[pos] == '\x0A') ||
376                (resource[pos] == '\x0D')) {
377                 pos++;
378
379                 if (pos > resource_length)
380                         goto text_html;
381         }
382
383         /* != < */
384         if (resource[pos] != '\x3C')
385                 return g_strdup ("text/html");
386
387         pos++;
388
389         if ((pos + 2) > resource_length)
390                 goto text_html;
391
392         /* Skipping comments */
393         if ((resource[pos] == '\x2D') ||
394             (resource[pos+1] == '\x2D') ||
395             (resource[pos+2] == '\x3E')) {
396                 pos = pos + 3;
397
398                 if ((pos + 2) > resource_length)
399                         goto text_html;
400
401                 while ((resource[pos] != '\x2D') &&
402                        (resource[pos+1] != '\x2D') &&
403                        (resource[pos+2] != '\x3E')) {
404                         pos++;
405
406                         if ((pos + 2) > resource_length)
407                                 goto text_html;
408                 }
409
410                 goto look_for_tag;
411         }
412
413         if (pos > resource_length)
414                 goto text_html;
415
416         /* == ! */
417         if (resource[pos] == '\x21') {
418                 do {
419                         pos++;
420
421                         if (pos > resource_length)
422                                 goto text_html;
423                 } while (resource[pos] != '\x3E');
424
425                 pos++;
426
427                 goto look_for_tag;
428         } else if (resource[pos] == '\x3F') { /* ? */
429                 do {
430                         pos++;
431
432                         if ((pos + 1) > resource_length)
433                                 goto text_html;
434                 } while ((resource[pos] != '\x3F') &&
435                          (resource[pos+1] != '\x3E'));
436
437                 pos = pos + 2;
438
439                 goto look_for_tag;
440         }
441
442         if ((pos + 2) > resource_length)
443                 goto text_html;
444
445         if ((resource[pos] == '\x72') &&
446             (resource[pos+1] == '\x73') &&
447             (resource[pos+2] == '\x73'))
448                 return g_strdup ("application/rss+xml");
449
450         if ((pos + 3) > resource_length)
451                 goto text_html;
452
453         if ((resource[pos] == '\x66') &&
454             (resource[pos+1] == '\x65') &&
455             (resource[pos+2] == '\x65') &&
456             (resource[pos+3] == '\x64'))
457                 return g_strdup ("application/atom+xml");
458
459  text_html:
460         return g_strdup ("text/html");
461 }
462
463 static char *
464 soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
465                                  SoupBuffer *buffer, GHashTable **params)
466 {
467         const char *content_type;
468
469         content_type = soup_message_headers_get_content_type (msg->response_headers, params);
470
471         /* These comparisons are done in an ASCII-case-insensitive
472          * manner because the spec requires it */
473         if ((content_type == NULL) ||
474             !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
475             !g_ascii_strcasecmp (content_type, "application/unknown") ||
476             !g_ascii_strcasecmp (content_type, "*/*"))
477                 return sniff_unknown (sniffer, buffer, FALSE);
478
479         if (g_str_has_suffix (content_type, "+xml") ||
480             !g_ascii_strcasecmp (content_type, "text/xml") ||
481             !g_ascii_strcasecmp (content_type, "application/xml"))
482                 return g_strdup (content_type);
483
484         /* 2.7.5 Content-Type sniffing: image
485          * The spec says:
486          *
487          *   If the resource's official type is "image/svg+xml", then
488          *   the sniffed type of the resource is its official type (an
489          *   XML type)
490          *
491          * The XML case is handled by the if above; if you refactor
492          * this code, keep this in mind.
493          */
494         if (!g_ascii_strncasecmp (content_type, "image/", 6))
495                 return sniff_images (sniffer, buffer, content_type);
496
497         /* If we got text/plain, use text_or_binary */
498         if (g_str_equal (content_type, "text/plain")) {
499                 return sniff_text_or_binary (sniffer, buffer);
500         }
501
502         if (!g_ascii_strcasecmp (content_type, "text/html"))
503                 return sniff_feed_or_html (sniffer, buffer);
504
505         return g_strdup (content_type);
506 }
507
508 static gsize
509 soup_content_sniffer_real_get_buffer_size (SoupContentSniffer *sniffer)
510 {
511         return 512;
512 }
513
514 static void
515 soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
516 {
517         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
518
519         priv->bytes_for_sniffing = soup_content_sniffer_get_buffer_size (sniffer);
520 }
521
522 static void
523 soup_content_sniffer_request_queued (SoupSessionFeature *feature,
524                                      SoupSession *session,
525                                      SoupMessage *msg)
526 {
527         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
528
529         priv->sniffer = g_object_ref (feature);
530         g_signal_connect (msg, "got-headers",
531                           G_CALLBACK (soup_content_sniffer_got_headers_cb),
532                           feature);
533 }
534
/* SoupSessionFeature request_unqueued implementation: undoes what
 * soup_content_sniffer_request_queued() set up on @msg.
 */
static void
soup_content_sniffer_request_unqueued (SoupSessionFeature *feature,
                                       SoupSession *session,
                                       SoupMessage *msg)
{
        SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);

        /* Drop the reference stored on the message and clear the field */
        g_object_unref (priv->sniffer);
        priv->sniffer = NULL;

        /* Remove the "got-headers" handler connected with @feature as
         * its user data */
        g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
}
547
548 static void
549 soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
550 {
551         content_sniffer_class->sniff = soup_content_sniffer_real_sniff;
552         content_sniffer_class->get_buffer_size = soup_content_sniffer_real_get_buffer_size;
553 }
554
555 static void
556 soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
557                                            gpointer interface_data)
558 {
559         feature_interface->request_queued = soup_content_sniffer_request_queued;
560         feature_interface->request_unqueued = soup_content_sniffer_request_unqueued;
561 }
562
563 /**
564  * soup_content_sniffer_new:
565  *
566  * Creates a new #SoupContentSniffer.
567  *
568  * Returns: a new #SoupContentSniffer
569  *
570  * Since: 2.28
571  **/
572 SoupContentSniffer *
573 soup_content_sniffer_new ()
574 {
575         return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
576 }
577
578 /**
579  * soup_content_sniffer_sniff:
580  * @sniffer: a #SoupContentSniffer
581  * @msg: the message to sniff
582  * @buffer: a buffer containing the start of @msg's response body
583  * @params: (element-type utf8 utf8) (out) (transfer full) (allow-none): return
584  *   location for Content-Type parameters (eg, "charset"), or %NULL
585  *
586  * Sniffs @buffer to determine its Content-Type. The result may also
587  * be influenced by the Content-Type declared in @msg's response
588  * headers.
589  *
590  * Return value: the sniffed Content-Type of @buffer; this will never be %NULL,
591  *   but may be "application/octet-stream".
592  *
593  * Since: 2.28
594  */
595 char *
596 soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
597                             SoupMessage *msg, SoupBuffer *buffer,
598                             GHashTable **params)
599 {
600         g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
601         g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
602         g_return_val_if_fail (buffer != NULL, NULL);
603
604         return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
605 }
606
607 /**
608  * soup_content_sniffer_get_buffer_size:
609  * @sniffer: a #SoupContentSniffer
610  *
611  * Gets the number of bytes @sniffer needs in order to properly sniff
612  * a buffer.
613  *
614  * Return value: the number of bytes to sniff
615  *
616  * Since: 2.28
617  */
618 gsize
619 soup_content_sniffer_get_buffer_size (SoupContentSniffer *sniffer)
620 {
621         g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), 0);
622
623         return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->get_buffer_size (sniffer);
624 }