Remove build warning
[platform/upstream/libsoup.git] / libsoup / soup-content-sniffer.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * soup-content-sniffer.c
4  *
5  * Copyright (C) 2009, 2013 Gustavo Noronha Silva.
6  *
7  * This code implements the following specification:
8  *
9  *  http://mimesniff.spec.whatwg.org/ as of 11 June 2013
10  */
11
12 #ifdef HAVE_CONFIG_H
13 #include <config.h>
14 #endif
15
16 #include <string.h>
17
18 #include "soup-content-sniffer.h"
19 #include "soup.h"
20 #include "soup-content-processor.h"
21 #include "soup-content-sniffer-stream.h"
22 #include "soup-message-private.h"
23
24 #include "TIZEN.h"
25
26 /**
27  * SECTION:soup-content-sniffer
28  * @short_description: Content sniffing for SoupSession
29  *
30  * A #SoupContentSniffer tries to detect the actual content type of
31  * the files that are being downloaded by looking at some of the data
32  * before the #SoupMessage emits its #SoupMessage::got-headers signal.
33  * #SoupContentSniffer implements #SoupSessionFeature, so you can add
34  * content sniffing to a session with soup_session_add_feature() or
35  * soup_session_add_feature_by_type().
36  *
37  * Since: 2.28
38  **/
39
40 static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
41
42 static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface;
43 static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data);
44
45
46 G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
47                          G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
48                                                 soup_content_sniffer_session_feature_init)
49                          G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR,
50                                                 soup_content_sniffer_content_processor_init))
51
52
53 static GInputStream *
54 soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor,
55                                                    GInputStream *base_stream,
56                                                    SoupMessage *msg,
57                                                    GError **error)
58 {
59         return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM,
60                              "base-stream", base_stream,
61                              "message", msg,
62                              "sniffer", SOUP_CONTENT_SNIFFER (processor),
63                              NULL);
64 }
65
66 static void
67 soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface,
68                                             gpointer interface_data)
69 {
70         soup_content_sniffer_default_content_processor_interface =
71                 g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR);
72
73         processor_interface->processing_stage = SOUP_STAGE_BODY_DATA;
74         processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input;
75 }
76
77 static void
78 soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
79 {
80 }
81
82 typedef struct {
83         const guchar *mask;
84         const guchar *pattern;
85         guint         pattern_length;
86         const char   *sniffed_type;
87 } SoupContentSnifferMediaPattern;
88
89 static char*
90 sniff_media (SoupContentSniffer *sniffer,
91              SoupBuffer *buffer,
92              SoupContentSnifferMediaPattern table[],
93              int table_length)
94 {
95         const guchar *resource = (const guchar *)buffer->data;
96         int resource_length = MIN (512, buffer->length);
97         int i;
98
99         for (i = 0; i < table_length; i++) {
100                 SoupContentSnifferMediaPattern *type_row = &(table[i]);
101                 int j;
102
103                 if (resource_length < type_row->pattern_length)
104                         continue;
105
106                 for (j = 0; j < type_row->pattern_length; j++) {
107                         if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
108                                 break;
109                 }
110
111                 /* This means our comparison above matched completely */
112                 if (j == type_row->pattern_length)
113                         return g_strdup (type_row->sniffed_type);
114         }
115
116         return NULL;
117 }
118
119 /* This table is based on the MIMESNIFF spec;
120  * See 6.1 Matching an image type pattern
121  */
122 static SoupContentSnifferMediaPattern image_types_table[] = {
123
124         /* Windows icon signature. */
125         { (const guchar *)"\xFF\xFF\xFF\xFF",
126           (const guchar *)"\x00\x00\x01\x00",
127           4,
128           "image/x-icon" },
129
130         /* Windows cursor signature. */
131         { (const guchar *)"\xFF\xFF\xFF\xFF",
132           (const guchar *)"\x00\x00\x02\x00",
133           4,
134           "image/x-icon" },
135
136         /* BMP. */
137         { (const guchar *)"\xFF\xFF",
138           (const guchar *)"BM",
139           2,
140           "image/bmp" },
141
142         /* GIFs. */
143         { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
144           (const guchar *)"GIF87a",
145           6,
146           "image/gif" },
147
148         { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
149           (const guchar *)"GIF89a",
150           6,
151           "image/gif" },
152
153         /* WEBP. */
154         { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
155           (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
156           14,
157           "image/webp" },
158
159         /* PNG. */
160         { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
161           (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
162           8,
163           "image/png" },
164
165         /* JPEG. */
166         { (const guchar *)"\xFF\xFF\xFF",
167           (const guchar *)"\xFF\xD8\xFF",
168           3,
169           "image/jpeg" },
170 };
171
172 static char*
173 sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer)
174 {
175         return sniff_media (sniffer,
176                             buffer,
177                             image_types_table,
178                             G_N_ELEMENTS (image_types_table));
179 }
180
181 /* This table is based on the MIMESNIFF spec;
182  * See 6.2 Matching an audio or video type pattern
183  */
184 static SoupContentSnifferMediaPattern audio_video_types_table[] = {
185         { (const guchar *)"\xFF\xFF\xFF\xFF",
186           (const guchar *)"\x1A\x45\xDF\xA3",
187           4,
188           "video/webm" },
189
190         { (const guchar *)"\xFF\xFF\xFF\xFF",
191           (const guchar *)".snd",
192           4,
193           "audio/basic" },
194
195
196         { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
197           (const guchar *)"FORM\0\0\0\0AIFF",
198           12,
199           "audio/aiff" },
200
201         { (const guchar *)"\xFF\xFF\xFF",
202           (const guchar *)"ID3",
203           3,
204           "audio/mpeg" },
205
206         { (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
207           (const guchar *)"OggS\0",
208           5,
209           "application/ogg" },
210
211         { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
212           (const guchar *)"MThd\x00\x00\x00\x06",
213           8,
214           "audio/midi" },
215
216         { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
217           (const guchar *)"RIFF\x00\x00\x00\x00AVI ",
218           12,
219           "video/avi" },
220
221         { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
222           (const guchar *)"RIFF\x00\x00\x00\x00WAVE",
223           12,
224           "audio/wave" },
225 };
226
227 static gboolean
228 sniff_mp4 (SoupContentSniffer *sniffer, SoupBuffer *buffer)
229 {
230         const char *resource = (const char *)buffer->data;
231         int resource_length = MIN (512, buffer->length);
232         guint32 box_size = *((guint32*)resource);
233         int i;
234
235 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
236         box_size = ((box_size >> 24) |
237                     ((box_size << 8) & 0x00FF0000) |
238                     ((box_size >> 8) & 0x0000FF00) |
239                     (box_size << 24));
240 #endif
241
242         if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0)
243                 return FALSE;
244
245         if (!g_str_has_prefix (resource + 4, "ftyp"))
246                 return FALSE;
247
248         if (!g_str_has_prefix (resource + 8, "mp4"))
249                 return FALSE;
250
251         for (i = 16; i < box_size && i < resource_length; i = i + 4) {
252                 if (g_str_has_prefix (resource + i, "mp4"))
253                         return TRUE;
254         }
255
256         return FALSE;
257 }
258
259 static char*
260 sniff_audio_video (SoupContentSniffer *sniffer, SoupBuffer *buffer)
261 {
262         char *sniffed_type;
263
264         sniffed_type = sniff_media (sniffer,
265                                     buffer,
266                                     audio_video_types_table,
267                                     G_N_ELEMENTS (audio_video_types_table));
268
269         if (sniffed_type != NULL)
270                 return sniffed_type;
271
272         if (sniff_mp4 (sniffer, buffer))
273                 return g_strdup ("video/mp4");
274
275         return NULL;
276 }
277
278 /* This table is based on the MIMESNIFF spec;
279  * See 7.1 Identifying a resource with an unknown MIME type
280  */
281 typedef struct {
282         /* @has_ws is TRUE if @pattern contains "generic" whitespace */
283         gboolean      has_ws;
284         /* @has_tag_termination is TRUE if we should check for a tag-terminating
285          * byte (0x20 " " or 0x3E ">") after the pattern match.
286          */
287         gboolean      has_tag_termination;
288         const guchar *mask;
289         const guchar *pattern;
290         guint         pattern_length;
291         const char   *sniffed_type;
292         gboolean      scriptable;
293 } SoupContentSnifferPattern;
294
295
296 /* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space
297  * is allowed. Those spaces are marked with \x00 on the mask.
298  */
299 static SoupContentSnifferPattern types_table[] = {
300         /* Scriptable types. */
301
302         { TRUE, TRUE,
303           (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
304           (const guchar *)" <!DOCTYPE HTML",
305           14,
306           "text/html",
307           TRUE },
308
309         { TRUE, TRUE,
310           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
311           (const guchar *)" <HTML",
312           5,
313           "text/html",
314           TRUE },
315
316         { TRUE, TRUE,
317           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
318           (const guchar *)" <HEAD",
319           5,
320           "text/html",
321           TRUE },
322
323         { TRUE, TRUE,
324           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
325           (const guchar *)" <SCRIPT",
326           7,
327           "text/html",
328           TRUE },
329
330         { TRUE, TRUE,
331           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
332           (const guchar *)" <IFRAME",
333           7,
334           "text/html",
335           TRUE },
336
337         { TRUE, TRUE,
338           (const guchar *)"\x00\xFF\xDF\xFF",
339           (const guchar *)" <H1",
340           3,
341           "text/html",
342           TRUE },
343
344         { TRUE, TRUE,
345           (const guchar *)"\x00\xFF\xDF\xDF\xDF",
346           (const guchar *)" <DIV",
347           4,
348           "text/html",
349           TRUE },
350
351         { TRUE, TRUE,
352           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
353           (const guchar *)" <FONT",
354           5,
355           "text/html",
356           TRUE },
357
358         { TRUE, TRUE,
359           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
360           (const guchar *)" <TABLE",
361           6,
362           "text/html",
363           TRUE },
364
365         { TRUE, TRUE,
366           (const guchar *)"\x00\xFF\xDF",
367           (const guchar *)" <A",
368           2,
369           "text/html",
370           TRUE },
371
372         { TRUE, TRUE,
373           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
374           (const guchar *)" <STYLE",
375           6,
376           "text/html",
377           TRUE },
378
379         { TRUE, TRUE,
380           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
381           (const guchar *)" <TITLE",
382           6,
383           "text/html",
384           TRUE },
385
386         { TRUE, TRUE,
387           (const guchar *)"\x00\xFF\xDF",
388           (const guchar *)" <B",
389           2,
390           "text/html",
391           TRUE },
392
393         { TRUE, TRUE,
394           (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
395           (const guchar *)" <BODY",
396           5,
397           "text/html",
398           TRUE },
399
400         { TRUE, TRUE,
401           (const guchar *)"\x00\xFF\xDF\xDF",
402           (const guchar *)" <BR",
403           3,
404           "text/html",
405           TRUE },
406
407         { TRUE, TRUE,
408           (const guchar *)"\x00\xFF\xDF",
409           (const guchar *)" <P",
410           2,
411           "text/html",
412           TRUE },
413
414         { TRUE, TRUE,
415           (const guchar *)"\x00\xFF\xFF\xFF\xFF",
416           (const guchar *)" <!--",
417           4,
418           "text/html",
419           TRUE },
420
421         { TRUE, FALSE,
422           (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF",
423           (const guchar *)" <?xml",
424           5,
425           "text/html",
426           TRUE },
427
428         { FALSE, FALSE,
429           (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
430           (const guchar *)"%PDF-",
431           5,
432           "application/pdf",
433           TRUE },
434
435         /* Non-scriptable types. */
436         { FALSE, FALSE,
437           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
438           (const guchar *)"%!PS-Adobe-",
439           11,
440           "application/postscript",
441           FALSE },
442
443         { FALSE, FALSE, /* UTF-16BE BOM */
444           (const guchar *)"\xFF\xFF\x00\x00",
445           (const guchar *)"\xFE\xFF\x00\x00",
446           4,
447           "text/plain",
448           FALSE },
449
450         { FALSE, FALSE, /* UTF-16LE BOM */
451           (const guchar *)"\xFF\xFF\x00\x00",
452           (const guchar *)"\xFF\xFE\x00\x00",
453           4,
454           "text/plain",
455           FALSE },
456
457         { FALSE, FALSE, /* UTF-8 BOM */
458           (const guchar *)"\xFF\xFF\xFF\x00",
459           (const guchar *)"\xEF\xBB\xBF\x00",
460           4,
461           "text/plain",
462           FALSE },
463 };
464
/* Lookup table, indexed by byte value: non-zero when the byte looks like it
 * might be part of binary content.
 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
 * which is BSD-licensed.
 *
 * Only the flagged bytes are listed; every other entry is zero-initialized.
 * The flagged set is 0x00-0x08, 0x0B, 0x0E-0x1A and 0x1C-0x1F.
 */
static char byte_looks_binary[256] = {
	[0x00] = 1, [0x01] = 1, [0x02] = 1, [0x03] = 1,
	[0x04] = 1, [0x05] = 1, [0x06] = 1, [0x07] = 1,
	[0x08] = 1,
	[0x0B] = 1,
	[0x0E] = 1, [0x0F] = 1,
	[0x10] = 1, [0x11] = 1, [0x12] = 1, [0x13] = 1,
	[0x14] = 1, [0x15] = 1, [0x16] = 1, [0x17] = 1,
	[0x18] = 1, [0x19] = 1, [0x1A] = 1,
	[0x1C] = 1, [0x1D] = 1, [0x1E] = 1, [0x1F] = 1,
};
487
488 #if ENABLE(TIZEN_TV_DISABLE_MIME_SNIFF)
489 gboolean soup_Disable_Mime_Sniff = FALSE;
490 void
491 soup_content_mime_sniff_set (gboolean gDisableMimeSniff)
492 {
493         soup_Disable_Mime_Sniff = gDisableMimeSniff;
494 }
495 #endif
496
497 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
498 static char*
499 sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
500                gboolean sniff_scriptable)
501 {
502         char *sniffed_type = NULL;
503         const guchar *resource = (const guchar *)buffer->data;
504         int resource_length = MIN (512, buffer->length);
505         int i;
506
507 #if ENABLE(TIZEN_TV_DISABLE_MIME_SNIFF)
508         if (soup_Disable_Mime_Sniff && !sniff_scriptable){
509                 return g_strdup ("text/plain");
510         }
511 #endif
512         for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
513                 SoupContentSnifferPattern *type_row = &(types_table[i]);
514
515                 if (!sniff_scriptable && type_row->scriptable)
516                         continue;
517
518                 if (type_row->has_ws) {
519                         int index_stream = 0;
520                         int index_pattern = 0;
521                         gboolean skip_row = FALSE;
522
523                         while ((index_stream < resource_length) &&
524                                (index_pattern <= type_row->pattern_length)) {
525                                 /* Skip insignificant white space ("WS" in the spec) */
526                                 if (type_row->pattern[index_pattern] == ' ') {
527                                         if (resource[index_stream] == '\x09' ||
528                                             resource[index_stream] == '\x0a' ||
529                                             resource[index_stream] == '\x0c' ||
530                                             resource[index_stream] == '\x0d' ||
531                                             resource[index_stream] == '\x20')
532                                                 index_stream++;
533                                         else
534                                                 index_pattern++;
535                                 } else {
536                                         if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
537                                                 skip_row = TRUE;
538                                                 break;
539                                         }
540                                         index_pattern++;
541                                         index_stream++;
542                                 }
543                         }
544
545                         if (skip_row)
546                                 continue;
547
548                         if (index_pattern > type_row->pattern_length) {
549                                 if (type_row->has_tag_termination &&
550                                     resource[index_stream] != '\x20' &&
551                                     resource[index_stream] != '\x3E')
552                                         continue;
553
554                                 return g_strdup (type_row->sniffed_type);
555                         }
556                 } else {
557                         int j;
558
559                         if (resource_length < type_row->pattern_length)
560                                 continue;
561
562                         for (j = 0; j < type_row->pattern_length; j++) {
563                                 if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
564                                         break;
565                         }
566
567                         /* This means our comparison above matched completely */
568                         if (j == type_row->pattern_length)
569                                 return g_strdup (type_row->sniffed_type);
570                 }
571         }
572
573         sniffed_type = sniff_images (sniffer, buffer);
574
575         if (sniffed_type != NULL)
576                 return sniffed_type;
577
578         sniffed_type = sniff_audio_video (sniffer, buffer);
579
580         if (sniffed_type != NULL)
581                 return sniffed_type;
582
583         for (i = 0; i < resource_length; i++) {
584                 if (byte_looks_binary[resource[i]])
585                         return g_strdup ("application/octet-stream");
586         }
587
588 #if ENABLE(TIZEN_TV_DISABLE_MIME_SNIFF)
589         /* Refer to Orsay's implementation, modify the default value from "text/plain" to "text/html". */
590         return g_strdup ("text/html");
591 #else
592         return g_strdup ("text/plain");
593 #endif
594 }
595
596 /* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */
597 static char*
598 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
599 {
600         const guchar *resource = (const guchar *)buffer->data;
601         int resource_length = MIN (512, buffer->length);
602         gboolean looks_binary = FALSE;
603         int i;
604
605         /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */
606         if (resource_length >= 2) {
607                 if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
608                     (resource[0] == 0xFF && resource[1] == 0xFE))
609                         return g_strdup ("text/plain");
610         }
611
612         /* 3. UTF-8 BOM. */
613         if (resource_length >= 3) {
614                 if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
615                         return g_strdup ("text/plain");
616         }
617
618         /* 4. Look to see if any of the first n bytes looks binary */
619         for (i = 0; i < resource_length; i++) {
620                 if (byte_looks_binary[resource[i]]) {
621                         looks_binary = TRUE;
622                         break;
623                 }
624         }
625
626         if (!looks_binary)
627                 return g_strdup ("text/plain");
628
629         /* 5. Execute 7.1 Identifying a resource with an unknown MIME type.
630          * TODO: sniff-scriptable needs to be unset.
631          */
632         return sniff_unknown (sniffer, buffer, TRUE);
633 }
634
635 static gboolean
636 skip_insignificant_space (const char *resource, int *pos, int resource_length)
637 {
638         while ((resource[*pos] == '\x09') ||
639                (resource[*pos] == '\x20') ||
640                (resource[*pos] == '\x0A') ||
641                (resource[*pos] == '\x0D')) {
642                 *pos = *pos + 1;
643
644                 if (*pos > resource_length)
645                         return TRUE;
646         }
647
648         return FALSE;
649 }
650
651 static char*
652 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
653 {
654         const char *resource = (const char *)buffer->data;
655         int resource_length = MIN (512, buffer->length);
656         int pos = 0;
657
658         if (resource_length < 3)
659                 goto text_html;
660
661         /* Skip a leading UTF-8 BOM */
662         if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
663                 pos = 3;
664
665  look_for_tag:
666         if (pos > resource_length)
667                 goto text_html;
668
669         if (skip_insignificant_space (resource, &pos, resource_length))
670                 goto text_html;
671
672         if (resource[pos] != '<')
673                 return g_strdup ("text/html");
674
675         pos++;
676
677         if ((pos + 2) > resource_length)
678                 goto text_html;
679
680         /* Skip comments. */
681         if (g_str_has_prefix (resource + pos, "!--")) {
682                 pos = pos + 3;
683
684                 if ((pos + 2) > resource_length)
685                         goto text_html;
686
687                 while (!g_str_has_prefix (resource + pos, "-->")) {
688                         pos++;
689
690                         if ((pos + 2) > resource_length)
691                                 goto text_html;
692                 }
693
694                 pos = pos + 3;
695
696                 goto look_for_tag;
697         }
698
699         if (pos > resource_length)
700                 goto text_html;
701
702         if (resource[pos] == '!') {
703                 do {
704                         pos++;
705
706                         if (pos > resource_length)
707                                 goto text_html;
708                 } while (resource[pos] != '>');
709
710                 pos++;
711
712                 goto look_for_tag;
713         } else if (resource[pos] == '?') {
714                 do {
715                         pos++;
716
717                         if ((pos + 1) > resource_length)
718                                 goto text_html;
719                 } while (!g_str_has_prefix (resource + pos, "?>"));
720
721                 pos = pos + 2;
722
723                 goto look_for_tag;
724         }
725
726         if ((pos + 3) > resource_length)
727                 goto text_html;
728
729         if (g_str_has_prefix (resource + pos, "rss"))
730                 return g_strdup ("application/rss+xml");
731
732         if ((pos + 4) > resource_length)
733                 goto text_html;
734
735         if (g_str_has_prefix (resource + pos, "feed"))
736                 return g_strdup ("application/atom+xml");
737
738         if ((pos + 7) > resource_length)
739                 goto text_html;
740
741         if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
742                 pos = pos + 7;
743
744                 if (skip_insignificant_space (resource, &pos, resource_length))
745                         goto text_html;
746
747                 if ((pos + 32) > resource_length)
748                         goto text_html;
749
750                 if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
751                         pos = pos + 32;
752
753                         if (skip_insignificant_space (resource, &pos, resource_length))
754                                 goto text_html;
755
756                         if ((pos + 55) > resource_length)
757                                 goto text_html;
758
759                         if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
760                                 return g_strdup ("application/rss+xml");
761                 }
762
763                 if ((pos + 55) > resource_length)
764                         goto text_html;
765
766                 if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
767                         pos = pos + 55;
768
769                         if (skip_insignificant_space (resource, &pos, resource_length))
770                                 goto text_html;
771
772                         if ((pos + 32) > resource_length)
773                                 goto text_html;
774
775                         if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
776                                 return g_strdup ("application/rss+xml");
777                 }
778         }
779
780  text_html:
781         return g_strdup ("text/html");
782 }
783
784 static char *
785 soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
786                                  SoupBuffer *buffer, GHashTable **params)
787 {
788         const char *content_type;
789         const char *x_content_type_options;
790         char *sniffed_type = NULL;
791         gboolean no_sniff = FALSE;
792
793         content_type = soup_message_headers_get_content_type (msg->response_headers, params);
794
795         /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */
796
797         x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options");
798         if (!g_strcmp0 (x_content_type_options, "nosniff"))
799                 no_sniff = TRUE;
800
801         /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */
802         if ((content_type == NULL) ||
803             !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
804             !g_ascii_strcasecmp (content_type, "application/unknown") ||
805             !g_ascii_strcasecmp (content_type, "*/*"))
806                 return sniff_unknown (sniffer, buffer, !no_sniff);
807
808         /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */
809         if (no_sniff)
810                 return g_strdup (content_type);
811
812         /* 3. check-for-apache-bug */
813         if ((content_type != NULL) &&
814             (g_str_equal (content_type, "text/plain") ||
815              g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
816              g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
817              g_str_equal (content_type, "text/plain; charset=UTF-8")))
818                 return sniff_text_or_binary (sniffer, buffer);
819
820         /* 4. XML types sent by the server are always used. */
821         if (g_str_has_suffix (content_type, "+xml") ||
822             !g_ascii_strcasecmp (content_type, "text/xml") ||
823             !g_ascii_strcasecmp (content_type, "application/xml"))
824                 return g_strdup (content_type);
825
826         /* 5. Distinguish feed from HTML. */
827         if (!g_ascii_strcasecmp (content_type, "text/html"))
828                 return sniff_feed_or_html (sniffer, buffer);
829
830         /* 6. Image types.
831          */
832         if (!g_ascii_strncasecmp (content_type, "image/", 6)) {
833                 sniffed_type = sniff_images (sniffer, buffer);
834                 if (sniffed_type != NULL)
835                         return sniffed_type;
836                 return g_strdup (content_type);
837         }
838
839         /* 7. Audio and video types. */
840         if (!g_ascii_strncasecmp (content_type, "audio/", 6) ||
841             !g_ascii_strncasecmp (content_type, "video/", 6) ||
842             !g_ascii_strcasecmp (content_type, "application/ogg")) {
843                 sniffed_type = sniff_audio_video (sniffer, buffer);
844                 if (sniffed_type != NULL)
845                         return sniffed_type;
846                 return g_strdup (content_type);
847         }
848
849         /* If we got text/plain, use text_or_binary */
850         if (g_str_equal (content_type, "text/plain")) {
851                 return sniff_text_or_binary (sniffer, buffer);
852         }
853
854         return g_strdup (content_type);
855 }
856
857 static gsize
858 soup_content_sniffer_real_get_buffer_size (SoupContentSniffer *sniffer)
859 {
860         return 512;
861 }
862
863 static void
864 soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
865 {
866         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
867
868         priv->bytes_for_sniffing = soup_content_sniffer_get_buffer_size (sniffer);
869 }
870
871 static void
872 soup_content_sniffer_request_queued (SoupSessionFeature *feature,
873                                      SoupSession *session,
874                                      SoupMessage *msg)
875 {
876         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
877
878         priv->sniffer = g_object_ref (feature);
879         g_signal_connect (msg, "got-headers",
880                           G_CALLBACK (soup_content_sniffer_got_headers_cb),
881                           feature);
882 }
883
884 static void
885 soup_content_sniffer_request_unqueued (SoupSessionFeature *feature,
886                                        SoupSession *session,
887                                        SoupMessage *msg)
888 {
889         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
890
891         g_object_unref (priv->sniffer);
892         priv->sniffer = NULL;
893
894         g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
895 }
896
897 static void
898 soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
899 {
900         content_sniffer_class->sniff = soup_content_sniffer_real_sniff;
901         content_sniffer_class->get_buffer_size = soup_content_sniffer_real_get_buffer_size;
902 }
903
904 static void
905 soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
906                                            gpointer interface_data)
907 {
908         feature_interface->request_queued = soup_content_sniffer_request_queued;
909         feature_interface->request_unqueued = soup_content_sniffer_request_unqueued;
910 }
911
912 /**
913  * soup_content_sniffer_new:
914  *
915  * Creates a new #SoupContentSniffer.
916  *
917  * Returns: a new #SoupContentSniffer
918  *
919  * Since: 2.28
920  **/
921 SoupContentSniffer *
922 soup_content_sniffer_new ()
923 {
924         return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
925 }
926
927 /**
928  * soup_content_sniffer_sniff:
929  * @sniffer: a #SoupContentSniffer
930  * @msg: the message to sniff
931  * @buffer: a buffer containing the start of @msg's response body
932  * @params: (element-type utf8 utf8) (out) (transfer full) (allow-none): return
933  *   location for Content-Type parameters (eg, "charset"), or %NULL
934  *
935  * Sniffs @buffer to determine its Content-Type. The result may also
936  * be influenced by the Content-Type declared in @msg's response
937  * headers.
938  *
939  * Return value: the sniffed Content-Type of @buffer; this will never be %NULL,
940  *   but may be "application/octet-stream".
941  *
942  * Since: 2.28
943  */
944 char *
945 soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
946                             SoupMessage *msg, SoupBuffer *buffer,
947                             GHashTable **params)
948 {
949         g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
950         g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
951         g_return_val_if_fail (buffer != NULL, NULL);
952
953         return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
954 }
955
956 /**
957  * soup_content_sniffer_get_buffer_size:
958  * @sniffer: a #SoupContentSniffer
959  *
960  * Gets the number of bytes @sniffer needs in order to properly sniff
961  * a buffer.
962  *
963  * Return value: the number of bytes to sniff
964  *
965  * Since: 2.28
966  */
967 gsize
968 soup_content_sniffer_get_buffer_size (SoupContentSniffer *sniffer)
969 {
970         g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), 0);
971
972         return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->get_buffer_size (sniffer);
973 }