Git init
[profile/ivi/libsoup2.4.git] / libsoup / soup-content-sniffer.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * soup-content-sniffer.c
4  *
5  * Copyright (C) 2009 Gustavo Noronha Silva.
6  */
7
8 #ifdef HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #include <string.h>
13
14 #include "soup-content-sniffer.h"
15 #include "soup-enum-types.h"
16 #include "soup-message.h"
17 #include "soup-message-private.h"
18 #include "soup-session-feature.h"
19 #include "soup-uri.h"
20 /*TIZEN patch*/
21 #include "TIZEN.h"
22
23 /**
24  * SECTION:soup-content-sniffer
25  * @short_description: Content sniffing for #SoupSession
26  *
27  * A #SoupContentSniffer tries to detect the actual content type of
28  * the files that are being downloaded by looking at some of the data
29  * before the #SoupMessage emits its #SoupMessage::got-headers signal.
30  * #SoupContentSniffer implements #SoupSessionFeature, so you can add
31  * content sniffing to a session with soup_session_add_feature() or
32  * soup_session_add_feature_by_type().
33  *
34  * Since: 2.27.3
35  **/
36
37 static char *sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params);
38 static gsize get_buffer_size (SoupContentSniffer *sniffer);
39
40 static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
41
42 static void request_queued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
43 static void request_unqueued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
44
45 G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
46                          G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
47                                                 soup_content_sniffer_session_feature_init))
48
49 static void
50 soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
51 {
52 }
53
54 static void
55 soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
56 {
57         content_sniffer_class->sniff = sniff;
58         content_sniffer_class->get_buffer_size = get_buffer_size;
59 }
60
61 static void
62 soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
63                                            gpointer interface_data)
64 {
65         feature_interface->request_queued = request_queued;
66         feature_interface->request_unqueued = request_unqueued;
67 }
68
69 /**
70  * soup_content_sniffer_new:
71  *
72  * Creates a new #SoupContentSniffer.
73  *
74  * Returns: a new #SoupContentSniffer
75  *
76  * Since: 2.27.3
77  **/
78 SoupContentSniffer *
79 soup_content_sniffer_new ()
80 {
81         return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
82 }
83
84 /**
85  * soup_content_sniffer_sniff:
86  * @sniffer: a #SoupContentSniffer
87  * @msg: the message to sniff
88  * @buffer: a buffer containing the start of @msg's response body
89  * @params: (element-type utf8 utf8) (out) (transfer full) (allow-none): return
90  *   location for Content-Type parameters (eg, "charset"), or %NULL
91  *
92  * Sniffs @buffer to determine its Content-Type. The result may also
93  * be influenced by the Content-Type declared in @msg's response
94  * headers.
95  *
96  * Return value: the sniffed Content-Type of @buffer; this will never be %NULL,
97  *   but may be "application/octet-stream".
98  */
99 char *
100 soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
101                             SoupMessage *msg, SoupBuffer *buffer,
102                             GHashTable **params)
103 {
104         g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
105         g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
106         g_return_val_if_fail (buffer != NULL, NULL);
107
108         return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
109 }
110
111 /* This table is based on the HTML5 spec;
112  * See 2.7.4 Content-Type sniffing: unknown type
113  */
114 typedef struct {
115         /* @has_ws is TRUE if @pattern contains "generic" whitespace */
116         gboolean      has_ws;
117         const guchar *mask;
118         const guchar *pattern;
119         guint         pattern_length;
120         const char   *sniffed_type;
121         gboolean      scriptable;
122 } SoupContentSnifferPattern;
123
124 #if ENABLE(TIZEN_FIX_CONTENT_SNIFFER_PATTERN)
125 /* This table is updated by TIZEN team(steve.jun@samsung.com), based on draft-abarth-mime-sniff-06
126  * (http://tools.ietf.org/html/draft-abarth-mime-sniff-06);
127  * See 5. Unknown Type
128  */
129 static SoupContentSnifferPattern types_table[] = {
130         // <!DOCTYPE\xHTML
131         { TRUE,
132                 (const guchar *)"\xFF\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF\xFF",
133                 (const guchar *)" \x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
134                 14,
135                 "text/html",
136                 TRUE },
137         // <HTML
138         { TRUE,
139                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xFF",
140                 (const guchar *)" \x3C\x48\x54\x4D\x4C",
141                 5,
142                 "text/html",
143                 TRUE },
144         // <HEAD
145         { TRUE,
146                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xFF",
147                 (const guchar *)" \x3C\x48\x45\x41\x44",
148                 5,
149                 "text/html",
150                 TRUE },
151         // <SCRIPT
152         { TRUE,
153                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF",
154                 (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
155                 7,
156                 "text/html",
157                 TRUE },
158         // <IFRAME
159         { TRUE,
160                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF",
161                 (const guchar *)" \x3C\x49\x46\x52\x41\x4d\x45",
162                 7,
163                 "text/html",
164                 TRUE },
165         // <H1
166         { TRUE,
167                 (const guchar *)"\xFF\xFF\xDF\xFF\xFF",
168                 (const guchar *)" \x3C\x48\x31",
169                 3,
170                 "text/html",
171                 TRUE },
172         // <DIV
173         { TRUE,
174                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xFF",
175                 (const guchar *)" \x3C\x44\x49\x56",
176                 4,
177                 "text/html",
178                 TRUE },
179         // <FONT
180         { TRUE,
181                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xFF",
182                 (const guchar *)" \x3C\x46\x4f\x4e\x54",
183                 5,
184                 "text/html",
185                 TRUE },
186         // <TABLE
187         { TRUE,
188                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xFF",
189                 (const guchar *)" \x3C\x54\x41\x42\x4c\x45",
190                 6,
191                 "text/html",
192                 TRUE },
193         // <A
194         { TRUE,
195                 (const guchar *)"\xFF\xFF\xDF\xFF",
196                 (const guchar *)" \x3C\x41",
197                 2,
198                 "text/html",
199                 TRUE },
200         // <STYLE
201         { TRUE,
202                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xFF",
203                 (const guchar *)" \x3C\x53\x54\x59\x4c\x45",
204                 6,
205                 "text/html",
206                 TRUE },
207         // <TITLE
208         { TRUE,
209                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xFF",
210                 (const guchar *)" \x3C\x54\x49\x54\x4c\x45",
211                 6,
212                 "text/html",
213                 TRUE },
214         // <B
215         { TRUE,
216                 (const guchar *)"\xFF\xFF\xDF\xFF",
217                 (const guchar *)" \x3C\x42",
218                 2,
219                 "text/html",
220                 TRUE },
221         // <BODY
222         { TRUE,
223                 (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xFF",
224                 (const guchar *)" \x3C\x42\x4f\x44\x59",
225                 5,
226                 "text/html",
227                 TRUE },
228         // <BR
229         { TRUE,
230                 (const guchar *)"\xFF\xFF\xDF\xDF\xFF",
231                 (const guchar *)" \x3C\x42\x52",
232                 3,
233                 "text/html",
234                 TRUE },
235         // <P
236         { TRUE,
237                 (const guchar *)"\xFF\xFF\xDF\xFF",
238                 (const guchar *)" \x3C\x50",
239                 2,
240                 "text/html",
241                 TRUE },
242         // <!--
243         { TRUE,
244                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
245                 (const guchar *)" \x3C\x21\x2d\x2d",
246                 4,
247                 "text/html",
248                 TRUE },
249         // <?xml (Note the case sensitivity and lack of trailing _>)
250         { TRUE,
251                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
252                 (const guchar *)" \x3C\x3f\x78\x6d\x6c",
253                 5,
254                 "text/xml",
255                 TRUE },
256         // The string "%PDF-", the PDF signature.
257         { FALSE,
258                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
259                 (const guchar *)"\x25\x50\x44\x46\x2D",
260                 5,
261                 "application/pdf",
262                 TRUE },
263         // The string "%!PS-Adobe-", the PostScript signature.
264         { FALSE,
265                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
266                 (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
267                 11,
268                 "application/postscript",
269                 FALSE },
270         // UTF-16BE BOM
271         { FALSE,
272                 (const guchar *)"\xFF\xFF\x00\x00",
273                 (const guchar *)"\xFE\xFF\x00\x00",
274                 4,
275                 "text/plain",
276                 FALSE },
277         // UTF-16LE BOM
278         { FALSE,
279                 (const guchar *)"\xFF\xFF\x00\x00",
280                 (const guchar *)"\xFF\xFE\x00\x00",
281                 4,
282                 "text/plain",
283                 FALSE },
284         // UTF-8 BOM
285         { FALSE,
286                 (const guchar *)"\xFF\xFF\xFF\x00",
287                 (const guchar *)"\xEF\xBB\xBF\x00",
288                 4,
289                 "text/plain",
290                 FALSE },
291         // The string "GIF87a", a GIF signature.
292         { FALSE,
293                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
294                 (const guchar *)"\x47\x49\x46\x38\x37\x61",
295                 6,
296                 "image/gif",
297                 FALSE },
298         // The string "GIF89a", a GIF signature.
299         { FALSE,
300                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
301                 (const guchar *)"\x47\x49\x46\x38\x39\x61",
302                 6,
303                 "image/gif",
304                 FALSE },
305         // The PNG signature.
306         { FALSE,
307                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
308                 (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
309                 8,
310                 "image/png",
311                 FALSE },
312         // A JPEG SOI marker followed by a octet of another marker.
313         { FALSE,
314                 (const guchar *)"\xFF\xFF\xFF",
315                 (const guchar *)"\xFF\xD8\xFF",
316                 3,
317                 "image/jpeg",
318                 FALSE },
319         // The string "BM", a BMP signature.
320         { FALSE,
321                 (const guchar *)"\xFF\xFF",
322                 (const guchar *)"\x42\x4D",
323                 2,
324                 "image/bmp",
325                 FALSE },
326         // "RIFF" followed by four bytes, followed by "WEBPVP".
327         { FALSE,
328                 (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
329                 (const guchar *)"\x52\x49\x46\x46\x00\x00\x00\x00\x57\x45\x42\x50\x56\x50",
330                 14,
331                 "image/webp",
332                 FALSE },
333         // A Windows Icon signature.
334         { FALSE,
335                 (const guchar *)"\xFF\xFF\xFF\xFF",
336                 (const guchar *)"\x00\x00\x01\x00",
337                 4,
338                 "image/vnd.microsoft.icon",
339                 FALSE },
340         // An Ogg Vorbis audio or video signature.
341         { FALSE,
342                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
343                 (const guchar *)"\x4F\x67\x67\x53\x00",
344                 5,
345                 "application/ogg",
346                 FALSE },
347         // "RIFF" followed by four bytes, followed by "WAVE".
348         { FALSE,
349                 (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
350                 (const guchar *)"\x52\x49\x46\x46\x00\x00\x00\x00\x57\x41\x56\x45",
351                 12,
352                 "audio/x-wave",
353                 FALSE },
354         // The WebM signature [TODO: Use more octets?] vidow: typo(!)
355         { FALSE,
356                 (const guchar *)"\xFF\xFF\xFF\xFF",
357                 (const guchar *)"\x1A\x45\xDF\xA3",
358                 4,
359                 "vidow/webm",
360                 FALSE },
361         // A RAR archive.
362         { FALSE,
363                 (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
364                 (const guchar *)"\x52\x61\x72\x20\x1A\x07\x00",
365                 7,
366                 "application/x-rar-compressed",
367                 FALSE },
368         // A ZIP archive.
369         { FALSE,
370                 (const guchar *)"\xFF\xFF\xFF\xFF",
371                 (const guchar *)"\x50\x4B\x03\x04",
372                 4,
373                 "application/zip",
374                 FALSE },
375         // A GZIP archive.
376         { FALSE,
377                 (const guchar *)"\xFF\xFF\xFF",
378                 (const guchar *)"\x1F\x8B\x08",
379                 3,
380                 "application/x-gzip",
381                 FALSE }
382 };
383 #else
384 static SoupContentSnifferPattern types_table[] = {
385         { FALSE,
386           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
387           (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
388           14,
389           "text/html",
390           TRUE },
391
392         { TRUE,
393           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
394           (const guchar *)" \x3C\x48\x54\x4D\x4C",
395           5,
396           "text/html",
397           TRUE },
398
399         { TRUE,
400           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
401           (const guchar *)" \x3C\x48\x45\x41\x44",
402           5,
403           "text/html",
404           TRUE },
405
406         { TRUE,
407           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
408           (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
409           7,
410           "text/html",
411           TRUE },
412
413         { FALSE,
414           (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
415           (const guchar *)"\x25\x50\x44\x46\x2D",
416           5,
417           "application/pdf",
418           TRUE },
419
420         { FALSE,
421           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
422           (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
423           11,
424           "application/postscript",
425           FALSE },
426
427         { FALSE,
428           (const guchar *)"\xFF\xFF\x00\x00",
429           (const guchar *)"\xFE\xFF\x00\x00",
430           4,
431           "text/plain",
432           FALSE },
433
434         { FALSE,
435           (const guchar *)"\xFF\xFF\x00\x00",
436           (const guchar *)"\xFF\xFF\x00\x00",
437           4,
438           "text/plain",
439           FALSE },
440
441         { FALSE,
442           (const guchar *)"\xFF\xFF\xFF\x00",
443           (const guchar *)"\xEF\xBB\xBF\x00",
444           4,
445           "text/plain",
446           FALSE },
447
448         { FALSE,
449           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
450           (const guchar *)"\x47\x49\x46\x38\x37\x61",
451           6,
452           "image/gif",
453           FALSE },
454
455         { FALSE,
456           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
457           (const guchar *)"\x47\x49\x46\x38\x39\x61",
458           6,
459           "image/gif",
460           FALSE },
461
462         { FALSE,
463           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
464           (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
465           8,
466           "image/png",
467           FALSE },
468
469         { FALSE,
470           (const guchar *)"\xFF\xFF\xFF",
471           (const guchar *)"\xFF\xD8\xFF",
472           3,
473           "image/jpeg",
474           FALSE },
475
476         { FALSE,
477           (const guchar *)"\xFF\xFF",
478           (const guchar *)"\x42\x4D",
479           2,
480           "image/bmp",
481           FALSE },
482
483         { FALSE,
484           (const guchar *)"\xFF\xFF\xFF\xFF",
485           (const guchar *)"\x00\x00\x01\x00",
486           4,
487           "image/vnd.microsoft.icon",
488           FALSE }
489 };
490 #endif
491
492 /* Whether a given byte looks like it might be part of binary content.
493  * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
494  * which is BSD-licensed
495  */
/* Lookup table indexed by byte value: non-zero means the byte counts
 * as "binary". The only flagged bytes are control characters below
 * 0x20, excluding TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) and
 * ESC (0x1B).
 */
static char byte_looks_binary[256] = {
        /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
        /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xC0 - 0xCF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xE0 - 0xEF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xF0 - 0xFF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
514
515 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
516 static char*
517 sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
518                SoupBuffer *buffer, gboolean for_text_or_binary)
519 {
520         const guchar *resource = (const guchar *)buffer->data;
521         int resource_length = MIN (512, buffer->length);
522         int i;
523
524         for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
525                 SoupContentSnifferPattern *type_row = &(types_table[i]);
526
527                 /* The scriptable types should be skiped for the text
528                  * or binary path, but considered for other paths */
529                 if (for_text_or_binary && type_row->scriptable)
530                         continue;
531
532                 if (type_row->has_ws) {
533                         int index_stream = 0;
534                         int index_pattern = 0;
535                         gboolean skip_row = FALSE;
536
537                         while ((index_stream < resource_length) &&
538                                (index_pattern <= type_row->pattern_length)) {
539                                 /* Skip insignificant white space ("WS" in the spec) */
540                                 if (type_row->pattern[index_pattern] == ' ') {
541                                         if (resource[index_stream] == '\x09' ||
542                                             resource[index_stream] == '\x0a' ||
543                                             resource[index_stream] == '\x0c' ||
544                                             resource[index_stream] == '\x0d' ||
545                                             resource[index_stream] == '\x20')
546                                                 index_stream++;
547                                         else
548                                                 index_pattern++;
549                                 } else {
550                                         if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
551                                                 skip_row = TRUE;
552                                                 break;
553                                         }
554                                         index_pattern++;
555                                         index_stream++;
556                                 }
557                         }
558
559                         if (skip_row)
560                                 continue;
561
562                         if (index_pattern > type_row->pattern_length)
563                                 return g_strdup (type_row->sniffed_type);
564                 } else {
565                         int j;
566
567                         if (resource_length < type_row->pattern_length)
568                                 continue;
569
570                         for (j = 0; j < type_row->pattern_length; j++) {
571                                 if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
572                                         break;
573                         }
574
575                         /* This means our comparison above matched completely */
576                         if (j == type_row->pattern_length)
577                                 return g_strdup (type_row->sniffed_type);
578                 }
579         }
580
581         if (for_text_or_binary)
582                 return g_strdup ("application/octet-stream");
583
584         for (i = 0; i < resource_length; i++) {
585                 if (byte_looks_binary[resource[i]])
586                         return g_strdup ("application/octet-stream");
587         }
588
589         return g_strdup ("text/plain");
590 }
591
592 /* HTML5: 2.7.3 Content-Type sniffing: text or binary */
593 static char*
594 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg,
595                       SoupBuffer *buffer)
596 {
597         const guchar *resource = (const guchar *)buffer->data;
598         int resource_length = MIN (512, buffer->length);
599         gboolean looks_binary = FALSE;
600         int i;
601
602         /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */
603         if (resource_length >= 4) {
604                 if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
605                     (resource[0] == 0xFF && resource[1] == 0xFE) ||
606                     (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
607                         return g_strdup ("text/plain");
608         }
609
610         /* Look to see if any of the first n bytes looks binary */
611         for (i = 0; i < resource_length; i++) {
612                 if (byte_looks_binary[resource[i]]) {
613                         looks_binary = TRUE;
614                         break;
615                 }
616         }
617
618         if (!looks_binary)
619                 return g_strdup ("text/plain");
620
621         return sniff_unknown (sniffer, msg, buffer, TRUE);
622 }
623
624 static char*
625 sniff_images (SoupContentSniffer *sniffer, SoupMessage *msg,
626               SoupBuffer *buffer, const char *content_type)
627 {
628         const guchar *resource = (const guchar *)buffer->data;
629         int resource_length = MIN (512, buffer->length);
630         int i;
631
632         for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
633                 SoupContentSnifferPattern *type_row = &(types_table[i]);
634
635                 if (resource_length < type_row->pattern_length)
636                         continue;
637
638                 if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
639                         continue;
640
641                 /* All of the image types use all-\xFF for the mask,
642                  * so we can just memcmp.
643                  */
644                 if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
645                         return g_strdup (type_row->sniffed_type);
646         }
647
648         return g_strdup (content_type);
649 }
650
651 static char*
652 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
653 {
654         const guchar *resource = (const guchar *)buffer->data;
655         int resource_length = MIN (512, buffer->length);
656         int pos = 0;
657
658         if (resource_length < 3)
659                 goto text_html;
660
661         /* Skip a leading UTF-8 BOM */
662         if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
663                 pos = 3;
664
665  look_for_tag:
666         if (pos > resource_length)
667                 goto text_html;
668
669         /* Skip insignificant white space */
670         while ((resource[pos] == '\x09') ||
671                (resource[pos] == '\x20') ||
672                (resource[pos] == '\x0A') ||
673                (resource[pos] == '\x0D')) {
674                 pos++;
675
676                 if (pos > resource_length)
677                         goto text_html;
678         }
679
680         /* != < */
681         if (resource[pos] != '\x3C')
682                 return g_strdup ("text/html");
683
684         pos++;
685
686         if ((pos + 2) > resource_length)
687                 goto text_html;
688
689         /* Skipping comments */
690         if ((resource[pos] == '\x2D') ||
691             (resource[pos+1] == '\x2D') ||
692             (resource[pos+2] == '\x3E')) {
693                 pos = pos + 3;
694
695                 if ((pos + 2) > resource_length)
696                         goto text_html;
697
698                 while ((resource[pos] != '\x2D') &&
699                        (resource[pos+1] != '\x2D') &&
700                        (resource[pos+2] != '\x3E')) {
701                         pos++;
702
703                         if ((pos + 2) > resource_length)
704                                 goto text_html;
705                 }
706
707                 goto look_for_tag;
708         }
709
710         if (pos > resource_length)
711                 goto text_html;
712
713         /* == ! */
714         if (resource[pos] == '\x21') {
715                 do {
716                         pos++;
717
718                         if (pos > resource_length)
719                                 goto text_html;
720                 } while (resource[pos] != '\x3E');
721
722                 pos++;
723
724                 goto look_for_tag;
725         } else if (resource[pos] == '\x3F') { /* ? */
726                 do {
727                         pos++;
728
729                         if ((pos + 1) > resource_length)
730                                 goto text_html;
731                 } while ((resource[pos] != '\x3F') &&
732                          (resource[pos+1] != '\x3E'));
733
734                 pos = pos + 2;
735
736                 goto look_for_tag;
737         }
738
739         if ((pos + 2) > resource_length)
740                 goto text_html;
741
742         if ((resource[pos] == '\x72') &&
743             (resource[pos+1] == '\x73') &&
744             (resource[pos+2] == '\x73'))
745                 return g_strdup ("application/rss+xml");
746
747         if ((pos + 3) > resource_length)
748                 goto text_html;
749
750         if ((resource[pos] == '\x66') &&
751             (resource[pos+1] == '\x65') &&
752             (resource[pos+2] == '\x65') &&
753             (resource[pos+3] == '\x64'))
754                 return g_strdup ("application/atom+xml");
755
756  text_html:
757         return g_strdup ("text/html");
758 }
759
760 static char*
761 sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params)
762 {
763         const char *content_type;
764
765         content_type = soup_message_headers_get_content_type (msg->response_headers, params);
766
767         /* These comparisons are done in an ASCII-case-insensitive
768          * manner because the spec requires it */
769         if ((content_type == NULL) ||
770             !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
771             !g_ascii_strcasecmp (content_type, "application/unknown") ||
772             !g_ascii_strcasecmp (content_type, "*/*"))
773                 return sniff_unknown (sniffer, msg, buffer, FALSE);
774
775         if (g_str_has_suffix (content_type, "+xml") ||
776             !g_ascii_strcasecmp (content_type, "text/xml") ||
777             !g_ascii_strcasecmp (content_type, "application/xml"))
778                 return g_strdup (content_type);
779
780         /* 2.7.5 Content-Type sniffing: image
781          * The spec says:
782          *
783          *   If the resource's official type is "image/svg+xml", then
784          *   the sniffed type of the resource is its official type (an
785          *   XML type)
786          *
787          * The XML case is handled by the if above; if you refactor
788          * this code, keep this in mind.
789          */
790         if (!g_ascii_strncasecmp (content_type, "image/", 6))
791                 return sniff_images (sniffer, msg, buffer, content_type);
792
793         /* If we got text/plain, use text_or_binary */
794         if (g_str_equal (content_type, "text/plain")) {
795                 return sniff_text_or_binary (sniffer, msg, buffer);
796         }
797
798         if (!g_ascii_strcasecmp (content_type, "text/html"))
799                 return sniff_feed_or_html (sniffer, msg, buffer);
800
801         return g_strdup (content_type);
802 }
803
804 static gsize
805 get_buffer_size (SoupContentSniffer *sniffer)
806 {
807         return 512;
808 }
809
810 static void
811 soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
812 {
813         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
814         SoupContentSnifferClass *content_sniffer_class = SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer);
815
816         priv->bytes_for_sniffing = content_sniffer_class->get_buffer_size (sniffer);
817 }
818
819 static void
820 request_queued (SoupSessionFeature *feature, SoupSession *session,
821                 SoupMessage *msg)
822 {
823         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
824
825         priv->sniffer = g_object_ref (feature);
826         g_signal_connect (msg, "got-headers",
827                           G_CALLBACK (soup_content_sniffer_got_headers_cb),
828                           feature);
829 }
830
831 static void
832 request_unqueued (SoupSessionFeature *feature, SoupSession *session,
833                   SoupMessage *msg)
834 {
835         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
836
837         g_object_unref (priv->sniffer);
838         priv->sniffer = NULL;
839
840         g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
841 }