soup-request-file: plug leak
[platform/upstream/libsoup.git] / libsoup / soup-content-sniffer.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * soup-content-sniffer.c
4  *
5  * Copyright (C) 2009 Gustavo Noronha Silva.
6  */
7
8 #ifdef HAVE_CONFIG_H
9 #include <config.h>
10 #endif
11
12 #include <string.h>
13 #include <gio/gio.h>
14
15 #include "soup-content-sniffer.h"
16 #include "soup-enum-types.h"
17 #include "soup-message.h"
18 #include "soup-message-private.h"
19 #include "soup-session-feature.h"
20 #include "soup-uri.h"
21
22 /**
23  * SECTION:soup-content-sniffer
24  * @short_description: Content sniffing for #SoupSession
25  *
26  * A #SoupContentSniffer tries to detect the actual content type of
27  * the files that are being downloaded by looking at some of the data
28  * before the #SoupMessage emits its #SoupMessage::got-headers signal.
29  * #SoupContentSniffer implements #SoupSessionFeature, so you can add
30  * content sniffing to a session with soup_session_add_feature() or
31  * soup_session_add_feature_by_type().
32  *
33  * Since: 2.27.3
34  **/
35
/* Default implementations of the SoupContentSnifferClass virtual
 * methods; they are installed in soup_content_sniffer_class_init().
 */
static char *sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params);
static gsize get_buffer_size (SoupContentSniffer *sniffer);

/* Fills in the SoupSessionFeature interface vtable for this type. */
static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);

/* SoupSessionFeature hooks, installed by
 * soup_content_sniffer_session_feature_init().
 */
static void request_queued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
static void request_unqueued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);

/* Registers SoupContentSniffer as a GObject subclass that implements
 * the SoupSessionFeature interface.
 */
G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
                         G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
                                                soup_content_sniffer_session_feature_init))
47
48 static void
49 soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
50 {
51 }
52
53 static void
54 soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
55 {
56         content_sniffer_class->sniff = sniff;
57         content_sniffer_class->get_buffer_size = get_buffer_size;
58 }
59
60 static void
61 soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
62                                            gpointer interface_data)
63 {
64         feature_interface->request_queued = request_queued;
65         feature_interface->request_unqueued = request_unqueued;
66 }
67
68 /**
69  * soup_content_sniffer_new:
70  *
71  * Creates a new #SoupContentSniffer.
72  *
73  * Returns: a new #SoupContentSniffer
74  *
75  * Since: 2.27.3
76  **/
77 SoupContentSniffer *
78 soup_content_sniffer_new ()
79 {
80         return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
81 }
82
83 char *
84 soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
85                             SoupMessage *msg, SoupBuffer *buffer,
86                             GHashTable **params)
87 {
88         g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
89         g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
90         g_return_val_if_fail (buffer != NULL, NULL);
91
92         return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
93 }
94
95 /* This table is based on the HTML5 spec;
96  * See 2.7.4 Content-Type sniffing: unknown type
97  */
98 typedef struct {
99         /* @has_ws is TRUE if @pattern contains "generic" whitespace */
100         gboolean      has_ws;
101         const guchar *mask;
102         const guchar *pattern;
103         guint         pattern_length;
104         const char   *sniffed_type;
105         gboolean      scriptable;
106 } SoupContentSnifferPattern;
107
108 static SoupContentSnifferPattern types_table[] = {
109         { FALSE,
110           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
111           (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
112           14,
113           "text/html",
114           TRUE },
115
116         { TRUE,
117           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
118           (const guchar *)" \x3C\x48\x54\x4D\x4C",
119           5,
120           "text/html",
121           TRUE },
122
123         { TRUE,
124           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
125           (const guchar *)" \x3C\x48\x45\x41\x44",
126           5,
127           "text/html",
128           TRUE },
129
130         { TRUE,
131           (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
132           (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
133           7,
134           "text/html",
135           TRUE },
136
137         { FALSE,
138           (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
139           (const guchar *)"\x25\x50\x44\x46\x2D",
140           5,
141           "application/pdf",
142           TRUE },
143
144         { FALSE,
145           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
146           (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
147           11,
148           "application/postscript",
149           FALSE },
150
151         { FALSE,
152           (const guchar *)"\xFF\xFF\x00\x00",
153           (const guchar *)"\xFE\xFF\x00\x00",
154           4,
155           "text/plain",
156           FALSE },
157
158         { FALSE,
159           (const guchar *)"\xFF\xFF\x00\x00",
160           (const guchar *)"\xFF\xFF\x00\x00",
161           4,
162           "text/plain",
163           FALSE },
164
165         { FALSE,
166           (const guchar *)"\xFF\xFF\xFF\x00",
167           (const guchar *)"\xEF\xBB\xBF\x00",
168           4,
169           "text/plain",
170           FALSE },
171
172         { FALSE,
173           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
174           (const guchar *)"\x47\x49\x46\x38\x37\x61",
175           6,
176           "image/gif",
177           FALSE },
178
179         { FALSE,
180           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
181           (const guchar *)"\x47\x49\x46\x38\x39\x61",
182           6,
183           "image/gif",
184           FALSE },
185
186         { FALSE,
187           (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
188           (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
189           8,
190           "image/png",
191           FALSE },
192
193         { FALSE,
194           (const guchar *)"\xFF\xFF\xFF",
195           (const guchar *)"\xFF\xD8\xFF",
196           3,
197           "image/jpeg",
198           FALSE },
199
200         { FALSE,
201           (const guchar *)"\xFF\xFF",
202           (const guchar *)"\x42\x4D",
203           2,
204           "image/bmp",
205           FALSE },
206
207         { FALSE,
208           (const guchar *)"\xFF\xFF\xFF\xFF",
209           (const guchar *)"\x00\x00\x01\x00",
210           4,
211           "image/vnd.microsoft.icon",
212           FALSE }
213 };
214
/* Lookup table indexed by byte value: non-zero means the byte looks
 * like it might be part of binary content. Only control characters
 * are flagged; TAB, LF, FF, CR and ESC are treated as text.
 * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
 * which is BSD-licensed
 */
static char byte_looks_binary[] = {
        1, 1, 1, 1, 1, 1, 1, 1,  /* 0x00 - 0x07 */
        1, 0, 0, 1, 0, 0, 1, 1,  /* 0x08 - 0x0F */
        1, 1, 1, 1, 1, 1, 1, 1,  /* 0x10 - 0x17 */
        1, 1, 1, 0, 1, 1, 1, 1,  /* 0x18 - 0x1F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x20 - 0x27 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x28 - 0x2F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x30 - 0x37 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x38 - 0x3F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x40 - 0x47 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x48 - 0x4F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x50 - 0x57 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x58 - 0x5F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x60 - 0x67 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x68 - 0x6F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x70 - 0x77 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x78 - 0x7F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80 - 0x87 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x88 - 0x8F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90 - 0x97 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0x98 - 0x9F */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xA0 - 0xA7 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xA8 - 0xAF */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xB0 - 0xB7 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xB8 - 0xBF */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xC0 - 0xC7 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xC8 - 0xCF */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xD0 - 0xD7 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xD8 - 0xDF */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xE0 - 0xE7 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xE8 - 0xEF */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xF0 - 0xF7 */
        0, 0, 0, 0, 0, 0, 0, 0,  /* 0xF8 - 0xFF */
};
237
238 static char *
239 sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
240 {
241         SoupURI *uri;
242         char *uri_path;
243         char *content_type;
244         char *mime_type;
245         gboolean uncertain;
246
247         uri = soup_message_get_uri (msg);
248         uri_path = soup_uri_to_string (uri, TRUE);
249
250         content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, &uncertain);
251         mime_type = g_content_type_get_mime_type (content_type);
252
253         g_free (uri_path);
254         g_free (content_type);
255
256         return mime_type;
257 }
258
259 /* HTML5: 2.7.4 Content-Type sniffing: unknown type */
260 static char*
261 sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
262                SoupBuffer *buffer, gboolean for_text_or_binary)
263 {
264         const guchar *resource = (const guchar *)buffer->data;
265         int resource_length = MIN (512, buffer->length);
266         char *gio_guess;
267         int i;
268
269         for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
270                 SoupContentSnifferPattern *type_row = &(types_table[i]);
271
272                 /* The scriptable types should be skiped for the text
273                  * or binary path, but considered for other paths */
274                 if (for_text_or_binary && type_row->scriptable)
275                         continue;
276
277                 if (type_row->has_ws) {
278                         int index_stream = 0;
279                         int index_pattern = 0;
280                         gboolean skip_row = FALSE;
281
282                         while ((index_stream < resource_length) &&
283                                (index_pattern <= type_row->pattern_length)) {
284                                 /* Skip insignificant white space ("WS" in the spec) */
285                                 if (type_row->pattern[index_pattern] == ' ') {
286                                         if (resource[index_stream] == '\x09' ||
287                                             resource[index_stream] == '\x0a' ||
288                                             resource[index_stream] == '\x0c' ||
289                                             resource[index_stream] == '\x0d' ||
290                                             resource[index_stream] == '\x20')
291                                                 index_stream++;
292                                         else
293                                                 index_pattern++;
294                                 } else {
295                                         if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
296                                                 skip_row = TRUE;
297                                                 break;
298                                         }
299                                         index_pattern++;
300                                         index_stream++;
301                                 }
302                         }
303
304                         if (skip_row)
305                                 continue;
306
307                         if (index_pattern > type_row->pattern_length)
308                                 return g_strdup (type_row->sniffed_type);
309                 } else {
310                         int j;
311
312                         if (resource_length < type_row->pattern_length)
313                                 continue;
314
315                         for (j = 0; j < type_row->pattern_length; j++) {
316                                 if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
317                                         break;
318                         }
319
320                         /* This means our comparison above matched completely */
321                         if (j == type_row->pattern_length)
322                                 return g_strdup (type_row->sniffed_type);
323                 }
324         }
325
326         /* The spec allows us to use platform sniffing to find out
327          * about other types that are not covered, but we need to be
328          * careful to not escalate privileges, if on text or binary.
329          */
330         gio_guess = sniff_gio (sniffer, msg, buffer);
331
332         if (for_text_or_binary) {
333                 for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
334                         SoupContentSnifferPattern *type_row = &(types_table[i]);
335
336                         if (!g_ascii_strcasecmp (type_row->sniffed_type, gio_guess) &&
337                             type_row->scriptable) {
338                                 g_free (gio_guess);
339                                 gio_guess = NULL;
340                                 break;
341                         }
342                 }
343         }
344
345         if (gio_guess)
346                 return gio_guess;
347
348         return g_strdup ("application/octet-stream");
349 }
350
351 /* HTML5: 2.7.3 Content-Type sniffing: text or binary */
352 static char*
353 sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg,
354                       SoupBuffer *buffer)
355 {
356         const guchar *resource = (const guchar *)buffer->data;
357         int resource_length = MIN (512, buffer->length);
358         gboolean looks_binary = FALSE;
359         int i;
360
361         /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */
362         if (resource_length >= 4) {
363                 if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
364                     (resource[0] == 0xFF && resource[1] == 0xFE) ||
365                     (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
366                         return g_strdup ("text/plain");
367         }
368
369         /* Look to see if any of the first n bytes looks binary */
370         for (i = 0; i < resource_length; i++) {
371                 if (byte_looks_binary[resource[i]]) {
372                         looks_binary = TRUE;
373                         break;
374                 }
375         }
376
377         if (!looks_binary)
378                 return g_strdup ("text/plain");
379
380         return sniff_unknown (sniffer, msg, buffer, TRUE);
381 }
382
383 static char*
384 sniff_images (SoupContentSniffer *sniffer, SoupMessage *msg,
385               SoupBuffer *buffer, const char *content_type)
386 {
387         const guchar *resource = (const guchar *)buffer->data;
388         int resource_length = MIN (512, buffer->length);
389         int i;
390
391         for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
392                 SoupContentSnifferPattern *type_row = &(types_table[i]);
393
394                 if (resource_length < type_row->pattern_length)
395                         continue;
396
397                 if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
398                         continue;
399
400                 /* All of the image types use all-\xFF for the mask,
401                  * so we can just memcmp.
402                  */
403                 if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
404                         return g_strdup (type_row->sniffed_type);
405         }
406
407         return g_strdup (content_type);
408 }
409
410 static char*
411 sniff_feed_or_html (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
412 {
413         const guchar *resource = (const guchar *)buffer->data;
414         int resource_length = MIN (512, buffer->length);
415         int pos = 0;
416
417         if (resource_length < 3)
418                 goto text_html;
419
420         /* Skip a leading UTF-8 BOM */
421         if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
422                 pos = 3;
423
424  look_for_tag:
425         if (pos > resource_length)
426                 goto text_html;
427
428         /* Skip insignificant white space */
429         while ((resource[pos] == '\x09') ||
430                (resource[pos] == '\x20') ||
431                (resource[pos] == '\x0A') ||
432                (resource[pos] == '\x0D')) {
433                 pos++;
434
435                 if (pos > resource_length)
436                         goto text_html;
437         }
438
439         /* != < */
440         if (resource[pos] != '\x3C')
441                 return g_strdup ("text/html");
442
443         pos++;
444
445         if ((pos + 2) > resource_length)
446                 goto text_html;
447
448         /* Skipping comments */
449         if ((resource[pos] == '\x2D') ||
450             (resource[pos+1] == '\x2D') ||
451             (resource[pos+2] == '\x3E')) {
452                 pos = pos + 3;
453
454                 if ((pos + 2) > resource_length)
455                         goto text_html;
456
457                 while ((resource[pos] != '\x2D') &&
458                        (resource[pos+1] != '\x2D') &&
459                        (resource[pos+2] != '\x3E')) {
460                         pos++;
461
462                         if ((pos + 2) > resource_length)
463                                 goto text_html;
464                 }
465
466                 goto look_for_tag;
467         }
468
469         if (pos > resource_length)
470                 goto text_html;
471
472         /* == ! */
473         if (resource[pos] == '\x21') {
474                 do {
475                         pos++;
476
477                         if (pos > resource_length)
478                                 goto text_html;
479                 } while (resource[pos] != '\x3E');
480
481                 pos++;
482
483                 goto look_for_tag;
484         } else if (resource[pos] == '\x3F') { /* ? */
485                 do {
486                         pos++;
487
488                         if ((pos + 1) > resource_length)
489                                 goto text_html;
490                 } while ((resource[pos] != '\x3F') &&
491                          (resource[pos+1] != '\x3E'));
492
493                 pos = pos + 2;
494
495                 goto look_for_tag;
496         }
497
498         if ((pos + 2) > resource_length)
499                 goto text_html;
500
501         if ((resource[pos] == '\x72') &&
502             (resource[pos+1] == '\x73') &&
503             (resource[pos+2] == '\x73'))
504                 return g_strdup ("application/rss+xml");
505
506         if ((pos + 3) > resource_length)
507                 goto text_html;
508
509         if ((resource[pos] == '\x66') &&
510             (resource[pos+1] == '\x65') &&
511             (resource[pos+2] == '\x65') &&
512             (resource[pos+3] == '\x64'))
513                 return g_strdup ("application/atom+xml");
514
515  text_html:
516         return g_strdup ("text/html");
517 }
518
519 static char*
520 sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params)
521 {
522         const char *content_type;
523
524         content_type = soup_message_headers_get_content_type (msg->response_headers, params);
525
526         /* These comparisons are done in an ASCII-case-insensitive
527          * manner because the spec requires it */
528         if ((content_type == NULL) ||
529             !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
530             !g_ascii_strcasecmp (content_type, "application/unknown") ||
531             !g_ascii_strcasecmp (content_type, "*/*"))
532                 return sniff_unknown (sniffer, msg, buffer, FALSE);
533
534         if (g_str_has_suffix (content_type, "+xml") ||
535             !g_ascii_strcasecmp (content_type, "text/xml") ||
536             !g_ascii_strcasecmp (content_type, "application/xml"))
537                 return g_strdup (content_type);
538
539         /* 2.7.5 Content-Type sniffing: image
540          * The spec says:
541          *
542          *   If the resource's official type is "image/svg+xml", then
543          *   the sniffed type of the resource is its official type (an
544          *   XML type)
545          *
546          * The XML case is handled by the if above; if you refactor
547          * this code, keep this in mind.
548          */
549         if (!g_ascii_strncasecmp (content_type, "image/", 6))
550                 return sniff_images (sniffer, msg, buffer, content_type);
551
552         /* If we got text/plain, use text_or_binary */
553         if (g_str_equal (content_type, "text/plain")) {
554                 return sniff_text_or_binary (sniffer, msg, buffer);
555         }
556
557         if (!g_ascii_strcasecmp (content_type, "text/html"))
558                 return sniff_feed_or_html (sniffer, msg, buffer);
559
560         return g_strdup (content_type);
561 }
562
563 static gsize
564 get_buffer_size (SoupContentSniffer *sniffer)
565 {
566         return 512;
567 }
568
569 static void
570 soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
571 {
572         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
573         SoupContentSnifferClass *content_sniffer_class = SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer);
574
575         priv->bytes_for_sniffing = content_sniffer_class->get_buffer_size (sniffer);
576 }
577
578 static void
579 request_queued (SoupSessionFeature *feature, SoupSession *session,
580                 SoupMessage *msg)
581 {
582         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
583
584         priv->sniffer = g_object_ref (feature);
585         g_signal_connect (msg, "got-headers",
586                           G_CALLBACK (soup_content_sniffer_got_headers_cb),
587                           feature);
588 }
589
590 static void
591 request_unqueued (SoupSessionFeature *feature, SoupSession *session,
592                   SoupMessage *msg)
593 {
594         SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
595
596         g_object_unref (priv->sniffer);
597         priv->sniffer = NULL;
598
599         g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
600 }