gst-libs/gst/tag/id3v2frames.c

   1 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: t; c-basic-offset: 2 -*- */
   2 /* Copyright 2006-2008 Tim-Philipp Müller <tim centricular net>
   3  * Copyright 2005 Jan Schmidt <thaytan@mad.scientist.com>
   4  * Copyright 2002,2003 Scott Wheeler <wheeler@kde.org> (portions from taglib)
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Library General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Library General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Library General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include "config.h"
  24 #endif
  25
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <gst/tag/tag.h>
  30 #include <gst/base/gsttypefindhelper.h>
  31
  32 #ifdef HAVE_ZLIB
  33 #include <zlib.h>
  34 #endif
  35
  36 #include "id3v2.h"
  37
  38 #ifndef GST_DISABLE_GST_DEBUG
  39 #define GST_CAT_DEFAULT id3v2_ensure_debug_category()
  40 #endif
  41
  42 static gboolean parse_comment_frame (ID3TagsWorking * work);
  43 static gchar *parse_url_link_frame (ID3TagsWorking * work,
  44     const gchar ** tag_name);
  45 static GArray *parse_text_identification_frame (ID3TagsWorking * work);
  46 static gchar *parse_user_text_identification_frame (ID3TagsWorking * work,
  47     const gchar ** tag_name);
  48 static gchar *parse_unique_file_identifier (ID3TagsWorking * work,
  49     const gchar ** tag_name);
  50 static gboolean parse_relative_volume_adjustment_two (ID3TagsWorking * work);
  51 static void parse_obsolete_tdat_frame (ID3TagsWorking * work);
  52 static gboolean id3v2_tag_to_taglist (ID3TagsWorking * work,
  53     const gchar * tag_name, const gchar * tag_str);
  54 /* Parse a single string into an array of gchar* */
  55 static void parse_split_strings (guint8 encoding, gchar * data, gint data_size,
  56     GArray ** out_fields);
  57 static void free_tag_strings (GArray * fields);
  58 static gboolean
  59 id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
  60     GArray * tag_fields);
  61 static gboolean parse_picture_frame (ID3TagsWorking * work);
  62
  63 #define ID3V2_ENCODING_ISO8859 0x00
  64 #define ID3V2_ENCODING_UTF16   0x01
  65 #define ID3V2_ENCODING_UTF16BE 0x02
  66 #define ID3V2_ENCODING_UTF8    0x03
  67
  68 gboolean
  69 id3v2_parse_frame (ID3TagsWorking * work)
  70 {
  71   const gchar *tag_name;
  72   gboolean result = FALSE;
  73   gint i;
  74   guint8 *frame_data = work->hdr.frame_data;
  75   guint frame_data_size = work->cur_frame_size;
  76   gchar *tag_str = NULL;
  77   GArray *tag_fields = NULL;
  78   guint8 *uu_data = NULL;
  79
  80 #ifdef HAVE_ZLIB
  81   guint8 *uncompressed_data = NULL;
  82 #endif
  83
  84   /* Check that the frame id is valid */
  85   for (i = 0; i < 5 && work->frame_id[i] != '\0'; i++) {
  86     if (!g_ascii_isalnum (work->frame_id[i])) {
  87       GST_DEBUG ("Encountered invalid frame_id");
  88       return FALSE;
  89     }
  90   }
  91
  92   /* Can't handle encrypted frames right now (in case we ever do, we'll have
  93    * to do the decryption after the un-unsynchronisation and decompression,
  94    * not here) */
  95   if (work->frame_flags & ID3V2_FRAME_FORMAT_ENCRYPTION) {
  96     GST_WARNING ("Encrypted frames are not supported");
  97     return FALSE;
  98   }
  99
 100   tag_name = gst_tag_from_id3_tag (work->frame_id);
 101   if (tag_name == NULL &&
 102       strncmp (work->frame_id, "RVA2", 4) != 0 &&
 103       strncmp (work->frame_id, "TXXX", 4) != 0 &&
 104       strncmp (work->frame_id, "TDAT", 4) != 0 &&
 105       strncmp (work->frame_id, "UFID", 4) != 0) {
 106     return FALSE;
 107   }
 108
 109   if (work->frame_flags & (ID3V2_FRAME_FORMAT_COMPRESSION |
 110           ID3V2_FRAME_FORMAT_DATA_LENGTH_INDICATOR)) {
 111     if (work->hdr.frame_data_size <= 4)
 112       return FALSE;
 113     if (ID3V2_VER_MAJOR (work->hdr.version) == 3) {
 114       work->parse_size = GST_READ_UINT32_BE (frame_data);
 115     } else {
 116       work->parse_size = id3v2_read_synch_uint (frame_data, 4);
 117     }
 118     frame_data += 4;
 119     frame_data_size -= 4;
 120     GST_LOG ("Un-unsynced data size %d (of %d)", work->parse_size,
 121         frame_data_size);
 122     if (work->parse_size > frame_data_size) {
 123       GST_WARNING ("ID3v2 frame %s data has invalid size %d (>%d)",
 124           work->frame_id, work->parse_size, frame_data_size);
 125       return FALSE;
 126     }
 127   }
 128
 129   /* in v2.3 the frame sizes are not syncsafe, so the entire tag had to be
 130    * unsynced. In v2.4 the frame sizes are syncsafe so it's just the frame
 131    * data that needs un-unsyncing, but not the frame headers. */
 132   if (ID3V2_VER_MAJOR (work->hdr.version) == 4) {
 133     if ((work->hdr.flags & ID3V2_HDR_FLAG_UNSYNC) != 0 ||
 134         ((work->frame_flags & ID3V2_FRAME_FORMAT_UNSYNCHRONISATION) != 0)) {
 135       GST_DEBUG ("Un-unsyncing frame %s", work->frame_id);
 136       uu_data = id3v2_ununsync_data (frame_data, &frame_data_size);
 137       frame_data = uu_data;
 138       GST_MEMDUMP ("ID3v2 frame (un-unsyced)", frame_data, frame_data_size);
 139     }
 140   }
 141
 142   work->parse_size = frame_data_size;
 143
 144   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 145 #ifdef HAVE_ZLIB
 146     uLongf destSize = work->parse_size;
 147     Bytef *dest, *src;
 148
 149     uncompressed_data = g_malloc (work->parse_size);
 150
 151     dest = (Bytef *) uncompressed_data;
 152     src = (Bytef *) frame_data;
 153
 154     if (uncompress (dest, &destSize, src, frame_data_size) != Z_OK) {
 155       g_free (uncompressed_data);
 156       g_free (uu_data);
 157       return FALSE;
 158     }
 159     if (destSize != work->parse_size) {
 160       GST_WARNING
 161           ("Decompressing ID3v2 frame %s did not produce expected size %d bytes (got %lu)",
 162           tag_name, work->parse_size, destSize);
 163       g_free (uncompressed_data);
 164       g_free (uu_data);
 165       return FALSE;
 166     }
 167     work->parse_data = uncompressed_data;
 168 #else
 169     GST_WARNING ("Compressed ID3v2 tag frame could not be decompressed, because"
 170         " libgsttag-" GST_MAJORMINOR " was compiled without zlib support");
 171     g_free (uu_data);
 172     return FALSE;
 173 #endif
 174   } else {
 175     work->parse_data = frame_data;
 176   }
 177
 178   if (work->frame_id[0] == 'T') {
 179     if (strcmp (work->frame_id, "TDAT") == 0) {
 180       parse_obsolete_tdat_frame (work);
 181       result = TRUE;
 182     } else if (strcmp (work->frame_id, "TXXX") == 0) {
 183       /* Handle user text frame */
 184       tag_str = parse_user_text_identification_frame (work, &tag_name);
 185     } else {
 186       /* Text identification frame */
 187       tag_fields = parse_text_identification_frame (work);
 188     }
 189   } else if (work->frame_id[0] == 'W' && strcmp (work->frame_id, "WXXX") != 0) {
 190     /* URL link frame: ISO-8859-1 encoded, one frame per tag */
 191     tag_str = parse_url_link_frame (work, &tag_name);
 192   } else if (!strcmp (work->frame_id, "COMM")) {
 193     /* Comment */
 194     result = parse_comment_frame (work);
 195   } else if (!strcmp (work->frame_id, "APIC")) {
 196     /* Attached picture */
 197     result = parse_picture_frame (work);
 198   } else if (!strcmp (work->frame_id, "RVA2")) {
 199     /* Relative volume */
 200     result = parse_relative_volume_adjustment_two (work);
 201   } else if (!strcmp (work->frame_id, "UFID")) {
 202     /* Unique file identifier */
 203     tag_str = parse_unique_file_identifier (work, &tag_name);
 204   }
 205 #ifdef HAVE_ZLIB
 206   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 207     g_free (uncompressed_data);
 208     uncompressed_data = NULL;
 209     work->parse_data = frame_data;
 210   }
 211 #endif
 212
 213   if (tag_str != NULL) {
 214     /* g_print ("Tag %s value %s\n", tag_name, tag_str); */
 215     result = id3v2_tag_to_taglist (work, tag_name, tag_str);
 216     g_free (tag_str);
 217   }
 218   if (tag_fields != NULL) {
 219     if (strcmp (work->frame_id, "TCON") == 0) {
 220       /* Genre strings need special treatment */
 221       result |= id3v2_genre_fields_to_taglist (work, tag_name, tag_fields);
 222     } else {
 223       gint t;
 224
 225       for (t = 0; t < tag_fields->len; t++) {
 226         tag_str = g_array_index (tag_fields, gchar *, t);
 227         if (tag_str != NULL && tag_str[0] != '\0')
 228           result |= id3v2_tag_to_taglist (work, tag_name, tag_str);
 229       }
 230     }
 231     free_tag_strings (tag_fields);
 232   }
 233
 234   g_free (uu_data);
 235
 236   return result;
 237 }
 238
 239 static gboolean
 240 parse_comment_frame (ID3TagsWorking * work)
 241 {
 242   guint dummy;
 243   guint8 encoding;
 244   gchar language[4];
 245   GArray *fields = NULL;
 246   gchar *description, *text;
 247
 248   if (work->parse_size < 6)
 249     return FALSE;
 250
 251   encoding = work->parse_data[0];
 252   language[0] = g_ascii_tolower (work->parse_data[1]);
 253   language[1] = g_ascii_tolower (work->parse_data[2]);
 254   language[2] = g_ascii_tolower (work->parse_data[3]);
 255   language[3] = '\0';
 256
 257   parse_split_strings (encoding, (gchar *) work->parse_data + 4,
 258       work->parse_size - 4, &fields);
 259
 260   if (fields == NULL || fields->len < 2) {
 261     GST_WARNING ("Failed to decode comment frame");
 262     goto fail;
 263   }
 264   description = g_array_index (fields, gchar *, 0);
 265   text = g_array_index (fields, gchar *, 1);
 266
 267   if (!g_utf8_validate (text, -1, NULL)) {
 268     GST_WARNING ("Converted string is not valid utf-8");
 269     goto fail;
 270   }
 271
 272   /* skip our own dummy descriptions (from id3v2mux) */
 273   if (strlen (description) > 0 && g_utf8_validate (description, -1, NULL) &&
 274       sscanf (description, "c%u", &dummy) != 1) {
 275     gchar *s;
 276
 277     /* must be either an ISO-639-1 or ISO-639-2 language code */
 278     if (language[0] != '\0' &&
 279         g_ascii_isalpha (language[0]) &&
 280         g_ascii_isalpha (language[1]) &&
 281         (g_ascii_isalpha (language[2]) || language[2] == '\0')) {
 282       const gchar *lang_code;
 283
 284       /* prefer two-letter ISO 639-1 code if we have a mapping */
 285       lang_code = gst_tag_get_language_code (language);
 286       s = g_strdup_printf ("%s[%s]=%s", description,
 287           (lang_code) ? lang_code : language, text);
 288     } else {
 289       s = g_strdup_printf ("%s=%s", description, text);
 290     }
 291     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 292         GST_TAG_EXTENDED_COMMENT, s, NULL);
 293     g_free (s);
 294   } else if (text != NULL && *text != '\0') {
 295     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 296         GST_TAG_COMMENT, text, NULL);
 297   } else {
 298     goto fail;
 299   }
 300
 301   free_tag_strings (fields);
 302   return TRUE;
 303
 304 fail:
 305   {
 306     GST_WARNING ("failed to parse COMM frame");
 307     free_tag_strings (fields);
 308     return FALSE;
 309   }
 310 }
 311
 312 static GArray *
 313 parse_text_identification_frame (ID3TagsWorking * work)
 314 {
 315   guchar encoding;
 316   GArray *fields = NULL;
 317
 318   if (work->parse_size < 2)
 319     return NULL;
 320
 321   encoding = work->parse_data[0];
 322   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 323       work->parse_size - 1, &fields);
 324   if (fields) {
 325     if (fields->len > 0) {
 326       GST_LOG ("Read %d fields from Text ID frame of size %d with encoding %d"
 327           ". First is '%s'", fields->len, work->parse_size - 1, encoding,
 328           g_array_index (fields, gchar *, 0));
 329     } else {
 330       GST_LOG ("Read 0 fields from Text ID frame of size %d with encoding %d",
 331           work->parse_size - 1, encoding);
 332     }
 333   }
 334
 335   return fields;
 336 }
 337
 338 static gboolean
 339 link_is_known_license (const gchar * url)
 340 {
 341   return g_str_has_prefix (url, "http://creativecommons.org/licenses/");
 342 }
 343
 344 static gchar *
 345 parse_url_link_frame (ID3TagsWorking * work, const gchar ** tag_name)
 346 {
 347   gsize len;
 348   gchar *nul, *data, *link;
 349
 350   *tag_name = NULL;
 351
 352   if (work->parse_size == 0)
 353     return NULL;
 354
 355   data = (gchar *) work->parse_data;
 356   /* if there's more data then the string is long, we only want to parse the
 357    * data up to the terminating zero to g_convert and ignore the rest, as
 358    * per spec */
 359   nul = memchr (data, '\0', work->parse_size);
 360   if (nul != NULL) {
 361     len = (gsize) (nul - data);
 362   } else {
 363     len = work->parse_size;
 364   }
 365
 366   link = g_convert (data, len, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
 367
 368   if (link == NULL || !gst_uri_is_valid (link)) {
 369     GST_DEBUG ("Invalid URI in %s frame: %s", work->frame_id,
 370         GST_STR_NULL (link));
 371     g_free (link);
 372     return NULL;
 373   }
 374
 375   /* we don't know if it's a link to a page that explains the copyright
 376    * situation, or a link that points to/represents a license, the ID3 spec
 377    * does not separate those two things; for now only put known license URIs
 378    * into GST_TAG_LICENSE_URI and everything else into GST_TAG_COPYRIGHT_URI */
 379   if (strcmp (work->frame_id, "WCOP") == 0) {
 380     if (link_is_known_license (link))
 381       *tag_name = GST_TAG_LICENSE_URI;
 382     else
 383       *tag_name = GST_TAG_COPYRIGHT_URI;
 384   } else if (strcmp (work->frame_id, "WOAF") == 0) {
 385     /* can't be bothered to create a CONTACT_URI tag for this, so let's just
 386      * put into into GST_TAG_CONTACT, which is where it ends up when reading
 387      * the info from vorbis comments as well */
 388     *tag_name = GST_TAG_CONTACT;
 389   }
 390
 391   return link;
 392 }
 393
 394
 395 static gchar *
 396 parse_user_text_identification_frame (ID3TagsWorking * work,
 397     const gchar ** tag_name)
 398 {
 399   gchar *ret;
 400   guchar encoding;
 401   GArray *fields = NULL;
 402
 403   *tag_name = NULL;
 404
 405   if (work->parse_size < 2)
 406     return NULL;
 407
 408   encoding = work->parse_data[0];
 409
 410   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 411       work->parse_size - 1, &fields);
 412
 413   if (fields == NULL)
 414     return NULL;
 415
 416   if (fields->len != 2) {
 417     GST_WARNING ("Expected 2 fields in TXXX frame, but got %d", fields->len);
 418     free_tag_strings (fields);
 419     return NULL;
 420   }
 421
 422   *tag_name =
 423       gst_tag_from_id3_user_tag ("TXXX", g_array_index (fields, gchar *, 0));
 424
 425   GST_LOG ("TXXX frame of size %d. Mapped descriptor '%s' to GStreamer tag %s",
 426       work->parse_size - 1, g_array_index (fields, gchar *, 0),
 427       GST_STR_NULL (*tag_name));
 428
 429   if (*tag_name) {
 430     ret = g_strdup (g_array_index (fields, gchar *, 1));
 431     /* GST_LOG ("%s = %s", *tag_name, GST_STR_NULL (ret)); */
 432   } else {
 433     ret = NULL;
 434   }
 435
 436   free_tag_strings (fields);
 437   return ret;
 438 }
 439
 440 static gboolean
 441 parse_id_string (ID3TagsWorking * work, gchar ** p_str, gint * p_len,
 442     gint * p_datalen)
 443 {
 444   gint len, datalen;
 445
 446   if (work->parse_size < 2)
 447     return FALSE;
 448
 449   for (len = 0; len < work->parse_size - 1; ++len) {
 450     if (work->parse_data[len] == '\0')
 451       break;
 452   }
 453
 454   datalen = work->parse_size - (len + 1);
 455   if (len == 0 || datalen <= 0)
 456     return FALSE;
 457
 458   *p_str = g_strndup ((gchar *) work->parse_data, len);
 459   *p_len = len;
 460   *p_datalen = datalen;
 461
 462   return TRUE;
 463 }
 464
 465 static gchar *
 466 parse_unique_file_identifier (ID3TagsWorking * work, const gchar ** tag_name)
 467 {
 468   gint len, datalen;
 469   gchar *owner_id, *data, *ret = NULL;
 470
 471   GST_LOG ("parsing UFID frame of size %d", work->parse_size);
 472
 473   if (!parse_id_string (work, &owner_id, &len, &datalen))
 474     return NULL;
 475
 476   data = (gchar *) work->parse_data + len + 1;
 477   GST_LOG ("UFID owner ID: %s (+ %d bytes of data)", owner_id, datalen);
 478
 479   if (strcmp (owner_id, "http://musicbrainz.org") == 0 &&
 480       g_utf8_validate (data, datalen, NULL)) {
 481     *tag_name = GST_TAG_MUSICBRAINZ_TRACKID;
 482     ret = g_strndup (data, datalen);
 483   } else {
 484     GST_INFO ("Unknown UFID owner ID: %s", owner_id);
 485   }
 486   g_free (owner_id);
 487
 488   return ret;
 489 }
 490
 491 /* parse data and return length of the next string in the given encoding,
 492  * including the NUL terminator */
 493 static gint
 494 scan_encoded_string (guint8 encoding, gchar * data, gint data_size)
 495 {
 496   gint i;
 497
 498   switch (encoding) {
 499     case ID3V2_ENCODING_ISO8859:
 500     case ID3V2_ENCODING_UTF8:
 501       for (i = 0; i < data_size; ++i) {
 502         if (data[i] == '\0')
 503           return i + 1;
 504       }
 505       break;
 506     case ID3V2_ENCODING_UTF16:
 507     case ID3V2_ENCODING_UTF16BE:
 508       /* we don't care about BOMs here and treat them as part of the string */
 509       /* Find '\0\0' terminator */
 510       for (i = 0; i < data_size - 1; i += 2) {
 511         if (data[i] == '\0' && data[i + 1] == '\0')
 512           return i + 2;
 513       }
 514       break;
 515     default:
 516       break;
 517   }
 518
 519   return 0;
 520 }
 521
 522 static gboolean
 523 parse_picture_frame (ID3TagsWorking * work)
 524 {
 525   guint8 txt_encoding, pic_type;
 526   gchar *mime_str = NULL;
 527   gint len, datalen;
 528
 529   GST_LOG ("APIC frame (ID3v2.%u)", ID3V2_VER_MAJOR (work->hdr.version));
 530
 531   if (work->parse_size < 1 + 1 + 1 + 1 + 1)
 532     goto not_enough_data;
 533
 534   txt_encoding = work->parse_data[0];
 535   ++work->parse_data;
 536   --work->parse_size;
 537
 538   /* Read image format; in early ID3v2 versions this is a fixed-length
 539    * 3-character string without terminator; in later versions (>= 2.3.0)
 540    * this is a NUL-terminated string of variable length */
 541   if (ID3V2_VER_MAJOR (work->hdr.version) < 3) {
 542     if (work->parse_size < 3)
 543       goto not_enough_data;
 544
 545     mime_str = g_strndup ((gchar *) work->parse_data, 3);
 546     len = 3;
 547   } else {
 548     if (!parse_id_string (work, &mime_str, &len, &datalen))
 549       return FALSE;
 550     ++len;                      /* for string terminator */
 551   }
 552
 553   if (work->parse_size < len + 1 + 1 + 1)
 554     goto not_enough_data;
 555
 556   work->parse_data += len;
 557   work->parse_size -= len;
 558
 559   /* Read image type */
 560   pic_type = work->parse_data[0];
 561   ++work->parse_data;
 562   --work->parse_size;
 563
 564   GST_LOG ("APIC frame mime type    : %s", GST_STR_NULL (mime_str));
 565   GST_LOG ("APIC frame picture type : 0x%02x", (guint) pic_type);
 566
 567   if (work->parse_size < 1 + 1)
 568     goto not_enough_data;
 569
 570   len = scan_encoded_string (txt_encoding, (gchar *) work->parse_data,
 571       work->parse_size);
 572
 573   if (len < 1)
 574     goto error;
 575
 576   /* just skip the description string ... */
 577   GST_LOG ("Skipping description string (%d bytes in original coding)", len);
 578
 579   if (work->parse_size < len + 1)
 580     goto not_enough_data;
 581
 582   work->parse_data += len;
 583   work->parse_size -= len;
 584
 585   GST_DEBUG ("image data is %u bytes", work->parse_size);
 586
 587   if (work->parse_size <= 0)
 588     goto not_enough_data;
 589
 590   if (!gst_tag_list_add_id3_image (work->tags, (guint8 *) work->parse_data,
 591           work->parse_size, pic_type)) {
 592     goto error;
 593   }
 594
 595   g_free (mime_str);
 596   return TRUE;
 597
 598 not_enough_data:
 599   {
 600     GST_DEBUG ("not enough data, skipping APIC frame");
 601     /* fall through to error */
 602   }
 603 error:
 604   {
 605     GST_DEBUG ("problem parsing APIC frame, skipping");
 606     g_free (mime_str);
 607     return FALSE;
 608   }
 609 }
 610
 611 #define ID3V2_RVA2_CHANNEL_MASTER  1
 612
 613 static gboolean
 614 parse_relative_volume_adjustment_two (ID3TagsWorking * work)
 615 {
 616   const gchar *gain_tag_name = NULL;
 617   const gchar *peak_tag_name = NULL;
 618   gdouble gain_dB, peak_val;
 619   guint64 peak;
 620   guint8 *data, chan, peak_bits;
 621   gchar *id;
 622   gint len, datalen, i;
 623
 624   if (!parse_id_string (work, &id, &len, &datalen))
 625     return FALSE;
 626
 627   if (datalen < (1 + 2 + 1)) {
 628     GST_WARNING ("broken RVA2 frame, data size only %d bytes", datalen);
 629     g_free (id);
 630     return FALSE;
 631   }
 632
 633   data = work->parse_data + len + 1;
 634   chan = GST_READ_UINT8 (data);
 635   gain_dB = (gdouble) ((gint16) GST_READ_UINT16_BE (data + 1)) / 512.0;
 636   /* The meaning of the peak value is not defined in the ID3v2 spec. However,
 637    * the first/only implementation of this seems to have been in XMMS, and
 638    * other libs (like mutagen) seem to follow that implementation as well:
 639    * see http://bugs.xmms.org/attachment.cgi?id=113&action=view */
 640   peak_bits = GST_READ_UINT8 (data + 1 + 2);
 641   if (peak_bits > 64) {
 642     GST_WARNING ("silly peak precision of %d bits, ignoring", (gint) peak_bits);
 643     peak_bits = 0;
 644   }
 645   data += 1 + 2 + 1;
 646   datalen -= 1 + 2 + 1;
 647   if (peak_bits == 16) {
 648     peak = GST_READ_UINT16_BE (data);
 649   } else {
 650     peak = 0;
 651     for (i = 0; i < (GST_ROUND_UP_8 (peak_bits) / 8) && datalen > 0; ++i) {
 652       peak = peak << 8;
 653       peak |= GST_READ_UINT8 (data);
 654       ++data;
 655       --datalen;
 656     }
 657   }
 658
 659   peak = peak << (64 - GST_ROUND_UP_8 (peak_bits));
 660   peak_val =
 661       gst_guint64_to_gdouble (peak) / gst_util_guint64_to_gdouble (G_MAXINT64);
 662   GST_LOG ("RVA2 frame: id=%s, chan=%u, adj=%.2fdB, peak_bits=%u, peak=%.2f",
 663       id, chan, gain_dB, (guint) peak_bits, peak_val);
 664
 665   if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "track") == 0) {
 666     gain_tag_name = GST_TAG_TRACK_GAIN;
 667     peak_tag_name = GST_TAG_TRACK_PEAK;
 668   } else if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "album") == 0) {
 669     gain_tag_name = GST_TAG_ALBUM_GAIN;
 670     peak_tag_name = GST_TAG_ALBUM_PEAK;
 671   } else {
 672     GST_INFO ("Unhandled RVA2 frame id '%s' for channel %d", id, chan);
 673   }
 674
 675   if (gain_tag_name) {
 676     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 677         gain_tag_name, gain_dB, NULL);
 678   }
 679   if (peak_tag_name && peak_bits > 0) {
 680     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 681         peak_tag_name, peak_val, NULL);
 682   }
 683
 684   g_free (id);
 685
 686   return (gain_tag_name != NULL || peak_tag_name != NULL);
 687 }
 688
 689 static void
 690 parse_obsolete_tdat_frame (ID3TagsWorking * work)
 691 {
 692   if (work->parse_size >= 5 &&
 693       work->parse_data[0] == ID3V2_ENCODING_ISO8859 &&
 694       g_ascii_isdigit (work->parse_data[1]) &&
 695       g_ascii_isdigit (work->parse_data[2]) &&
 696       g_ascii_isdigit (work->parse_data[3]) &&
 697       g_ascii_isdigit (work->parse_data[4])) {
 698     work->pending_day = (10 * g_ascii_digit_value (work->parse_data[1])) +
 699         g_ascii_digit_value (work->parse_data[2]);
 700     work->pending_month = (10 * g_ascii_digit_value (work->parse_data[3])) +
 701         g_ascii_digit_value (work->parse_data[4]);
 702     GST_LOG ("date (dd/mm) %02u/%02u", work->pending_day, work->pending_month);
 703   }
 704 }
 705
 706 static gboolean
 707 id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 708     const gchar * tag_str)
 709 {
 710   GType tag_type = gst_tag_get_type (tag_name);
 711   GstTagList *tag_list = work->tags;
 712
 713   if (tag_str == NULL)
 714     return FALSE;
 715
 716   switch (tag_type) {
 717     case G_TYPE_UINT:
 718     {
 719       gint current, total;
 720
 721       if (sscanf (tag_str, "%d/%d", &current, &total) == 2) {
 722         if (total <= 0) {
 723           GST_WARNING ("Ignoring invalid value for total %d in tag %s",
 724               total, tag_name);
 725         } else {
 726           if (strcmp (tag_name, GST_TAG_TRACK_NUMBER) == 0) {
 727             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 728                 GST_TAG_TRACK_COUNT, total, NULL);
 729           } else if (strcmp (tag_name, GST_TAG_ALBUM_VOLUME_NUMBER) == 0) {
 730             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 731                 GST_TAG_ALBUM_VOLUME_COUNT, total, NULL);
 732           }
 733         }
 734       } else if (sscanf (tag_str, "%d", &current) != 1) {
 735         /* Not an integer in the string */
 736         GST_WARNING ("Tag string for tag %s does not contain an integer - "
 737             "ignoring", tag_name);
 738         break;
 739       }
 740
 741       if (current <= 0) {
 742         GST_WARNING ("Ignoring invalid value %d in tag %s", current, tag_name);
 743       } else {
 744         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND, tag_name, current,
 745             NULL);
 746       }
 747       break;
 748     }
 749     case G_TYPE_UINT64:
 750     {
 751       guint64 tmp;
 752
 753       g_assert (strcmp (tag_name, GST_TAG_DURATION) == 0);
 754       tmp = strtoul (tag_str, NULL, 10);
 755       if (tmp == 0) {
 756         break;
 757       }
 758       gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 759           GST_TAG_DURATION, tmp * 1000 * 1000, NULL);
 760       break;
 761     }
 762     case G_TYPE_STRING:{
 763       const GValue *val;
 764       guint i, num;
 765
 766       /* make sure we add each unique string only once per tag, we don't want
 767        * to have the same genre in the genre list multiple times, for example,
 768        * or the same DiscID in there twice just because it's contained in the
 769        * tag multiple times under different TXXX user tags */
 770       num = gst_tag_list_get_tag_size (tag_list, tag_name);
 771       for (i = 0; i < num; ++i) {
 772         val = gst_tag_list_get_value_index (tag_list, tag_name, i);
 773         if (val != NULL && strcmp (g_value_get_string (val), tag_str) == 0)
 774           break;
 775       }
 776       if (i == num) {
 777         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 778             tag_name, tag_str, NULL);
 779       }
 780       break;
 781     }
 782
 783     default:{
 784       gchar *tmp = NULL;
 785       GValue src = { 0, };
 786       GValue dest = { 0, };
 787
 788       /* Ensure that any date string is complete */
 789       if (tag_type == GST_TYPE_DATE) {
 790         guint year = 1901, month = 1, day = 1;
 791
 792         /* Dates can be yyyy-MM-dd, yyyy-MM or yyyy, but we need
 793          * the first type */
 794         if (sscanf (tag_str, "%04u-%02u-%02u", &year, &month, &day) == 0)
 795           break;
 796
 797         tmp = g_strdup_printf ("%04u-%02u-%02u", year, month, day);
 798         tag_str = tmp;
 799       }
 800
 801       /* handles anything else */
 802       g_value_init (&src, G_TYPE_STRING);
 803       g_value_set_string (&src, (const gchar *) tag_str);
 804       g_value_init (&dest, tag_type);
 805
 806       if (g_value_transform (&src, &dest)) {
 807         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_APPEND,
 808             tag_name, &dest, NULL);
 809       } else if (tag_type == G_TYPE_DOUBLE) {
 810         /* replaygain tags in TXXX frames ... */
 811         g_value_set_double (&dest, g_strtod (tag_str, NULL));
 812         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_KEEP,
 813             tag_name, &dest, NULL);
 814         GST_LOG ("Converted string '%s' to double %f", tag_str,
 815             g_value_get_double (&dest));
 816       } else {
 817         GST_WARNING ("Failed to transform tag from string to type '%s'",
 818             g_type_name (tag_type));
 819       }
 820
 821       g_value_unset (&src);
 822       g_value_unset (&dest);
 823       g_free (tmp);
 824       break;
 825     }
 826   }
 827
 828   return TRUE;
 829 }
 830
 831 /* Check that an array of characters contains only digits */
 832 static gboolean
 833 id3v2_are_digits (const gchar * chars, gint size)
 834 {
 835   gint i;
 836
 837   for (i = 0; i < size; i++) {
 838     if (!g_ascii_isdigit (chars[i]))
 839       return FALSE;
 840   }
 841   return TRUE;
 842 }
 843
 844 static gboolean
 845 id3v2_genre_string_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 846     const gchar * tag_str, gint len)
 847 {
 848   g_return_val_if_fail (tag_str != NULL, FALSE);
 849
 850   /* If it's a number, it might be a defined genre */
 851   if (id3v2_are_digits (tag_str, len)) {
 852     tag_str = gst_tag_id3_genre_get (strtol (tag_str, NULL, 10));
 853     return id3v2_tag_to_taglist (work, tag_name, tag_str);
 854   }
 855   /* Otherwise it might be "RX" or "CR" */
 856   if (len == 2) {
 857     if (g_ascii_strncasecmp ("rx", tag_str, len) == 0)
 858       return id3v2_tag_to_taglist (work, tag_name, "Remix");
 859
 860     if (g_ascii_strncasecmp ("cr", tag_str, len) == 0)
 861       return id3v2_tag_to_taglist (work, tag_name, "Cover");
 862   }
 863
 864   /* Otherwise it's a string */
 865   return id3v2_tag_to_taglist (work, tag_name, tag_str);
 866 }
 867
 868 static gboolean
 869 id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 870     GArray * tag_fields)
 871 {
 872   gchar *tag_str = NULL;
 873   gboolean result = FALSE;
 874   gint i;
 875
 876   for (i = 0; i < tag_fields->len; i++) {
 877     gint len;
 878
 879     tag_str = g_array_index (tag_fields, gchar *, i);
 880     if (tag_str == NULL)
 881       continue;
 882
 883     len = strlen (tag_str);
 884     /* Only supposed to see '(n)' type numeric genre strings in ID3 <= 2.3.0
 885      * but apparently we see them in 2.4.0 sometimes too */
 886     if (TRUE || work->hdr.version <= 0x300) {   /* <= 2.3.0 */
 887       /* Check for genre numbers wrapped in parentheses, possibly
 888        * followed by a string */
 889       while (len >= 2) {
 890         gint pos;
 891         gboolean found = FALSE;
 892
 893         /* Double parenthesis ends the numeric genres, but we need
 894          * to swallow the first one so we actually output '(' */
 895         if (tag_str[0] == '(' && tag_str[1] == '(') {
 896           tag_str++;
 897           len--;
 898           break;
 899         }
 900
 901         /* If the first char is not a parenthesis, then stop
 902          * looking for parenthesised genre strings */
 903         if (tag_str[0] != '(')
 904           break;
 905
 906         for (pos = 1; pos < len; pos++) {
 907           if (tag_str[pos] == ')') {
 908             gchar *tmp_str;
 909
 910             tmp_str = g_strndup (tag_str + 1, pos - 1);
 911             result |=
 912                 id3v2_genre_string_to_taglist (work, tag_name, tmp_str,
 913                 pos - 1);
 914             g_free (tmp_str);
 915             tag_str += pos + 1;
 916             len -= pos + 1;
 917             found = TRUE;
 918             break;
 919           }
 920
 921           /* If we encounter a non-digit while searching for a closing
 922            * parenthesis, we should not try and interpret this as a
 923            * numeric genre string */
 924           if (!g_ascii_isdigit (tag_str[pos]))
 925             break;
 926         }
 927         if (!found)
 928           break;                /* There was no closing parenthesis */
 929       }
 930     }
 931
 932     if (len > 0 && tag_str != NULL)
 933       result |= id3v2_genre_string_to_taglist (work, tag_name, tag_str, len);
 934   }
 935   return result;
 936 }
 937
/* Character set names handed to g_convert() as the source encoding of
 * UTF-16 text. Code in this file tells them apart by comparing the array
 * addresses (e.g. in_encode == utf16beenc), not the string contents. */
static const gchar utf16enc[] = "UTF-16";
static const gchar utf16leenc[] = "UTF-16LE";
static const gchar utf16beenc[] = "UTF-16BE";
 941
 942 static gboolean
 943 find_utf16_bom (gchar * data, const gchar ** p_in_encoding)
 944 {
 945   guint16 marker = (GST_READ_UINT8 (data) << 8) | GST_READ_UINT8 (data + 1);
 946
 947   switch (marker) {
 948     case 0xFFFE:
 949       *p_in_encoding = utf16leenc;
 950       return TRUE;
 951     case 0xFEFF:
 952       *p_in_encoding = utf16beenc;
 953       return TRUE;
 954     default:
 955       break;
 956   }
 957   return FALSE;
 958 }
 959
 960 static void *
 961 string_utf8_dup (const gchar * start, const guint size)
 962 {
 963   const gchar *env;
 964   gsize bytes_read;
 965   gchar *utf8;
 966
 967   /* Should we try the charsets specified
 968    * via environment variables FIRST ? */
 969   if (g_utf8_validate (start, size, NULL)) {
 970     utf8 = g_strndup (start, size);
 971     goto beach;
 972   }
 973
 974   env = g_getenv ("GST_ID3V1_TAG_ENCODING");
 975   if (!env || *env == '\0')
 976     env = g_getenv ("GST_ID3_TAG_ENCODING");
 977   if (!env || *env == '\0')
 978     env = g_getenv ("GST_TAG_ENCODING");
 979
 980   /* Try charsets specified via the environment */
 981   if (env && *env != '\0') {
 982     gchar **c, **csets;
 983
 984     csets = g_strsplit (env, G_SEARCHPATH_SEPARATOR_S, -1);
 985
 986     for (c = csets; c && *c; ++c) {
 987       if ((utf8 =
 988               g_convert (start, size, "UTF-8", *c, &bytes_read, NULL, NULL))) {
 989         if (bytes_read == size) {
 990           GST_DEBUG ("Using charset %s to interperate id3 tags\n", *c);
 991           g_strfreev (csets);
 992           goto beach;
 993         }
 994         g_free (utf8);
 995         utf8 = NULL;
 996       }
 997     }
 998   }
 999   /* Try current locale (if not UTF-8) */
1000   if (!g_get_charset (&env)) {
1001     if ((utf8 = g_locale_to_utf8 (start, size, &bytes_read, NULL, NULL))) {
1002       if (bytes_read == size) {
1003         goto beach;
1004       }
1005       g_free (utf8);
1006       utf8 = NULL;
1007     }
1008   }
1009
1010   /* Try ISO-8859-1 */
1011   utf8 =
1012       g_convert (start, size, "UTF-8", "ISO-8859-1", &bytes_read, NULL, NULL);
1013   if (utf8 != NULL && bytes_read == size) {
1014     goto beach;
1015   }
1016
1017   g_free (utf8);
1018   return NULL;
1019
1020 beach:
1021
1022   g_strchomp (utf8);
1023
1024   return (utf8);
1025 }
1026
1027 static void
1028 parse_insert_string_field (guint8 encoding, gchar * data, gint data_size,
1029     GArray * fields)
1030 {
1031   gchar *field = NULL;
1032
1033   switch (encoding) {
1034     case ID3V2_ENCODING_UTF16:
1035     case ID3V2_ENCODING_UTF16BE:
1036     {
1037       const gchar *in_encode;
1038
1039       if (encoding == ID3V2_ENCODING_UTF16)
1040         in_encode = utf16enc;
1041       else
1042         in_encode = utf16beenc;
1043
1044       /* Sometimes we see strings with multiple BOM markers at the start.
1045        * In that case, we assume the innermost one is correct. If that fails
1046        * to produce valid UTF-8, we try the other endianness anyway */
1047       while (data_size > 2 && find_utf16_bom (data, &in_encode)) {
1048         data += 2;              /* skip BOM */
1049         data_size -= 2;
1050       }
1051
1052       field = g_convert (data, data_size, "UTF-8", in_encode, NULL, NULL, NULL);
1053
1054       if (field == NULL || g_utf8_validate (field, -1, NULL) == FALSE) {
1055         /* As a fallback, try interpreting UTF-16 in the other endianness */
1056         if (in_encode == utf16beenc)
1057           field = g_convert (data, data_size, "UTF-8", utf16leenc,
1058               NULL, NULL, NULL);
1059       }
1060     }
1061
1062       break;
1063     case ID3V2_ENCODING_ISO8859:
1064       if (g_utf8_validate (data, data_size, NULL))
1065         field = g_strndup (data, data_size);
1066       else
1067         /* field = g_convert (data, data_size, "UTF-8", "ISO-8859-1",
1068            NULL, NULL, NULL); */
1069         field = string_utf8_dup (data, data_size);
1070       break;
1071     default:
1072       field = g_strndup (data, data_size);
1073       break;
1074   }
1075
1076   if (field) {
1077     if (g_utf8_validate (field, -1, NULL)) {
1078       g_array_append_val (fields, field);
1079       return;
1080     }
1081
1082     GST_DEBUG ("%s was bad UTF-8 after conversion from encoding %d. Ignoring",
1083         field, encoding);
1084     g_free (field);
1085   }
1086 }
1087
1088 static void
1089 parse_split_strings (guint8 encoding, gchar * data, gint data_size,
1090     GArray ** out_fields)
1091 {
1092   GArray *fields = g_array_new (FALSE, TRUE, sizeof (gchar *));
1093   gint text_pos;
1094   gint prev = 0;
1095
1096   g_return_if_fail (out_fields != NULL);
1097
1098   switch (encoding) {
1099     case ID3V2_ENCODING_ISO8859:
1100       for (text_pos = 0; text_pos < data_size; text_pos++) {
1101         if (data[text_pos] == 0) {
1102           parse_insert_string_field (encoding, data + prev,
1103               text_pos - prev + 1, fields);
1104           prev = text_pos + 1;
1105         }
1106       }
1107       if (data_size - prev > 0 && data[prev] != 0x00) {
1108         parse_insert_string_field (encoding, data + prev,
1109             data_size - prev, fields);
1110       }
1111
1112       break;
1113     case ID3V2_ENCODING_UTF8:
1114       for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
1115         if (data[text_pos] == '\0') {
1116           parse_insert_string_field (encoding, data + prev,
1117               text_pos - prev + 1, fields);
1118           prev = text_pos + 1;
1119         }
1120       }
1121       if (data_size - prev > 0 && data[prev] != 0x00) {
1122         parse_insert_string_field (encoding, data + prev,
1123             data_size - prev, fields);
1124       }
1125       break;
1126     case ID3V2_ENCODING_UTF16:
1127     case ID3V2_ENCODING_UTF16BE:
1128     {
1129       /* Find '\0\0' terminator */
1130       for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
1131         if (data[text_pos] == '\0' && data[text_pos + 1] == '\0') {
1132           /* found a delimiter */
1133           parse_insert_string_field (encoding, data + prev,
1134               text_pos - prev + 2, fields);
1135           text_pos++;           /* Advance to the 2nd NULL terminator */
1136           prev = text_pos + 1;
1137           break;
1138         }
1139       }
1140       if (data_size - prev > 1 &&
1141           (data[prev] != 0x00 || data[prev + 1] != 0x00)) {
1142         /* There were 2 or more non-null chars left, convert those too */
1143         parse_insert_string_field (encoding, data + prev,
1144             data_size - prev, fields);
1145       }
1146       break;
1147     }
1148   }
1149   if (fields->len > 0)
1150     *out_fields = fields;
1151   else
1152     g_array_free (fields, TRUE);
1153 }
1154
1155 static void
1156 free_tag_strings (GArray * fields)
1157 {
1158   if (fields) {
1159     gint i;
1160     gchar *c;
1161
1162     for (i = 0; i < fields->len; i++) {
1163       c = g_array_index (fields, gchar *, i);
1164       g_free (c);
1165     }
1166     g_array_free (fields, TRUE);
1167   }
1168 }