gst/id3demux/id3v2frames.c

   1 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: t; c-basic-offset: 2 -*- */
   2 /* Copyright 2006-2008 Tim-Philipp Müller <tim centricular net>
   3  * Copyright 2005 Jan Schmidt <thaytan@mad.scientist.com>
   4  * Copyright 2002,2003 Scott Wheeler <wheeler@kde.org> (portions from taglib)
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Library General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Library General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Library General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include "config.h"
  24 #endif
  25
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <gst/tag/tag.h>
  30 #include <gst/base/gsttypefindhelper.h>
  31
  32 #ifdef HAVE_ZLIB
  33 #include <zlib.h>
  34 #endif
  35
  36 #include "id3tags.h"
  37
  38 GST_DEBUG_CATEGORY_EXTERN (id3demux_debug);
  39 #define GST_CAT_DEFAULT (id3demux_debug)
  40
  41 static gboolean parse_comment_frame (ID3TagsWorking * work);
  42 static gchar *parse_url_link_frame (ID3TagsWorking * work,
  43     const gchar ** tag_name);
  44 static GArray *parse_text_identification_frame (ID3TagsWorking * work);
  45 static gchar *parse_user_text_identification_frame (ID3TagsWorking * work,
  46     const gchar ** tag_name);
  47 static gchar *parse_unique_file_identifier (ID3TagsWorking * work,
  48     const gchar ** tag_name);
  49 static gboolean parse_relative_volume_adjustment_two (ID3TagsWorking * work);
  50 static void parse_obsolete_tdat_frame (ID3TagsWorking * work);
  51 static gboolean id3v2_tag_to_taglist (ID3TagsWorking * work,
  52     const gchar * tag_name, const gchar * tag_str);
  53 /* Parse a single string into an array of gchar* */
  54 static void parse_split_strings (guint8 encoding, gchar * data, gint data_size,
  55     GArray ** out_fields);
  56 static void free_tag_strings (GArray * fields);
  57 static gboolean
  58 id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
  59     GArray * tag_fields);
  60 static gboolean parse_picture_frame (ID3TagsWorking * work);
  61
  62 #define ID3V2_ENCODING_ISO8859 0x00
  63 #define ID3V2_ENCODING_UTF16   0x01
  64 #define ID3V2_ENCODING_UTF16BE 0x02
  65 #define ID3V2_ENCODING_UTF8    0x03
  66
  67 gboolean
  68 id3demux_id3v2_parse_frame (ID3TagsWorking * work)
  69 {
  70   const gchar *tag_name;
  71   gboolean result = FALSE;
  72   gint i;
  73   guint8 *frame_data = work->hdr.frame_data;
  74   guint frame_data_size = work->cur_frame_size;
  75   gchar *tag_str = NULL;
  76   GArray *tag_fields = NULL;
  77   guint8 *uu_data = NULL;
  78
  79 #ifdef HAVE_ZLIB
  80   guint8 *uncompressed_data = NULL;
  81 #endif
  82
  83   /* Check that the frame id is valid */
  84   for (i = 0; i < 5 && work->frame_id[i] != '\0'; i++) {
  85     if (!g_ascii_isalnum (work->frame_id[i])) {
  86       GST_DEBUG ("Encountered invalid frame_id");
  87       return FALSE;
  88     }
  89   }
  90
  91   /* Can't handle encrypted frames right now (in case we ever do, we'll have
  92    * to do the decryption after the un-unsynchronisation and decompression,
  93    * not here) */
  94   if (work->frame_flags & ID3V2_FRAME_FORMAT_ENCRYPTION) {
  95     GST_WARNING ("Encrypted frames are not supported");
  96     return FALSE;
  97   }
  98
  99   tag_name = gst_tag_from_id3_tag (work->frame_id);
 100   if (tag_name == NULL &&
 101       strncmp (work->frame_id, "RVA2", 4) != 0 &&
 102       strncmp (work->frame_id, "TXXX", 4) != 0 &&
 103       strncmp (work->frame_id, "TDAT", 4) != 0 &&
 104       strncmp (work->frame_id, "UFID", 4) != 0) {
 105     return FALSE;
 106   }
 107
 108   if (work->frame_flags & (ID3V2_FRAME_FORMAT_COMPRESSION |
 109           ID3V2_FRAME_FORMAT_DATA_LENGTH_INDICATOR)) {
 110     if (work->hdr.frame_data_size <= 4)
 111       return FALSE;
 112     if (ID3V2_VER_MAJOR (work->hdr.version) == 3) {
 113       work->parse_size = GST_READ_UINT32_BE (frame_data);
 114     } else {
 115       work->parse_size = read_synch_uint (frame_data, 4);
 116     }
 117     frame_data += 4;
 118     frame_data_size -= 4;
 119     GST_LOG ("Un-unsynced data size %d (of %d)", work->parse_size,
 120         frame_data_size);
 121     if (work->parse_size > frame_data_size) {
 122       GST_WARNING ("ID3v2 frame %s data has invalid size %d (>%d)",
 123           work->frame_id, work->parse_size, frame_data_size);
 124       return FALSE;
 125     }
 126   }
 127
 128   /* in v2.3 the frame sizes are not syncsafe, so the entire tag had to be
 129    * unsynced. In v2.4 the frame sizes are syncsafe so it's just the frame
 130    * data that needs un-unsyncing, but not the frame headers. */
 131   if (ID3V2_VER_MAJOR (work->hdr.version) == 4) {
 132     if ((work->hdr.flags & ID3V2_HDR_FLAG_UNSYNC) != 0 ||
 133         ((work->frame_flags & ID3V2_FRAME_FORMAT_UNSYNCHRONISATION) != 0)) {
 134       GST_DEBUG ("Un-unsyncing frame %s", work->frame_id);
 135       uu_data = id3demux_ununsync_data (frame_data, &frame_data_size);
 136       frame_data = uu_data;
 137       GST_MEMDUMP ("ID3v2 frame (un-unsyced)", frame_data, frame_data_size);
 138     }
 139   }
 140
 141   work->parse_size = frame_data_size;
 142
 143   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 144 #ifdef HAVE_ZLIB
 145     uLongf destSize = work->parse_size;
 146     Bytef *dest, *src;
 147
 148     uncompressed_data = g_malloc (work->parse_size);
 149
 150     dest = (Bytef *) uncompressed_data;
 151     src = (Bytef *) frame_data;
 152
 153     if (uncompress (dest, &destSize, src, frame_data_size) != Z_OK) {
 154       g_free (uncompressed_data);
 155       g_free (uu_data);
 156       return FALSE;
 157     }
 158     if (destSize != work->parse_size) {
 159       GST_WARNING
 160           ("Decompressing ID3v2 frame %s did not produce expected size %d bytes (got %lu)",
 161           tag_name, work->parse_size, destSize);
 162       g_free (uncompressed_data);
 163       g_free (uu_data);
 164       return FALSE;
 165     }
 166     work->parse_data = uncompressed_data;
 167 #else
 168     GST_WARNING ("Compressed ID3v2 tag frame could not be decompressed"
 169         " because gstid3demux was compiled without zlib support");
 170     g_free (uu_data);
 171     return FALSE;
 172 #endif
 173   } else {
 174     work->parse_data = frame_data;
 175   }
 176
 177   if (work->frame_id[0] == 'T') {
 178     if (strcmp (work->frame_id, "TDAT") == 0) {
 179       parse_obsolete_tdat_frame (work);
 180       result = TRUE;
 181     } else if (strcmp (work->frame_id, "TXXX") == 0) {
 182       /* Handle user text frame */
 183       tag_str = parse_user_text_identification_frame (work, &tag_name);
 184     } else {
 185       /* Text identification frame */
 186       tag_fields = parse_text_identification_frame (work);
 187     }
 188   } else if (work->frame_id[0] == 'W' && strcmp (work->frame_id, "WXXX") != 0) {
 189     /* URL link frame: ISO-8859-1 encoded, one frame per tag */
 190     tag_str = parse_url_link_frame (work, &tag_name);
 191   } else if (!strcmp (work->frame_id, "COMM")) {
 192     /* Comment */
 193     result = parse_comment_frame (work);
 194   } else if (!strcmp (work->frame_id, "APIC")) {
 195     /* Attached picture */
 196     result = parse_picture_frame (work);
 197   } else if (!strcmp (work->frame_id, "RVA2")) {
 198     /* Relative volume */
 199     result = parse_relative_volume_adjustment_two (work);
 200   } else if (!strcmp (work->frame_id, "UFID")) {
 201     /* Unique file identifier */
 202     tag_str = parse_unique_file_identifier (work, &tag_name);
 203   }
 204 #ifdef HAVE_ZLIB
 205   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 206     g_free (uncompressed_data);
 207     uncompressed_data = NULL;
 208     work->parse_data = frame_data;
 209   }
 210 #endif
 211
 212   if (tag_str != NULL) {
 213     /* g_print ("Tag %s value %s\n", tag_name, tag_str); */
 214     result = id3v2_tag_to_taglist (work, tag_name, tag_str);
 215     g_free (tag_str);
 216   }
 217   if (tag_fields != NULL) {
 218     if (strcmp (work->frame_id, "TCON") == 0) {
 219       /* Genre strings need special treatment */
 220       result |= id3v2_genre_fields_to_taglist (work, tag_name, tag_fields);
 221     } else {
 222       gint t;
 223
 224       for (t = 0; t < tag_fields->len; t++) {
 225         tag_str = g_array_index (tag_fields, gchar *, t);
 226         if (tag_str != NULL && tag_str[0] != '\0')
 227           result |= id3v2_tag_to_taglist (work, tag_name, tag_str);
 228       }
 229     }
 230     free_tag_strings (tag_fields);
 231   }
 232
 233   g_free (uu_data);
 234
 235   return result;
 236 }
 237
 238 static gboolean
 239 parse_comment_frame (ID3TagsWorking * work)
 240 {
 241   guint dummy;
 242   guint8 encoding;
 243   gchar language[4];
 244   GArray *fields = NULL;
 245   gchar *description, *text;
 246
 247   if (work->parse_size < 6)
 248     return FALSE;
 249
 250   encoding = work->parse_data[0];
 251   language[0] = g_ascii_tolower (work->parse_data[1]);
 252   language[1] = g_ascii_tolower (work->parse_data[2]);
 253   language[2] = g_ascii_tolower (work->parse_data[3]);
 254   language[3] = '\0';
 255
 256   parse_split_strings (encoding, (gchar *) work->parse_data + 4,
 257       work->parse_size - 4, &fields);
 258
 259   if (fields == NULL || fields->len < 2) {
 260     GST_WARNING ("Failed to decode comment frame");
 261     goto fail;
 262   }
 263   description = g_array_index (fields, gchar *, 0);
 264   text = g_array_index (fields, gchar *, 1);
 265
 266   if (!g_utf8_validate (text, -1, NULL)) {
 267     GST_WARNING ("Converted string is not valid utf-8");
 268     goto fail;
 269   }
 270
 271   /* skip our own dummy descriptions (from id3v2mux) */
 272   if (strlen (description) > 0 && g_utf8_validate (description, -1, NULL) &&
 273       sscanf (description, "c%u", &dummy) != 1) {
 274     gchar *s;
 275
 276     /* must be either an ISO-639-1 or ISO-639-2 language code */
 277     if (language[0] != '\0' &&
 278         g_ascii_isalpha (language[0]) &&
 279         g_ascii_isalpha (language[1]) &&
 280         (g_ascii_isalpha (language[2]) || language[2] == '\0')) {
 281       const gchar *lang_code;
 282
 283       /* prefer two-letter ISO 639-1 code if we have a mapping */
 284       lang_code = gst_tag_get_language_code (language);
 285       s = g_strdup_printf ("%s[%s]=%s", description,
 286           (lang_code) ? lang_code : language, text);
 287     } else {
 288       s = g_strdup_printf ("%s=%s", description, text);
 289     }
 290     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 291         GST_TAG_EXTENDED_COMMENT, s, NULL);
 292     g_free (s);
 293   } else if (text != NULL && *text != '\0') {
 294     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 295         GST_TAG_COMMENT, text, NULL);
 296   } else {
 297     goto fail;
 298   }
 299
 300   free_tag_strings (fields);
 301   return TRUE;
 302
 303 fail:
 304   {
 305     GST_WARNING ("failed to parse COMM frame");
 306     free_tag_strings (fields);
 307     return FALSE;
 308   }
 309 }
 310
 311 static GArray *
 312 parse_text_identification_frame (ID3TagsWorking * work)
 313 {
 314   guchar encoding;
 315   GArray *fields = NULL;
 316
 317   if (work->parse_size < 2)
 318     return NULL;
 319
 320   encoding = work->parse_data[0];
 321   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 322       work->parse_size - 1, &fields);
 323   if (fields) {
 324     if (fields->len > 0) {
 325       GST_LOG ("Read %d fields from Text ID frame of size %d with encoding %d"
 326           ". First is '%s'", fields->len, work->parse_size - 1, encoding,
 327           g_array_index (fields, gchar *, 0));
 328     } else {
 329       GST_LOG ("Read 0 fields from Text ID frame of size %d with encoding %d",
 330           work->parse_size - 1, encoding);
 331     }
 332   }
 333
 334   return fields;
 335 }
 336
 337 static gboolean
 338 link_is_known_license (const gchar * url)
 339 {
 340   return g_str_has_prefix (url, "http://creativecommons.org/licenses/");
 341 }
 342
 343 static gchar *
 344 parse_url_link_frame (ID3TagsWorking * work, const gchar ** tag_name)
 345 {
 346   gsize len;
 347   gchar *nul, *data, *link;
 348
 349   *tag_name = NULL;
 350
 351   if (work->parse_size == 0)
 352     return NULL;
 353
 354   data = (gchar *) work->parse_data;
 355   /* if there's more data then the string is long, we only want to parse the
 356    * data up to the terminating zero to g_convert and ignore the rest, as
 357    * per spec */
 358   nul = memchr (data, '\0', work->parse_size);
 359   if (nul != NULL) {
 360     len = (gsize) (nul - data);
 361   } else {
 362     len = work->parse_size;
 363   }
 364
 365   link = g_convert (data, len, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
 366
 367   if (link == NULL || !gst_uri_is_valid (link)) {
 368     GST_DEBUG ("Invalid URI in %s frame: %s", work->frame_id,
 369         GST_STR_NULL (link));
 370     g_free (link);
 371     return NULL;
 372   }
 373
 374   /* we don't know if it's a link to a page that explains the copyright
 375    * situation, or a link that points to/represents a license, the ID3 spec
 376    * does not separate those two things; for now only put known license URIs
 377    * into GST_TAG_LICENSE_URI and everything else into GST_TAG_COPYRIGHT_URI */
 378   if (strcmp (work->frame_id, "WCOP") == 0) {
 379     if (link_is_known_license (link))
 380       *tag_name = GST_TAG_LICENSE_URI;
 381     else
 382       *tag_name = GST_TAG_COPYRIGHT_URI;
 383   } else if (strcmp (work->frame_id, "WOAF") == 0) {
 384     /* can't be bothered to create a CONTACT_URI tag for this, so let's just
 385      * put into into GST_TAG_CONTACT, which is where it ends up when reading
 386      * the info from vorbis comments as well */
 387     *tag_name = GST_TAG_CONTACT;
 388   }
 389
 390   return link;
 391 }
 392
 393
 394 static gchar *
 395 parse_user_text_identification_frame (ID3TagsWorking * work,
 396     const gchar ** tag_name)
 397 {
 398   gchar *ret;
 399   guchar encoding;
 400   GArray *fields = NULL;
 401
 402   *tag_name = NULL;
 403
 404   if (work->parse_size < 2)
 405     return NULL;
 406
 407   encoding = work->parse_data[0];
 408
 409   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 410       work->parse_size - 1, &fields);
 411
 412   if (fields == NULL)
 413     return NULL;
 414
 415   if (fields->len != 2) {
 416     GST_WARNING ("Expected 2 fields in TXXX frame, but got %d", fields->len);
 417     free_tag_strings (fields);
 418     return NULL;
 419   }
 420
 421   *tag_name =
 422       gst_tag_from_id3_user_tag ("TXXX", g_array_index (fields, gchar *, 0));
 423
 424   GST_LOG ("TXXX frame of size %d. Mapped descriptor '%s' to GStreamer tag %s",
 425       work->parse_size - 1, g_array_index (fields, gchar *, 0),
 426       GST_STR_NULL (*tag_name));
 427
 428   if (*tag_name) {
 429     ret = g_strdup (g_array_index (fields, gchar *, 1));
 430     /* GST_LOG ("%s = %s", *tag_name, GST_STR_NULL (ret)); */
 431   } else {
 432     ret = NULL;
 433   }
 434
 435   free_tag_strings (fields);
 436   return ret;
 437 }
 438
 439 static gboolean
 440 parse_id_string (ID3TagsWorking * work, gchar ** p_str, gint * p_len,
 441     gint * p_datalen)
 442 {
 443   gint len, datalen;
 444
 445   if (work->parse_size < 2)
 446     return FALSE;
 447
 448   for (len = 0; len < work->parse_size - 1; ++len) {
 449     if (work->parse_data[len] == '\0')
 450       break;
 451   }
 452
 453   datalen = work->parse_size - (len + 1);
 454   if (len == 0 || datalen <= 0)
 455     return FALSE;
 456
 457   *p_str = g_strndup ((gchar *) work->parse_data, len);
 458   *p_len = len;
 459   *p_datalen = datalen;
 460
 461   return TRUE;
 462 }
 463
 464 static gchar *
 465 parse_unique_file_identifier (ID3TagsWorking * work, const gchar ** tag_name)
 466 {
 467   gint len, datalen;
 468   gchar *owner_id, *data, *ret = NULL;
 469
 470   GST_LOG ("parsing UFID frame of size %d", work->parse_size);
 471
 472   if (!parse_id_string (work, &owner_id, &len, &datalen))
 473     return NULL;
 474
 475   data = (gchar *) work->parse_data + len + 1;
 476   GST_LOG ("UFID owner ID: %s (+ %d bytes of data)", owner_id, datalen);
 477
 478   if (strcmp (owner_id, "http://musicbrainz.org") == 0 &&
 479       g_utf8_validate (data, datalen, NULL)) {
 480     *tag_name = GST_TAG_MUSICBRAINZ_TRACKID;
 481     ret = g_strndup (data, datalen);
 482   } else {
 483     GST_INFO ("Unknown UFID owner ID: %s", owner_id);
 484   }
 485   g_free (owner_id);
 486
 487   return ret;
 488 }
 489
 490 /* parse data and return length of the next string in the given encoding,
 491  * including the NUL terminator */
 492 static gint
 493 scan_encoded_string (guint8 encoding, gchar * data, gint data_size)
 494 {
 495   gint i;
 496
 497   switch (encoding) {
 498     case ID3V2_ENCODING_ISO8859:
 499     case ID3V2_ENCODING_UTF8:
 500       for (i = 0; i < data_size; ++i) {
 501         if (data[i] == '\0')
 502           return i + 1;
 503       }
 504       break;
 505     case ID3V2_ENCODING_UTF16:
 506     case ID3V2_ENCODING_UTF16BE:
 507       /* we don't care about BOMs here and treat them as part of the string */
 508       /* Find '\0\0' terminator */
 509       for (i = 0; i < data_size - 1; i += 2) {
 510         if (data[i] == '\0' && data[i + 1] == '\0')
 511           return i + 2;
 512       }
 513       break;
 514     default:
 515       break;
 516   }
 517
 518   return 0;
 519 }
 520
 521 static gboolean
 522 parse_picture_frame (ID3TagsWorking * work)
 523 {
 524   guint8 txt_encoding, pic_type;
 525   gchar *mime_str = NULL;
 526   gint len, datalen;
 527
 528   GST_LOG ("APIC frame (ID3v2.%u)", ID3V2_VER_MAJOR (work->hdr.version));
 529
 530   if (work->parse_size < 1 + 1 + 1 + 1 + 1)
 531     goto not_enough_data;
 532
 533   txt_encoding = work->parse_data[0];
 534   ++work->parse_data;
 535   --work->parse_size;
 536
 537   /* Read image format; in early ID3v2 versions this is a fixed-length
 538    * 3-character string without terminator; in later versions (>= 2.3.0)
 539    * this is a NUL-terminated string of variable length */
 540   if (ID3V2_VER_MAJOR (work->hdr.version) < 3) {
 541     if (work->parse_size < 3)
 542       goto not_enough_data;
 543
 544     mime_str = g_strndup ((gchar *) work->parse_data, 3);
 545     len = 3;
 546   } else {
 547     if (!parse_id_string (work, &mime_str, &len, &datalen))
 548       return FALSE;
 549     ++len;                      /* for string terminator */
 550   }
 551
 552   if (work->parse_size < len + 1 + 1 + 1)
 553     goto not_enough_data;
 554
 555   work->parse_data += len;
 556   work->parse_size -= len;
 557
 558   /* Read image type */
 559   pic_type = work->parse_data[0];
 560   ++work->parse_data;
 561   --work->parse_size;
 562
 563   GST_LOG ("APIC frame mime type    : %s", GST_STR_NULL (mime_str));
 564   GST_LOG ("APIC frame picture type : 0x%02x", (guint) pic_type);
 565
 566   if (work->parse_size < 1 + 1)
 567     goto not_enough_data;
 568
 569   len = scan_encoded_string (txt_encoding, (gchar *) work->parse_data,
 570       work->parse_size);
 571
 572   if (len < 1)
 573     goto error;
 574
 575   /* just skip the description string ... */
 576   GST_LOG ("Skipping description string (%d bytes in original coding)", len);
 577
 578   if (work->parse_size < len + 1)
 579     goto not_enough_data;
 580
 581   work->parse_data += len;
 582   work->parse_size -= len;
 583
 584   GST_DEBUG ("image data is %u bytes", work->parse_size);
 585
 586   if (work->parse_size <= 0)
 587     goto not_enough_data;
 588
 589   if (!gst_tag_list_add_id3_image (work->tags, (guint8 *) work->parse_data,
 590           work->parse_size, pic_type)) {
 591     goto error;
 592   }
 593
 594   g_free (mime_str);
 595   return TRUE;
 596
 597 not_enough_data:
 598   {
 599     GST_DEBUG ("not enough data, skipping APIC frame");
 600     /* fall through to error */
 601   }
 602 error:
 603   {
 604     GST_DEBUG ("problem parsing APIC frame, skipping");
 605     g_free (mime_str);
 606     return FALSE;
 607   }
 608 }
 609
 610 #define ID3V2_RVA2_CHANNEL_MASTER  1
 611
 612 static gboolean
 613 parse_relative_volume_adjustment_two (ID3TagsWorking * work)
 614 {
 615   const gchar *gain_tag_name = NULL;
 616   const gchar *peak_tag_name = NULL;
 617   gdouble gain_dB, peak_val;
 618   guint64 peak;
 619   guint8 *data, chan, peak_bits;
 620   gchar *id;
 621   gint len, datalen, i;
 622
 623   if (!parse_id_string (work, &id, &len, &datalen))
 624     return FALSE;
 625
 626   if (datalen < (1 + 2 + 1)) {
 627     GST_WARNING ("broken RVA2 frame, data size only %d bytes", datalen);
 628     g_free (id);
 629     return FALSE;
 630   }
 631
 632   data = work->parse_data + len + 1;
 633   chan = GST_READ_UINT8 (data);
 634   gain_dB = (gdouble) ((gint16) GST_READ_UINT16_BE (data + 1)) / 512.0;
 635   /* The meaning of the peak value is not defined in the ID3v2 spec. However,
 636    * the first/only implementation of this seems to have been in XMMS, and
 637    * other libs (like mutagen) seem to follow that implementation as well:
 638    * see http://bugs.xmms.org/attachment.cgi?id=113&action=view */
 639   peak_bits = GST_READ_UINT8 (data + 1 + 2);
 640   if (peak_bits > 64) {
 641     GST_WARNING ("silly peak precision of %d bits, ignoring", (gint) peak_bits);
 642     peak_bits = 0;
 643   }
 644   data += 1 + 2 + 1;
 645   datalen -= 1 + 2 + 1;
 646   if (peak_bits == 16) {
 647     peak = GST_READ_UINT16_BE (data);
 648   } else {
 649     peak = 0;
 650     for (i = 0; i < (GST_ROUND_UP_8 (peak_bits) / 8) && datalen > 0; ++i) {
 651       peak = peak << 8;
 652       peak |= GST_READ_UINT8 (data);
 653       ++data;
 654       --datalen;
 655     }
 656   }
 657
 658   peak = peak << (64 - GST_ROUND_UP_8 (peak_bits));
 659   peak_val =
 660       gst_guint64_to_gdouble (peak) / gst_util_guint64_to_gdouble (G_MAXINT64);
 661   GST_LOG ("RVA2 frame: id=%s, chan=%u, adj=%.2fdB, peak_bits=%u, peak=%.2f",
 662       id, chan, gain_dB, (guint) peak_bits, peak_val);
 663
 664   if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "track") == 0) {
 665     gain_tag_name = GST_TAG_TRACK_GAIN;
 666     peak_tag_name = GST_TAG_TRACK_PEAK;
 667   } else if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "album") == 0) {
 668     gain_tag_name = GST_TAG_ALBUM_GAIN;
 669     peak_tag_name = GST_TAG_ALBUM_PEAK;
 670   } else {
 671     GST_INFO ("Unhandled RVA2 frame id '%s' for channel %d", id, chan);
 672   }
 673
 674   if (gain_tag_name) {
 675     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 676         gain_tag_name, gain_dB, NULL);
 677   }
 678   if (peak_tag_name && peak_bits > 0) {
 679     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 680         peak_tag_name, peak_val, NULL);
 681   }
 682
 683   g_free (id);
 684
 685   return (gain_tag_name != NULL || peak_tag_name != NULL);
 686 }
 687
 688 static void
 689 parse_obsolete_tdat_frame (ID3TagsWorking * work)
 690 {
 691   if (work->parse_size >= 5 &&
 692       work->parse_data[0] == ID3V2_ENCODING_ISO8859 &&
 693       g_ascii_isdigit (work->parse_data[1]) &&
 694       g_ascii_isdigit (work->parse_data[2]) &&
 695       g_ascii_isdigit (work->parse_data[3]) &&
 696       g_ascii_isdigit (work->parse_data[4])) {
 697     work->pending_day = (10 * g_ascii_digit_value (work->parse_data[1])) +
 698         g_ascii_digit_value (work->parse_data[2]);
 699     work->pending_month = (10 * g_ascii_digit_value (work->parse_data[3])) +
 700         g_ascii_digit_value (work->parse_data[4]);
 701     GST_LOG ("date (dd/mm) %02u/%02u", work->pending_day, work->pending_month);
 702   }
 703 }
 704
 705 static gboolean
 706 id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 707     const gchar * tag_str)
 708 {
 709   GType tag_type = gst_tag_get_type (tag_name);
 710   GstTagList *tag_list = work->tags;
 711
 712   if (tag_str == NULL)
 713     return FALSE;
 714
 715   switch (tag_type) {
 716     case G_TYPE_UINT:
 717     {
 718       gint current, total;
 719
 720       if (sscanf (tag_str, "%d/%d", &current, &total) == 2) {
 721         if (total <= 0) {
 722           GST_WARNING ("Ignoring invalid value for total %d in tag %s",
 723               total, tag_name);
 724         } else {
 725           if (strcmp (tag_name, GST_TAG_TRACK_NUMBER) == 0) {
 726             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 727                 GST_TAG_TRACK_COUNT, total, NULL);
 728           } else if (strcmp (tag_name, GST_TAG_ALBUM_VOLUME_NUMBER) == 0) {
 729             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 730                 GST_TAG_ALBUM_VOLUME_COUNT, total, NULL);
 731           }
 732         }
 733       } else if (sscanf (tag_str, "%d", &current) != 1) {
 734         /* Not an integer in the string */
 735         GST_WARNING ("Tag string for tag %s does not contain an integer - "
 736             "ignoring", tag_name);
 737         break;
 738       }
 739
 740       if (current <= 0) {
 741         GST_WARNING ("Ignoring invalid value %d in tag %s", current, tag_name);
 742       } else {
 743         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND, tag_name, current,
 744             NULL);
 745       }
 746       break;
 747     }
 748     case G_TYPE_UINT64:
 749     {
 750       guint64 tmp;
 751
 752       g_assert (strcmp (tag_name, GST_TAG_DURATION) == 0);
 753       tmp = strtoul (tag_str, NULL, 10);
 754       if (tmp == 0) {
 755         break;
 756       }
 757       gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 758           GST_TAG_DURATION, tmp * 1000 * 1000, NULL);
 759       break;
 760     }
 761     case G_TYPE_STRING:{
 762       const GValue *val;
 763       guint i, num;
 764
 765       /* make sure we add each unique string only once per tag, we don't want
 766        * to have the same genre in the genre list multiple times, for example,
 767        * or the same DiscID in there twice just because it's contained in the
 768        * tag multiple times under different TXXX user tags */
 769       num = gst_tag_list_get_tag_size (tag_list, tag_name);
 770       for (i = 0; i < num; ++i) {
 771         val = gst_tag_list_get_value_index (tag_list, tag_name, i);
 772         if (val != NULL && strcmp (g_value_get_string (val), tag_str) == 0)
 773           break;
 774       }
 775       if (i == num) {
 776         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 777             tag_name, tag_str, NULL);
 778       }
 779       break;
 780     }
 781
 782     default:{
 783       gchar *tmp = NULL;
 784       GValue src = { 0, };
 785       GValue dest = { 0, };
 786
 787       /* Ensure that any date string is complete */
 788       if (tag_type == GST_TYPE_DATE) {
 789         guint year = 1901, month = 1, day = 1;
 790
 791         /* Dates can be yyyy-MM-dd, yyyy-MM or yyyy, but we need
 792          * the first type */
 793         if (sscanf (tag_str, "%04u-%02u-%02u", &year, &month, &day) == 0)
 794           break;
 795
 796         tmp = g_strdup_printf ("%04u-%02u-%02u", year, month, day);
 797         tag_str = tmp;
 798       }
 799
 800       /* handles anything else */
 801       g_value_init (&src, G_TYPE_STRING);
 802       g_value_set_string (&src, (const gchar *) tag_str);
 803       g_value_init (&dest, tag_type);
 804
 805       if (g_value_transform (&src, &dest)) {
 806         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_APPEND,
 807             tag_name, &dest, NULL);
 808       } else if (tag_type == G_TYPE_DOUBLE) {
 809         /* replaygain tags in TXXX frames ... */
 810         g_value_set_double (&dest, g_strtod (tag_str, NULL));
 811         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_KEEP,
 812             tag_name, &dest, NULL);
 813         GST_LOG ("Converted string '%s' to double %f", tag_str,
 814             g_value_get_double (&dest));
 815       } else {
 816         GST_WARNING ("Failed to transform tag from string to type '%s'",
 817             g_type_name (tag_type));
 818       }
 819
 820       g_value_unset (&src);
 821       g_value_unset (&dest);
 822       g_free (tmp);
 823       break;
 824     }
 825   }
 826
 827   return TRUE;
 828 }
 829
 830 /* Check that an array of characters contains only digits */
 831 static gboolean
 832 id3v2_are_digits (const gchar * chars, gint size)
 833 {
 834   gint i;
 835
 836   for (i = 0; i < size; i++) {
 837     if (!g_ascii_isdigit (chars[i]))
 838       return FALSE;
 839   }
 840   return TRUE;
 841 }
 842
 843 static gboolean
 844 id3v2_genre_string_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 845     const gchar * tag_str, gint len)
 846 {
 847   g_return_val_if_fail (tag_str != NULL, FALSE);
 848
 849   /* If it's a number, it might be a defined genre */
 850   if (id3v2_are_digits (tag_str, len)) {
 851     tag_str = gst_tag_id3_genre_get (strtol (tag_str, NULL, 10));
 852     return id3v2_tag_to_taglist (work, tag_name, tag_str);
 853   }
 854   /* Otherwise it might be "RX" or "CR" */
 855   if (len == 2) {
 856     if (g_ascii_strncasecmp ("rx", tag_str, len) == 0)
 857       return id3v2_tag_to_taglist (work, tag_name, "Remix");
 858
 859     if (g_ascii_strncasecmp ("cr", tag_str, len) == 0)
 860       return id3v2_tag_to_taglist (work, tag_name, "Cover");
 861   }
 862
 863   /* Otherwise it's a string */
 864   return id3v2_tag_to_taglist (work, tag_name, tag_str);
 865 }
 866
 867 static gboolean
 868 id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 869     GArray * tag_fields)
 870 {
 871   gchar *tag_str = NULL;
 872   gboolean result = FALSE;
 873   gint i;
 874
 875   for (i = 0; i < tag_fields->len; i++) {
 876     gint len;
 877
 878     tag_str = g_array_index (tag_fields, gchar *, i);
 879     if (tag_str == NULL)
 880       continue;
 881
 882     len = strlen (tag_str);
 883     /* Only supposed to see '(n)' type numeric genre strings in ID3 <= 2.3.0
 884      * but apparently we see them in 2.4.0 sometimes too */
 885     if (TRUE || work->hdr.version <= 0x300) {   /* <= 2.3.0 */
 886       /* Check for genre numbers wrapped in parentheses, possibly
 887        * followed by a string */
 888       while (len >= 2) {
 889         gint pos;
 890         gboolean found = FALSE;
 891
 892         /* Double parenthesis ends the numeric genres, but we need
 893          * to swallow the first one so we actually output '(' */
 894         if (tag_str[0] == '(' && tag_str[1] == '(') {
 895           tag_str++;
 896           len--;
 897           break;
 898         }
 899
 900         /* If the first char is not a parenthesis, then stop
 901          * looking for parenthesised genre strings */
 902         if (tag_str[0] != '(')
 903           break;
 904
 905         for (pos = 1; pos < len; pos++) {
 906           if (tag_str[pos] == ')') {
 907             gchar *tmp_str;
 908
 909             tmp_str = g_strndup (tag_str + 1, pos - 1);
 910             result |=
 911                 id3v2_genre_string_to_taglist (work, tag_name, tmp_str,
 912                 pos - 1);
 913             g_free (tmp_str);
 914             tag_str += pos + 1;
 915             len -= pos + 1;
 916             found = TRUE;
 917             break;
 918           }
 919
 920           /* If we encounter a non-digit while searching for a closing
 921            * parenthesis, we should not try and interpret this as a
 922            * numeric genre string */
 923           if (!g_ascii_isdigit (tag_str[pos]))
 924             break;
 925         }
 926         if (!found)
 927           break;                /* There was no closing parenthesis */
 928       }
 929     }
 930
 931     if (len > 0 && tag_str != NULL)
 932       result |= id3v2_genre_string_to_taglist (work, tag_name, tag_str, len);
 933   }
 934   return result;
 935 }
 936
/* Names of the UTF-16 variants that are passed to g_convert() as the source
 * encoding. The parsing code compares these by address
 * (e.g. in_encode == utf16beenc), so always refer to these arrays instead of
 * equivalent string literals. */
static const gchar utf16enc[] = "UTF-16";
static const gchar utf16leenc[] = "UTF-16LE";
static const gchar utf16beenc[] = "UTF-16BE";
 940
 941 static gboolean
 942 find_utf16_bom (gchar * data, const gchar ** p_in_encoding)
 943 {
 944   guint16 marker = (GST_READ_UINT8 (data) << 8) | GST_READ_UINT8 (data + 1);
 945
 946   switch (marker) {
 947     case 0xFFFE:
 948       *p_in_encoding = utf16leenc;
 949       return TRUE;
 950     case 0xFEFF:
 951       *p_in_encoding = utf16beenc;
 952       return TRUE;
 953     default:
 954       break;
 955   }
 956   return FALSE;
 957 }
 958
 959 static void *
 960 string_utf8_dup (const gchar * start, const guint size)
 961 {
 962   const gchar *env;
 963   gsize bytes_read;
 964   gchar *utf8;
 965
 966   /* Should we try the charsets specified
 967    * via environment variables FIRST ? */
 968   if (g_utf8_validate (start, size, NULL)) {
 969     utf8 = g_strndup (start, size);
 970     goto beach;
 971   }
 972
 973   env = g_getenv ("GST_ID3V1_TAG_ENCODING");
 974   if (!env || *env == '\0')
 975     env = g_getenv ("GST_ID3_TAG_ENCODING");
 976   if (!env || *env == '\0')
 977     env = g_getenv ("GST_TAG_ENCODING");
 978
 979   /* Try charsets specified via the environment */
 980   if (env && *env != '\0') {
 981     gchar **c, **csets;
 982
 983     csets = g_strsplit (env, G_SEARCHPATH_SEPARATOR_S, -1);
 984
 985     for (c = csets; c && *c; ++c) {
 986       if ((utf8 =
 987               g_convert (start, size, "UTF-8", *c, &bytes_read, NULL, NULL))) {
 988         if (bytes_read == size) {
 989           GST_DEBUG ("Using charset %s to interperate id3 tags\n", *c);
 990           g_strfreev (csets);
 991           goto beach;
 992         }
 993         g_free (utf8);
 994         utf8 = NULL;
 995       }
 996     }
 997   }
 998   /* Try current locale (if not UTF-8) */
 999   if (!g_get_charset (&env)) {
1000     if ((utf8 = g_locale_to_utf8 (start, size, &bytes_read, NULL, NULL))) {
1001       if (bytes_read == size) {
1002         goto beach;
1003       }
1004       g_free (utf8);
1005       utf8 = NULL;
1006     }
1007   }
1008
1009   /* Try ISO-8859-1 */
1010   utf8 =
1011       g_convert (start, size, "UTF-8", "ISO-8859-1", &bytes_read, NULL, NULL);
1012   if (utf8 != NULL && bytes_read == size) {
1013     goto beach;
1014   }
1015
1016   g_free (utf8);
1017   return NULL;
1018
1019 beach:
1020
1021   g_strchomp (utf8);
1022
1023   return (utf8);
1024 }
1025
1026 static void
1027 parse_insert_string_field (guint8 encoding, gchar * data, gint data_size,
1028     GArray * fields)
1029 {
1030   gchar *field = NULL;
1031
1032   switch (encoding) {
1033     case ID3V2_ENCODING_UTF16:
1034     case ID3V2_ENCODING_UTF16BE:
1035     {
1036       const gchar *in_encode;
1037
1038       if (encoding == ID3V2_ENCODING_UTF16)
1039         in_encode = utf16enc;
1040       else
1041         in_encode = utf16beenc;
1042
1043       /* Sometimes we see strings with multiple BOM markers at the start.
1044        * In that case, we assume the innermost one is correct. If that fails
1045        * to produce valid UTF-8, we try the other endianness anyway */
1046       while (data_size > 2 && find_utf16_bom (data, &in_encode)) {
1047         data += 2;              /* skip BOM */
1048         data_size -= 2;
1049       }
1050
1051       field = g_convert (data, data_size, "UTF-8", in_encode, NULL, NULL, NULL);
1052
1053       if (field == NULL || g_utf8_validate (field, -1, NULL) == FALSE) {
1054         /* As a fallback, try interpreting UTF-16 in the other endianness */
1055         if (in_encode == utf16beenc)
1056           field = g_convert (data, data_size, "UTF-8", utf16leenc,
1057               NULL, NULL, NULL);
1058       }
1059     }
1060
1061       break;
1062     case ID3V2_ENCODING_ISO8859:
1063       if (g_utf8_validate (data, data_size, NULL))
1064         field = g_strndup (data, data_size);
1065       else
1066         /* field = g_convert (data, data_size, "UTF-8", "ISO-8859-1",
1067            NULL, NULL, NULL); */
1068         field = string_utf8_dup (data, data_size);
1069       break;
1070     default:
1071       field = g_strndup (data, data_size);
1072       break;
1073   }
1074
1075   if (field) {
1076     if (g_utf8_validate (field, -1, NULL)) {
1077       g_array_append_val (fields, field);
1078       return;
1079     }
1080
1081     GST_DEBUG ("%s was bad UTF-8 after conversion from encoding %d. Ignoring",
1082         field, encoding);
1083     g_free (field);
1084   }
1085 }
1086
1087 static void
1088 parse_split_strings (guint8 encoding, gchar * data, gint data_size,
1089     GArray ** out_fields)
1090 {
1091   GArray *fields = g_array_new (FALSE, TRUE, sizeof (gchar *));
1092   gint text_pos;
1093   gint prev = 0;
1094
1095   g_return_if_fail (out_fields != NULL);
1096
1097   switch (encoding) {
1098     case ID3V2_ENCODING_ISO8859:
1099       for (text_pos = 0; text_pos < data_size; text_pos++) {
1100         if (data[text_pos] == 0) {
1101           parse_insert_string_field (encoding, data + prev,
1102               text_pos - prev + 1, fields);
1103           prev = text_pos + 1;
1104         }
1105       }
1106       if (data_size - prev > 0 && data[prev] != 0x00) {
1107         parse_insert_string_field (encoding, data + prev,
1108             data_size - prev, fields);
1109       }
1110
1111       break;
1112     case ID3V2_ENCODING_UTF8:
1113       for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
1114         if (data[text_pos] == '\0') {
1115           parse_insert_string_field (encoding, data + prev,
1116               text_pos - prev + 1, fields);
1117           prev = text_pos + 1;
1118         }
1119       }
1120       if (data_size - prev > 0 && data[prev] != 0x00) {
1121         parse_insert_string_field (encoding, data + prev,
1122             data_size - prev, fields);
1123       }
1124       break;
1125     case ID3V2_ENCODING_UTF16:
1126     case ID3V2_ENCODING_UTF16BE:
1127     {
1128       /* Find '\0\0' terminator */
1129       for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
1130         if (data[text_pos] == '\0' && data[text_pos + 1] == '\0') {
1131           /* found a delimiter */
1132           parse_insert_string_field (encoding, data + prev,
1133               text_pos - prev + 2, fields);
1134           text_pos++;           /* Advance to the 2nd NULL terminator */
1135           prev = text_pos + 1;
1136           break;
1137         }
1138       }
1139       if (data_size - prev > 1 &&
1140           (data[prev] != 0x00 || data[prev + 1] != 0x00)) {
1141         /* There were 2 or more non-null chars left, convert those too */
1142         parse_insert_string_field (encoding, data + prev,
1143             data_size - prev, fields);
1144       }
1145       break;
1146     }
1147   }
1148   if (fields->len > 0)
1149     *out_fields = fields;
1150   else
1151     g_array_free (fields, TRUE);
1152 }
1153
1154 static void
1155 free_tag_strings (GArray * fields)
1156 {
1157   if (fields) {
1158     gint i;
1159     gchar *c;
1160
1161     for (i = 0; i < fields->len; i++) {
1162       c = g_array_index (fields, gchar *, i);
1163       g_free (c);
1164     }
1165     g_array_free (fields, TRUE);
1166   }
1167 }