gst/id3demux/id3v2frames.c

   1 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: t; c-basic-offset: 2 -*- */
   2 /* Copyright 2006-2008 Tim-Philipp Müller <tim centricular net>
   3  * Copyright 2005 Jan Schmidt <thaytan@mad.scientist.com>
   4  * Copyright 2002,2003 Scott Wheeler <wheeler@kde.org> (portions from taglib)
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Library General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Library General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Library General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include "config.h"
  24 #endif
  25
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <gst/tag/tag.h>
  30 #include <gst/base/gsttypefindhelper.h>
  31
  32 #ifdef HAVE_ZLIB
  33 #include <zlib.h>
  34 #endif
  35
  36 #include "id3tags.h"
  37
  38 GST_DEBUG_CATEGORY_EXTERN (id3demux_debug);
  39 #define GST_CAT_DEFAULT (id3demux_debug)
  40
  41 static gboolean parse_comment_frame (ID3TagsWorking * work);
  42 static gchar *parse_url_link_frame (ID3TagsWorking * work,
  43     const gchar ** tag_name);
  44 static GArray *parse_text_identification_frame (ID3TagsWorking * work);
  45 static gchar *parse_user_text_identification_frame (ID3TagsWorking * work,
  46     const gchar ** tag_name);
  47 static gchar *parse_unique_file_identifier (ID3TagsWorking * work,
  48     const gchar ** tag_name);
  49 static gboolean parse_relative_volume_adjustment_two (ID3TagsWorking * work);
  50 static void parse_obsolete_tdat_frame (ID3TagsWorking * work);
  51 static gboolean id3v2_tag_to_taglist (ID3TagsWorking * work,
  52     const gchar * tag_name, const gchar * tag_str);
  53 /* Parse a single string into an array of gchar* */
  54 static void parse_split_strings (guint8 encoding, gchar * data, gint data_size,
  55     GArray ** out_fields);
  56 static void free_tag_strings (GArray * fields);
  57 static gboolean
  58 id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
  59     GArray * tag_fields);
  60 static gboolean parse_picture_frame (ID3TagsWorking * work);
  61
  62 #define ID3V2_ENCODING_ISO8859 0x00
  63 #define ID3V2_ENCODING_UTF16   0x01
  64 #define ID3V2_ENCODING_UTF16BE 0x02
  65 #define ID3V2_ENCODING_UTF8    0x03
  66
  67 gboolean
  68 id3demux_id3v2_parse_frame (ID3TagsWorking * work)
  69 {
  70   const gchar *tag_name;
  71   gboolean result = FALSE;
  72   gint i;
  73   guint8 *frame_data = work->hdr.frame_data;
  74   guint frame_data_size = work->cur_frame_size;
  75   gchar *tag_str = NULL;
  76   GArray *tag_fields = NULL;
  77   guint8 *uu_data = NULL;
  78
  79 #ifdef HAVE_ZLIB
  80   guint8 *uncompressed_data = NULL;
  81 #endif
  82
  83   /* Check that the frame id is valid */
  84   for (i = 0; i < 5 && work->frame_id[i] != '\0'; i++) {
  85     if (!g_ascii_isalnum (work->frame_id[i])) {
  86       GST_DEBUG ("Encountered invalid frame_id");
  87       return FALSE;
  88     }
  89   }
  90
  91   /* Can't handle encrypted frames right now (in case we ever do, we'll have
  92    * to do the decryption after the un-unsynchronisation and decompression,
  93    * not here) */
  94   if (work->frame_flags & ID3V2_FRAME_FORMAT_ENCRYPTION) {
  95     GST_WARNING ("Encrypted frames are not supported");
  96     return FALSE;
  97   }
  98
  99   tag_name = gst_tag_from_id3_tag (work->frame_id);
 100   if (tag_name == NULL &&
 101       strncmp (work->frame_id, "RVA2", 4) != 0 &&
 102       strncmp (work->frame_id, "TXXX", 4) != 0 &&
 103       strncmp (work->frame_id, "TDAT", 4) != 0 &&
 104       strncmp (work->frame_id, "UFID", 4) != 0) {
 105     return FALSE;
 106   }
 107
 108   if (work->frame_flags & (ID3V2_FRAME_FORMAT_COMPRESSION |
 109           ID3V2_FRAME_FORMAT_DATA_LENGTH_INDICATOR)) {
 110     if (work->hdr.frame_data_size <= 4)
 111       return FALSE;
 112     if (ID3V2_VER_MAJOR (work->hdr.version) == 3) {
 113       work->parse_size = GST_READ_UINT32_BE (frame_data);
 114     } else {
 115       work->parse_size = read_synch_uint (frame_data, 4);
 116     }
 117     frame_data += 4;
 118     frame_data_size -= 4;
 119     if (work->parse_size < frame_data_size) {
 120       GST_WARNING ("ID3v2 frame %s has invalid size %d.", tag_name,
 121           frame_data_size);
 122       return FALSE;
 123     }
 124   }
 125
 126   /* in v2.3 the frame sizes are not syncsafe, so the entire tag had to be
 127    * unsynced. In v2.4 the frame sizes are syncsafe so it's just the frame
 128    * data that needs un-unsyncing, but not the frame headers. */
 129   if (ID3V2_VER_MAJOR (work->hdr.version) == 4) {
 130     if ((work->hdr.flags & ID3V2_HDR_FLAG_UNSYNC) != 0 ||
 131         ((work->frame_flags & ID3V2_FRAME_FORMAT_UNSYNCHRONISATION) != 0)) {
 132       GST_DEBUG ("Un-unsyncing frame %s", work->frame_id);
 133       uu_data = id3demux_ununsync_data (frame_data, &frame_data_size);
 134       frame_data = uu_data;
 135       GST_MEMDUMP ("ID3v2 frame (un-unsyced)", frame_data, frame_data_size);
 136     }
 137   }
 138
 139   work->parse_size = frame_data_size;
 140
 141   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 142 #ifdef HAVE_ZLIB
 143     uLongf destSize = work->parse_size;
 144     Bytef *dest, *src;
 145
 146     uncompressed_data = g_malloc (work->parse_size);
 147
 148     dest = (Bytef *) uncompressed_data;
 149     src = (Bytef *) frame_data;
 150
 151     if (uncompress (dest, &destSize, src, frame_data_size) != Z_OK) {
 152       g_free (uncompressed_data);
 153       g_free (uu_data);
 154       return FALSE;
 155     }
 156     if (destSize != work->parse_size) {
 157       GST_WARNING
 158           ("Decompressing ID3v2 frame %s did not produce expected size %d bytes (got %lu)",
 159           tag_name, work->parse_size, destSize);
 160       g_free (uncompressed_data);
 161       g_free (uu_data);
 162       return FALSE;
 163     }
 164     work->parse_data = uncompressed_data;
 165 #else
 166     GST_WARNING ("Compressed ID3v2 tag frame could not be decompressed"
 167         " because gstid3demux was compiled without zlib support");
 168     g_free (uu_data);
 169     return FALSE;
 170 #endif
 171   } else {
 172     work->parse_data = frame_data;
 173   }
 174
 175   if (work->frame_id[0] == 'T') {
 176     if (strcmp (work->frame_id, "TDAT") == 0) {
 177       parse_obsolete_tdat_frame (work);
 178       result = TRUE;
 179     } else if (strcmp (work->frame_id, "TXXX") == 0) {
 180       /* Handle user text frame */
 181       tag_str = parse_user_text_identification_frame (work, &tag_name);
 182     } else {
 183       /* Text identification frame */
 184       tag_fields = parse_text_identification_frame (work);
 185     }
 186   } else if (work->frame_id[0] == 'W' && strcmp (work->frame_id, "WXXX") != 0) {
 187     /* URL link frame: ISO-8859-1 encoded, one frame per tag */
 188     tag_str = parse_url_link_frame (work, &tag_name);
 189   } else if (!strcmp (work->frame_id, "COMM")) {
 190     /* Comment */
 191     result = parse_comment_frame (work);
 192   } else if (!strcmp (work->frame_id, "APIC")) {
 193     /* Attached picture */
 194     result = parse_picture_frame (work);
 195   } else if (!strcmp (work->frame_id, "RVA2")) {
 196     /* Relative volume */
 197     result = parse_relative_volume_adjustment_two (work);
 198   } else if (!strcmp (work->frame_id, "UFID")) {
 199     /* Unique file identifier */
 200     tag_str = parse_unique_file_identifier (work, &tag_name);
 201   }
 202 #ifdef HAVE_ZLIB
 203   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 204     g_free (uncompressed_data);
 205     uncompressed_data = NULL;
 206     work->parse_data = frame_data;
 207   }
 208 #endif
 209
 210   if (tag_str != NULL) {
 211     /* g_print ("Tag %s value %s\n", tag_name, tag_str); */
 212     result = id3v2_tag_to_taglist (work, tag_name, tag_str);
 213     g_free (tag_str);
 214   }
 215   if (tag_fields != NULL) {
 216     if (strcmp (work->frame_id, "TCON") == 0) {
 217       /* Genre strings need special treatment */
 218       result |= id3v2_genre_fields_to_taglist (work, tag_name, tag_fields);
 219     } else {
 220       gint t;
 221
 222       for (t = 0; t < tag_fields->len; t++) {
 223         tag_str = g_array_index (tag_fields, gchar *, t);
 224         if (tag_str != NULL && tag_str[0] != '\0')
 225           result |= id3v2_tag_to_taglist (work, tag_name, tag_str);
 226       }
 227     }
 228     free_tag_strings (tag_fields);
 229   }
 230
 231   g_free (uu_data);
 232
 233   return result;
 234 }
 235
 236 static gboolean
 237 parse_comment_frame (ID3TagsWorking * work)
 238 {
 239   guint dummy;
 240   guint8 encoding;
 241   gchar language[4];
 242   GArray *fields = NULL;
 243   gchar *description, *text;
 244
 245   if (work->parse_size < 6)
 246     return FALSE;
 247
 248   encoding = work->parse_data[0];
 249   language[0] = g_ascii_tolower (work->parse_data[1]);
 250   language[1] = g_ascii_tolower (work->parse_data[2]);
 251   language[2] = g_ascii_tolower (work->parse_data[3]);
 252   language[3] = '\0';
 253
 254   parse_split_strings (encoding, (gchar *) work->parse_data + 4,
 255       work->parse_size - 4, &fields);
 256
 257   if (fields == NULL || fields->len < 2) {
 258     GST_WARNING ("Failed to decode comment frame");
 259     goto fail;
 260   }
 261   description = g_array_index (fields, gchar *, 0);
 262   text = g_array_index (fields, gchar *, 1);
 263
 264   if (!g_utf8_validate (text, -1, NULL)) {
 265     GST_WARNING ("Converted string is not valid utf-8");
 266     goto fail;
 267   }
 268
 269   /* skip our own dummy descriptions (from id3v2mux) */
 270   if (strlen (description) > 0 && g_utf8_validate (description, -1, NULL) &&
 271       sscanf (description, "c%u", &dummy) != 1) {
 272     gchar *s;
 273
 274     /* must be either an ISO-639-1 or ISO-639-2 language code */
 275     if (language[0] != '\0' &&
 276         g_ascii_isalpha (language[0]) &&
 277         g_ascii_isalpha (language[1]) &&
 278         (g_ascii_isalpha (language[2]) || language[2] == '\0')) {
 279       s = g_strdup_printf ("%s[%s]=%s", description, language, text);
 280     } else {
 281       s = g_strdup_printf ("%s=%s", description, text);
 282     }
 283     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 284         GST_TAG_EXTENDED_COMMENT, s, NULL);
 285     g_free (s);
 286   } else if (text != NULL && *text != '\0') {
 287     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 288         GST_TAG_COMMENT, text, NULL);
 289   } else {
 290     goto fail;
 291   }
 292
 293   free_tag_strings (fields);
 294   return TRUE;
 295
 296 fail:
 297   {
 298     GST_WARNING ("failed to parse COMM frame");
 299     free_tag_strings (fields);
 300     return FALSE;
 301   }
 302 }
 303
 304 static GArray *
 305 parse_text_identification_frame (ID3TagsWorking * work)
 306 {
 307   guchar encoding;
 308   GArray *fields = NULL;
 309
 310   if (work->parse_size < 2)
 311     return NULL;
 312
 313   encoding = work->parse_data[0];
 314   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 315       work->parse_size - 1, &fields);
 316   if (fields) {
 317     if (fields->len > 0) {
 318       GST_LOG ("Read %d fields from Text ID frame of size %d with encoding %d"
 319           ". First is '%s'", fields->len, work->parse_size - 1, encoding,
 320           g_array_index (fields, gchar *, 0));
 321     } else {
 322       GST_LOG ("Read 0 fields from Text ID frame of size %d with encoding %d",
 323           work->parse_size - 1, encoding);
 324     }
 325   }
 326
 327   return fields;
 328 }
 329
 330 static gboolean
 331 link_is_known_license (const gchar * url)
 332 {
 333   return g_str_has_prefix (url, "http://creativecommons.org/licenses/");
 334 }
 335
 336 static gchar *
 337 parse_url_link_frame (ID3TagsWorking * work, const gchar ** tag_name)
 338 {
 339   gsize len;
 340   gchar *nul, *data, *link;
 341
 342   *tag_name = NULL;
 343
 344   if (work->parse_size == 0)
 345     return NULL;
 346
 347   data = (gchar *) work->parse_data;
 348   /* if there's more data then the string is long, we only want to parse the
 349    * data up to the terminating zero to g_convert and ignore the rest, as
 350    * per spec */
 351   nul = memchr (data, '\0', work->parse_size);
 352   if (nul != NULL) {
 353     len = (gsize) (nul - data);
 354   } else {
 355     len = work->parse_size;
 356   }
 357
 358   link = g_convert (data, len, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
 359
 360   if (link == NULL || !gst_uri_is_valid (link)) {
 361     GST_DEBUG ("Invalid URI in %s frame: %s", work->frame_id,
 362         GST_STR_NULL (link));
 363     g_free (link);
 364     return NULL;
 365   }
 366
 367   /* we don't know if it's a link to a page that explains the copyright
 368    * situation, or a link that points to/represents a license, the ID3 spec
 369    * does not separate those two things; for now only put known license URIs
 370    * into GST_TAG_LICENSE_URI and everything else into GST_TAG_COPYRIGHT_URI */
 371   if (strcmp (work->frame_id, "WCOP") == 0) {
 372     if (link_is_known_license (link))
 373       *tag_name = GST_TAG_LICENSE_URI;
 374     else
 375       *tag_name = GST_TAG_COPYRIGHT_URI;
 376   } else if (strcmp (work->frame_id, "WOAF") == 0) {
 377     /* can't be bothered to create a CONTACT_URI tag for this, so let's just
 378      * put into into GST_TAG_CONTACT, which is where it ends up when reading
 379      * the info from vorbis comments as well */
 380     *tag_name = GST_TAG_CONTACT;
 381   }
 382
 383   return link;
 384 }
 385
 386
 387 static gchar *
 388 parse_user_text_identification_frame (ID3TagsWorking * work,
 389     const gchar ** tag_name)
 390 {
 391   gchar *ret;
 392   guchar encoding;
 393   GArray *fields = NULL;
 394
 395   *tag_name = NULL;
 396
 397   if (work->parse_size < 2)
 398     return NULL;
 399
 400   encoding = work->parse_data[0];
 401
 402   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 403       work->parse_size - 1, &fields);
 404
 405   if (fields == NULL)
 406     return NULL;
 407
 408   if (fields->len != 2) {
 409     GST_WARNING ("Expected 2 fields in TXXX frame, but got %d", fields->len);
 410     free_tag_strings (fields);
 411     return NULL;
 412   }
 413
 414   *tag_name =
 415       gst_tag_from_id3_user_tag ("TXXX", g_array_index (fields, gchar *, 0));
 416
 417   GST_LOG ("TXXX frame of size %d. Mapped descriptor '%s' to GStreamer tag %s",
 418       work->parse_size - 1, g_array_index (fields, gchar *, 0),
 419       GST_STR_NULL (*tag_name));
 420
 421   if (*tag_name) {
 422     ret = g_strdup (g_array_index (fields, gchar *, 1));
 423     /* GST_LOG ("%s = %s", *tag_name, GST_STR_NULL (ret)); */
 424   } else {
 425     ret = NULL;
 426   }
 427
 428   free_tag_strings (fields);
 429   return ret;
 430 }
 431
 432 static gboolean
 433 parse_id_string (ID3TagsWorking * work, gchar ** p_str, gint * p_len,
 434     gint * p_datalen)
 435 {
 436   gint len, datalen;
 437
 438   if (work->parse_size < 2)
 439     return FALSE;
 440
 441   for (len = 0; len < work->parse_size - 1; ++len) {
 442     if (work->parse_data[len] == '\0')
 443       break;
 444   }
 445
 446   datalen = work->parse_size - (len + 1);
 447   if (len == 0 || datalen <= 0)
 448     return FALSE;
 449
 450   *p_str = g_strndup ((gchar *) work->parse_data, len);
 451   *p_len = len;
 452   *p_datalen = datalen;
 453
 454   return TRUE;
 455 }
 456
 457 static gchar *
 458 parse_unique_file_identifier (ID3TagsWorking * work, const gchar ** tag_name)
 459 {
 460   gint len, datalen;
 461   gchar *owner_id, *data, *ret = NULL;
 462
 463   GST_LOG ("parsing UFID frame of size %d", work->parse_size);
 464
 465   if (!parse_id_string (work, &owner_id, &len, &datalen))
 466     return NULL;
 467
 468   data = (gchar *) work->parse_data + len + 1;
 469   GST_LOG ("UFID owner ID: %s (+ %d bytes of data)", owner_id, datalen);
 470
 471   if (strcmp (owner_id, "http://musicbrainz.org") == 0 &&
 472       g_utf8_validate (data, datalen, NULL)) {
 473     *tag_name = GST_TAG_MUSICBRAINZ_TRACKID;
 474     ret = g_strndup (data, datalen);
 475   } else {
 476     GST_INFO ("Unknown UFID owner ID: %s", owner_id);
 477   }
 478   g_free (owner_id);
 479
 480   return ret;
 481 }
 482
 483 /* parse data and return length of the next string in the given encoding,
 484  * including the NUL terminator */
 485 static gint
 486 scan_encoded_string (guint8 encoding, gchar * data, gint data_size)
 487 {
 488   gint i;
 489
 490   switch (encoding) {
 491     case ID3V2_ENCODING_ISO8859:
 492     case ID3V2_ENCODING_UTF8:
 493       for (i = 0; i < data_size; ++i) {
 494         if (data[i] == '\0')
 495           return i + 1;
 496       }
 497       break;
 498     case ID3V2_ENCODING_UTF16:
 499     case ID3V2_ENCODING_UTF16BE:
 500       /* we don't care about BOMs here and treat them as part of the string */
 501       /* Find '\0\0' terminator */
 502       for (i = 0; i < data_size - 1; i += 2) {
 503         if (data[i] == '\0' && data[i + 1] == '\0')
 504           return i + 2;
 505       }
 506       break;
 507     default:
 508       break;
 509   }
 510
 511   return 0;
 512 }
 513
 514 static gboolean
 515 parse_picture_frame (ID3TagsWorking * work)
 516 {
 517   guint8 txt_encoding, pic_type;
 518   gchar *mime_str = NULL;
 519   gint len, datalen;
 520
 521   GST_LOG ("APIC frame (ID3v2.%u)", ID3V2_VER_MAJOR (work->hdr.version));
 522
 523   if (work->parse_size < 1 + 1 + 1 + 1 + 1)
 524     goto not_enough_data;
 525
 526   txt_encoding = work->parse_data[0];
 527   ++work->parse_data;
 528   --work->parse_size;
 529
 530   /* Read image format; in early ID3v2 versions this is a fixed-length
 531    * 3-character string without terminator; in later versions (>= 2.3.0)
 532    * this is a NUL-terminated string of variable length */
 533   if (ID3V2_VER_MAJOR (work->hdr.version) < 3) {
 534     if (work->parse_size < 3)
 535       goto not_enough_data;
 536
 537     mime_str = g_strndup ((gchar *) work->parse_data, 3);
 538     len = 3;
 539   } else {
 540     if (!parse_id_string (work, &mime_str, &len, &datalen))
 541       return FALSE;
 542     ++len;                      /* for string terminator */
 543   }
 544
 545   if (work->parse_size < len + 1 + 1 + 1)
 546     goto not_enough_data;
 547
 548   work->parse_data += len;
 549   work->parse_size -= len;
 550
 551   /* Read image type */
 552   pic_type = work->parse_data[0];
 553   ++work->parse_data;
 554   --work->parse_size;
 555
 556   GST_LOG ("APIC frame mime type    : %s", GST_STR_NULL (mime_str));
 557   GST_LOG ("APIC frame picture type : 0x%02x", (guint) pic_type);
 558
 559   if (work->parse_size < 1 + 1)
 560     goto not_enough_data;
 561
 562   len = scan_encoded_string (txt_encoding, (gchar *) work->parse_data,
 563       work->parse_size);
 564
 565   if (len < 1)
 566     goto error;
 567
 568   /* just skip the description string ... */
 569   GST_LOG ("Skipping description string (%d bytes in original coding)", len);
 570
 571   if (work->parse_size < len + 1)
 572     goto not_enough_data;
 573
 574   work->parse_data += len;
 575   work->parse_size -= len;
 576
 577   GST_DEBUG ("image data is %u bytes", work->parse_size);
 578
 579   if (work->parse_size <= 0)
 580     goto not_enough_data;
 581
 582   if (!gst_tag_list_add_id3_image (work->tags, (guint8 *) work->parse_data,
 583           work->parse_size, pic_type)) {
 584     goto error;
 585   }
 586
 587   g_free (mime_str);
 588   return TRUE;
 589
 590 not_enough_data:
 591   {
 592     GST_DEBUG ("not enough data, skipping APIC frame");
 593     /* fall through to error */
 594   }
 595 error:
 596   {
 597     GST_DEBUG ("problem parsing APIC frame, skipping");
 598     g_free (mime_str);
 599     return FALSE;
 600   }
 601 }
 602
 603 #define ID3V2_RVA2_CHANNEL_MASTER  1
 604
 605 static gboolean
 606 parse_relative_volume_adjustment_two (ID3TagsWorking * work)
 607 {
 608   const gchar *gain_tag_name = NULL;
 609   const gchar *peak_tag_name = NULL;
 610   gdouble gain_dB, peak_val;
 611   guint64 peak;
 612   guint8 *data, chan, peak_bits;
 613   gchar *id;
 614   gint len, datalen, i;
 615
 616   if (!parse_id_string (work, &id, &len, &datalen))
 617     return FALSE;
 618
 619   if (datalen < (1 + 2 + 1)) {
 620     GST_WARNING ("broken RVA2 frame, data size only %d bytes", datalen);
 621     g_free (id);
 622     return FALSE;
 623   }
 624
 625   data = work->parse_data + len + 1;
 626   chan = GST_READ_UINT8 (data);
 627   gain_dB = (gdouble) ((gint16) GST_READ_UINT16_BE (data + 1)) / 512.0;
 628   /* The meaning of the peak value is not defined in the ID3v2 spec. However,
 629    * the first/only implementation of this seems to have been in XMMS, and
 630    * other libs (like mutagen) seem to follow that implementation as well:
 631    * see http://bugs.xmms.org/attachment.cgi?id=113&action=view */
 632   peak_bits = GST_READ_UINT8 (data + 1 + 2);
 633   if (peak_bits > 64) {
 634     GST_WARNING ("silly peak precision of %d bits, ignoring", (gint) peak_bits);
 635     peak_bits = 0;
 636   }
 637   data += 1 + 2 + 1;
 638   datalen -= 1 + 2 + 1;
 639   if (peak_bits == 16) {
 640     peak = GST_READ_UINT16_BE (data);
 641   } else {
 642     peak = 0;
 643     for (i = 0; i < (GST_ROUND_UP_8 (peak_bits) / 8) && datalen > 0; ++i) {
 644       peak = peak << 8;
 645       peak |= GST_READ_UINT8 (data);
 646       ++data;
 647       --datalen;
 648     }
 649   }
 650
 651   peak = peak << (64 - GST_ROUND_UP_8 (peak_bits));
 652   peak_val =
 653       gst_guint64_to_gdouble (peak) / gst_util_guint64_to_gdouble (G_MAXINT64);
 654   GST_LOG ("RVA2 frame: id=%s, chan=%u, adj=%.2fdB, peak_bits=%u, peak=%.2f",
 655       id, chan, gain_dB, (guint) peak_bits, peak_val);
 656
 657   if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "track") == 0) {
 658     gain_tag_name = GST_TAG_TRACK_GAIN;
 659     peak_tag_name = GST_TAG_TRACK_PEAK;
 660   } else if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "album") == 0) {
 661     gain_tag_name = GST_TAG_ALBUM_GAIN;
 662     peak_tag_name = GST_TAG_ALBUM_PEAK;
 663   } else {
 664     GST_INFO ("Unhandled RVA2 frame id '%s' for channel %d", id, chan);
 665   }
 666
 667   if (gain_tag_name) {
 668     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 669         gain_tag_name, gain_dB, NULL);
 670   }
 671   if (peak_tag_name && peak_bits > 0) {
 672     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 673         peak_tag_name, peak_val, NULL);
 674   }
 675
 676   g_free (id);
 677
 678   return (gain_tag_name != NULL || peak_tag_name != NULL);
 679 }
 680
 681 static void
 682 parse_obsolete_tdat_frame (ID3TagsWorking * work)
 683 {
 684   if (work->parse_size >= 5 &&
 685       work->parse_data[0] == ID3V2_ENCODING_ISO8859 &&
 686       g_ascii_isdigit (work->parse_data[1]) &&
 687       g_ascii_isdigit (work->parse_data[2]) &&
 688       g_ascii_isdigit (work->parse_data[3]) &&
 689       g_ascii_isdigit (work->parse_data[4])) {
 690     work->pending_day = (10 * g_ascii_digit_value (work->parse_data[1])) +
 691         g_ascii_digit_value (work->parse_data[2]);
 692     work->pending_month = (10 * g_ascii_digit_value (work->parse_data[3])) +
 693         g_ascii_digit_value (work->parse_data[4]);
 694     GST_LOG ("date (dd/mm) %02u/%02u", work->pending_day, work->pending_month);
 695   }
 696 }
 697
 698 static gboolean
 699 id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 700     const gchar * tag_str)
 701 {
 702   GType tag_type = gst_tag_get_type (tag_name);
 703   GstTagList *tag_list = work->tags;
 704
 705   if (tag_str == NULL)
 706     return FALSE;
 707
 708   switch (tag_type) {
 709     case G_TYPE_UINT:
 710     {
 711       gint current, total;
 712
 713       if (sscanf (tag_str, "%d/%d", &current, &total) == 2) {
 714         if (total <= 0) {
 715           GST_WARNING ("Ignoring invalid value for total %d in tag %s",
 716               total, tag_name);
 717         } else {
 718           if (strcmp (tag_name, GST_TAG_TRACK_NUMBER) == 0) {
 719             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 720                 GST_TAG_TRACK_COUNT, total, NULL);
 721           } else if (strcmp (tag_name, GST_TAG_ALBUM_VOLUME_NUMBER) == 0) {
 722             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 723                 GST_TAG_ALBUM_VOLUME_COUNT, total, NULL);
 724           }
 725         }
 726       } else if (sscanf (tag_str, "%d", &current) != 1) {
 727         /* Not an integer in the string */
 728         GST_WARNING ("Tag string for tag %s does not contain an integer - "
 729             "ignoring", tag_name);
 730         break;
 731       }
 732
 733       if (current <= 0) {
 734         GST_WARNING ("Ignoring invalid value %d in tag %s", current, tag_name);
 735       } else {
 736         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND, tag_name, current,
 737             NULL);
 738       }
 739       break;
 740     }
 741     case G_TYPE_UINT64:
 742     {
 743       guint64 tmp;
 744
 745       g_assert (strcmp (tag_name, GST_TAG_DURATION) == 0);
 746       tmp = strtoul (tag_str, NULL, 10);
 747       if (tmp == 0) {
 748         break;
 749       }
 750       gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 751           GST_TAG_DURATION, tmp * 1000 * 1000, NULL);
 752       break;
 753     }
 754     case G_TYPE_STRING:{
 755       const GValue *val;
 756       guint i, num;
 757
 758       /* make sure we add each unique string only once per tag, we don't want
 759        * to have the same genre in the genre list multiple times, for example,
 760        * or the same DiscID in there twice just because it's contained in the
 761        * tag multiple times under different TXXX user tags */
 762       num = gst_tag_list_get_tag_size (tag_list, tag_name);
 763       for (i = 0; i < num; ++i) {
 764         val = gst_tag_list_get_value_index (tag_list, tag_name, i);
 765         if (val != NULL && strcmp (g_value_get_string (val), tag_str) == 0)
 766           break;
 767       }
 768       if (i == num) {
 769         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 770             tag_name, tag_str, NULL);
 771       }
 772       break;
 773     }
 774
 775     default:{
 776       gchar *tmp = NULL;
 777       GValue src = { 0, };
 778       GValue dest = { 0, };
 779
 780       /* Ensure that any date string is complete */
 781       if (tag_type == GST_TYPE_DATE) {
 782         guint year = 1901, month = 1, day = 1;
 783
 784         /* Dates can be yyyy-MM-dd, yyyy-MM or yyyy, but we need
 785          * the first type */
 786         if (sscanf (tag_str, "%04u-%02u-%02u", &year, &month, &day) == 0)
 787           break;
 788
 789         tmp = g_strdup_printf ("%04u-%02u-%02u", year, month, day);
 790         tag_str = tmp;
 791       }
 792
 793       /* handles anything else */
 794       g_value_init (&src, G_TYPE_STRING);
 795       g_value_set_string (&src, (const gchar *) tag_str);
 796       g_value_init (&dest, tag_type);
 797
 798       if (g_value_transform (&src, &dest)) {
 799         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_APPEND,
 800             tag_name, &dest, NULL);
 801       } else if (tag_type == G_TYPE_DOUBLE) {
 802         /* replaygain tags in TXXX frames ... */
 803         g_value_set_double (&dest, g_strtod (tag_str, NULL));
 804         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_KEEP,
 805             tag_name, &dest, NULL);
 806         GST_LOG ("Converted string '%s' to double %f", tag_str,
 807             g_value_get_double (&dest));
 808       } else {
 809         GST_WARNING ("Failed to transform tag from string to type '%s'",
 810             g_type_name (tag_type));
 811       }
 812
 813       g_value_unset (&src);
 814       g_value_unset (&dest);
 815       g_free (tmp);
 816       break;
 817     }
 818   }
 819
 820   return TRUE;
 821 }
 822
 823 /* Check that an array of characters contains only digits */
 824 static gboolean
 825 id3v2_are_digits (const gchar * chars, gint size)
 826 {
 827   gint i;
 828
 829   for (i = 0; i < size; i++) {
 830     if (!g_ascii_isdigit (chars[i]))
 831       return FALSE;
 832   }
 833   return TRUE;
 834 }
 835
 836 static gboolean
 837 id3v2_genre_string_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 838     const gchar * tag_str, gint len)
 839 {
 840   g_return_val_if_fail (tag_str != NULL, FALSE);
 841
 842   /* If it's a number, it might be a defined genre */
 843   if (id3v2_are_digits (tag_str, len)) {
 844     tag_str = gst_tag_id3_genre_get (strtol (tag_str, NULL, 10));
 845     return id3v2_tag_to_taglist (work, tag_name, tag_str);
 846   }
 847   /* Otherwise it might be "RX" or "CR" */
 848   if (len == 2) {
 849     if (g_ascii_strncasecmp ("rx", tag_str, len) == 0)
 850       return id3v2_tag_to_taglist (work, tag_name, "Remix");
 851
 852     if (g_ascii_strncasecmp ("cr", tag_str, len) == 0)
 853       return id3v2_tag_to_taglist (work, tag_name, "Cover");
 854   }
 855
 856   /* Otherwise it's a string */
 857   return id3v2_tag_to_taglist (work, tag_name, tag_str);
 858 }
 859
 860 static gboolean
 861 id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 862     GArray * tag_fields)
 863 {
 864   gchar *tag_str = NULL;
 865   gboolean result = FALSE;
 866   gint i;
 867
 868   for (i = 0; i < tag_fields->len; i++) {
 869     gint len;
 870
 871     tag_str = g_array_index (tag_fields, gchar *, 0);
 872     if (tag_str == NULL)
 873       continue;
 874
 875     len = strlen (tag_str);
 876     /* Only supposed to see '(n)' type numeric genre strings in ID3 <= 2.3.0
 877      * but apparently we see them in 2.4.0 sometimes too */
 878     if (TRUE || work->hdr.version <= 0x300) {   /* <= 2.3.0 */
 879       /* Check for genre numbers wrapped in parentheses, possibly
 880        * followed by a string */
 881       while (len >= 2) {
 882         gint pos;
 883         gboolean found = FALSE;
 884
 885         /* Double parenthesis ends the numeric genres, but we need
 886          * to swallow the first one so we actually output '(' */
 887         if (tag_str[0] == '(' && tag_str[1] == '(') {
 888           tag_str++;
 889           len--;
 890           break;
 891         }
 892
 893         /* If the first char is not a parenthesis, then stop
 894          * looking for parenthesised genre strings */
 895         if (tag_str[0] != '(')
 896           break;
 897
 898         for (pos = 1; pos < len; pos++) {
 899           if (tag_str[pos] == ')') {
 900             gchar *tmp_str;
 901
 902             tmp_str = g_strndup (tag_str + 1, pos - 1);
 903             result |=
 904                 id3v2_genre_string_to_taglist (work, tag_name, tmp_str,
 905                 pos - 1);
 906             g_free (tmp_str);
 907             tag_str += pos + 1;
 908             len -= pos + 1;
 909             found = TRUE;
 910             break;
 911           }
 912
 913           /* If we encounter a non-digit while searching for a closing
 914            * parenthesis, we should not try and interpret this as a
 915            * numeric genre string */
 916           if (!g_ascii_isdigit (tag_str[pos]))
 917             break;
 918         }
 919         if (!found)
 920           break;                /* There was no closing parenthesis */
 921       }
 922     }
 923
 924     if (len > 0 && tag_str != NULL)
 925       result |= id3v2_genre_string_to_taglist (work, tag_name, tag_str, len);
 926   }
 927   return result;
 928 }
 929
 930 static const gchar utf16enc[] = "UTF-16";
 931 static const gchar utf16leenc[] = "UTF-16LE";
 932 static const gchar utf16beenc[] = "UTF-16BE";
 933
 934 static gboolean
 935 find_utf16_bom (gchar * data, const gchar ** p_in_encoding)
 936 {
 937   guint16 marker = (GST_READ_UINT8 (data) << 8) | GST_READ_UINT8 (data + 1);
 938
 939   switch (marker) {
 940     case 0xFFFE:
 941       *p_in_encoding = utf16leenc;
 942       return TRUE;
 943     case 0xFEFF:
 944       *p_in_encoding = utf16beenc;
 945       return TRUE;
 946     default:
 947       break;
 948   }
 949   return FALSE;
 950 }
 951
 952 static void *
 953 string_utf8_dup (const gchar * start, const guint size)
 954 {
 955   const gchar *env;
 956   gsize bytes_read;
 957   gchar *utf8;
 958
 959   /* Should we try the charsets specified
 960    * via environment variables FIRST ? */
 961   if (g_utf8_validate (start, size, NULL)) {
 962     utf8 = g_strndup (start, size);
 963     goto beach;
 964   }
 965
 966   env = g_getenv ("GST_ID3V1_TAG_ENCODING");
 967   if (!env || *env == '\0')
 968     env = g_getenv ("GST_ID3_TAG_ENCODING");
 969   if (!env || *env == '\0')
 970     env = g_getenv ("GST_TAG_ENCODING");
 971
 972   /* Try charsets specified via the environment */
 973   if (env && *env != '\0') {
 974     gchar **c, **csets;
 975
 976     csets = g_strsplit (env, G_SEARCHPATH_SEPARATOR_S, -1);
 977
 978     for (c = csets; c && *c; ++c) {
 979       if ((utf8 =
 980               g_convert (start, size, "UTF-8", *c, &bytes_read, NULL, NULL))) {
 981         if (bytes_read == size) {
 982           GST_DEBUG ("Using charset %s to interperate id3 tags\n", *c);
 983           g_strfreev (csets);
 984           goto beach;
 985         }
 986         g_free (utf8);
 987         utf8 = NULL;
 988       }
 989     }
 990   }
 991   /* Try current locale (if not UTF-8) */
 992   if (!g_get_charset (&env)) {
 993     if ((utf8 = g_locale_to_utf8 (start, size, &bytes_read, NULL, NULL))) {
 994       if (bytes_read == size) {
 995         goto beach;
 996       }
 997       g_free (utf8);
 998       utf8 = NULL;
 999     }
1000   }
1001
1002   /* Try ISO-8859-1 */
1003   utf8 =
1004       g_convert (start, size, "UTF-8", "ISO-8859-1", &bytes_read, NULL, NULL);
1005   if (utf8 != NULL && bytes_read == size) {
1006     goto beach;
1007   }
1008
1009   g_free (utf8);
1010   return NULL;
1011
1012 beach:
1013
1014   g_strchomp (utf8);
1015
1016   return (utf8);
1017 }
1018
1019 static void
1020 parse_insert_string_field (guint8 encoding, gchar * data, gint data_size,
1021     GArray * fields)
1022 {
1023   gchar *field = NULL;
1024
1025   switch (encoding) {
1026     case ID3V2_ENCODING_UTF16:
1027     case ID3V2_ENCODING_UTF16BE:
1028     {
1029       const gchar *in_encode;
1030
1031       if (encoding == ID3V2_ENCODING_UTF16)
1032         in_encode = utf16enc;
1033       else
1034         in_encode = utf16beenc;
1035
1036       /* Sometimes we see strings with multiple BOM markers at the start.
1037        * In that case, we assume the innermost one is correct. If that fails
1038        * to produce valid UTF-8, we try the other endianness anyway */
1039       while (data_size > 2 && find_utf16_bom (data, &in_encode)) {
1040         data += 2;              /* skip BOM */
1041         data_size -= 2;
1042       }
1043
1044       field = g_convert (data, data_size, "UTF-8", in_encode, NULL, NULL, NULL);
1045
1046       if (field == NULL || g_utf8_validate (field, -1, NULL) == FALSE) {
1047         /* As a fallback, try interpreting UTF-16 in the other endianness */
1048         if (in_encode == utf16beenc)
1049           field = g_convert (data, data_size, "UTF-8", utf16leenc,
1050               NULL, NULL, NULL);
1051       }
1052     }
1053
1054       break;
1055     case ID3V2_ENCODING_ISO8859:
1056       if (g_utf8_validate (data, data_size, NULL))
1057         field = g_strndup (data, data_size);
1058       else
1059         /* field = g_convert (data, data_size, "UTF-8", "ISO-8859-1",
1060            NULL, NULL, NULL); */
1061         field = string_utf8_dup (data, data_size);
1062       break;
1063     default:
1064       field = g_strndup (data, data_size);
1065       break;
1066   }
1067
1068   if (field) {
1069     if (g_utf8_validate (field, -1, NULL)) {
1070       g_array_append_val (fields, field);
1071       return;
1072     }
1073
1074     GST_DEBUG ("%s was bad UTF-8 after conversion from encoding %d. Ignoring",
1075         field, encoding);
1076     g_free (field);
1077   }
1078 }
1079
1080 static void
1081 parse_split_strings (guint8 encoding, gchar * data, gint data_size,
1082     GArray ** out_fields)
1083 {
1084   GArray *fields = g_array_new (FALSE, TRUE, sizeof (gchar *));
1085   gint text_pos;
1086   gint prev = 0;
1087
1088   g_return_if_fail (out_fields != NULL);
1089
1090   switch (encoding) {
1091     case ID3V2_ENCODING_ISO8859:
1092       for (text_pos = 0; text_pos < data_size; text_pos++) {
1093         if (data[text_pos] == 0) {
1094           parse_insert_string_field (encoding, data + prev,
1095               text_pos - prev + 1, fields);
1096           prev = text_pos + 1;
1097         }
1098       }
1099       if (data_size - prev > 0 && data[prev] != 0x00) {
1100         parse_insert_string_field (encoding, data + prev,
1101             data_size - prev, fields);
1102       }
1103
1104       break;
1105     case ID3V2_ENCODING_UTF8:
1106       for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
1107         if (data[text_pos] == '\0') {
1108           parse_insert_string_field (encoding, data + prev,
1109               text_pos - prev + 1, fields);
1110           prev = text_pos + 1;
1111         }
1112       }
1113       if (data_size - prev > 0 && data[prev] != 0x00) {
1114         parse_insert_string_field (encoding, data + prev,
1115             data_size - prev, fields);
1116       }
1117       break;
1118     case ID3V2_ENCODING_UTF16:
1119     case ID3V2_ENCODING_UTF16BE:
1120     {
1121       /* Find '\0\0' terminator */
1122       for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
1123         if (data[text_pos] == '\0' && data[text_pos + 1] == '\0') {
1124           /* found a delimiter */
1125           parse_insert_string_field (encoding, data + prev,
1126               text_pos - prev + 2, fields);
1127           text_pos++;           /* Advance to the 2nd NULL terminator */
1128           prev = text_pos + 1;
1129           break;
1130         }
1131       }
1132       if (data_size - prev > 1 &&
1133           (data[prev] != 0x00 || data[prev + 1] != 0x00)) {
1134         /* There were 2 or more non-null chars left, convert those too */
1135         parse_insert_string_field (encoding, data + prev,
1136             data_size - prev, fields);
1137       }
1138       break;
1139     }
1140   }
1141   if (fields->len > 0)
1142     *out_fields = fields;
1143   else
1144     g_array_free (fields, TRUE);
1145 }
1146
1147 static void
1148 free_tag_strings (GArray * fields)
1149 {
1150   if (fields) {
1151     gint i;
1152     gchar *c;
1153
1154     for (i = 0; i < fields->len; i++) {
1155       c = g_array_index (fields, gchar *, i);
1156       g_free (c);
1157     }
1158     g_array_free (fields, TRUE);
1159   }
1160 }