gst/id3demux/id3v2frames.c

   1 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: t; c-basic-offset: 2 -*- */
   2 /* Copyright 2006-2008 Tim-Philipp Müller <tim centricular net>
   3  * Copyright 2005 Jan Schmidt <thaytan@mad.scientist.com>
   4  * Copyright 2002,2003 Scott Wheeler <wheeler@kde.org> (portions from taglib)
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Library General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Library General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Library General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include "config.h"
  24 #endif
  25
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include <gst/tag/tag.h>
  29 #include <gst/base/gsttypefindhelper.h>
  30
  31 #ifdef HAVE_ZLIB
  32 #include <zlib.h>
  33 #endif
  34
  35 #include "id3tags.h"
  36
  37 GST_DEBUG_CATEGORY_EXTERN (id3demux_debug);
  38 #define GST_CAT_DEFAULT (id3demux_debug)
  39
/* Forward declarations of the per-frame parsers and helpers that
 * id3demux_id3v2_parse_frame() and the other functions in this file call
 * before their definitions appear. */
static gboolean parse_comment_frame (ID3TagsWorking * work);
static gchar *parse_url_link_frame (ID3TagsWorking * work,
    const gchar ** tag_name);
static GArray *parse_text_identification_frame (ID3TagsWorking * work);
static gchar *parse_user_text_identification_frame (ID3TagsWorking * work,
    const gchar ** tag_name);
static gchar *parse_unique_file_identifier (ID3TagsWorking * work,
    const gchar ** tag_name);
static gboolean parse_relative_volume_adjustment_two (ID3TagsWorking * work);
static void parse_obsolete_tdat_frame (ID3TagsWorking * work);
static gboolean id3v2_tag_to_taglist (ID3TagsWorking * work,
    const gchar * tag_name, const gchar * tag_str);
/* Parse a single string into an array of gchar* */
static void parse_split_strings (guint8 encoding, gchar * data, gint data_size,
    GArray ** out_fields);
/* Release an array produced by parse_split_strings() */
static void free_tag_strings (GArray * fields);
static gboolean
id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
    GArray * tag_fields);
static gboolean parse_picture_frame (ID3TagsWorking * work);
  60
/* Values of the text-encoding byte that the frame parsers in this file read
 * from the start of a frame payload (parse_data[0]).  scan_encoded_string()
 * treats the ISO8859 and UTF8 variants as terminated by a single NUL byte and
 * the two UTF16 variants as terminated by a pair of NUL bytes. */
#define ID3V2_ENCODING_ISO8859 0x00
#define ID3V2_ENCODING_UTF16   0x01
#define ID3V2_ENCODING_UTF16BE 0x02
#define ID3V2_ENCODING_UTF8    0x03
  65
  66 gboolean
  67 id3demux_id3v2_parse_frame (ID3TagsWorking * work)
  68 {
  69   const gchar *tag_name;
  70   gboolean result = FALSE;
  71   gint i;
  72   guint8 *frame_data = work->hdr.frame_data;
  73   guint frame_data_size = work->cur_frame_size;
  74   gchar *tag_str = NULL;
  75   GArray *tag_fields = NULL;
  76   guint8 *uu_data = NULL;
  77
  78 #ifdef HAVE_ZLIB
  79   guint8 *uncompressed_data = NULL;
  80 #endif
  81
  82   /* Check that the frame id is valid */
  83   for (i = 0; i < 5 && work->frame_id[i] != '\0'; i++) {
  84     if (!g_ascii_isalnum (work->frame_id[i])) {
  85       GST_DEBUG ("Encountered invalid frame_id");
  86       return FALSE;
  87     }
  88   }
  89
  90   /* Can't handle encrypted frames right now (in case we ever do, we'll have
  91    * to do the decryption after the un-unsynchronisation and decompression,
  92    * not here) */
  93   if (work->frame_flags & ID3V2_FRAME_FORMAT_ENCRYPTION) {
  94     GST_WARNING ("Encrypted frames are not supported");
  95     return FALSE;
  96   }
  97
  98   tag_name = gst_tag_from_id3_tag (work->frame_id);
  99   if (tag_name == NULL &&
 100       strncmp (work->frame_id, "RVA2", 4) != 0 &&
 101       strncmp (work->frame_id, "TXXX", 4) != 0 &&
 102       strncmp (work->frame_id, "TDAT", 4) != 0 &&
 103       strncmp (work->frame_id, "UFID", 4) != 0) {
 104     return FALSE;
 105   }
 106
 107   if (work->frame_flags & (ID3V2_FRAME_FORMAT_COMPRESSION |
 108           ID3V2_FRAME_FORMAT_DATA_LENGTH_INDICATOR)) {
 109     if (work->hdr.frame_data_size <= 4)
 110       return FALSE;
 111     if (ID3V2_VER_MAJOR (work->hdr.version) == 3) {
 112       work->parse_size = GST_READ_UINT32_BE (frame_data);
 113     } else {
 114       work->parse_size = read_synch_uint (frame_data, 4);
 115     }
 116     frame_data += 4;
 117     frame_data_size -= 4;
 118     if (work->parse_size < frame_data_size) {
 119       GST_WARNING ("ID3v2 frame %s has invalid size %d.", tag_name,
 120           frame_data_size);
 121       return FALSE;
 122     }
 123   }
 124
 125   /* in v2.3 the frame sizes are not syncsafe, so the entire tag had to be
 126    * unsynced. In v2.4 the frame sizes are syncsafe so it's just the frame
 127    * data that needs un-unsyncing, but not the frame headers. */
 128   if (ID3V2_VER_MAJOR (work->hdr.version) == 4) {
 129     if ((work->hdr.flags & ID3V2_HDR_FLAG_UNSYNC) != 0 ||
 130         ((work->frame_flags & ID3V2_FRAME_FORMAT_UNSYNCHRONISATION) != 0)) {
 131       GST_DEBUG ("Un-unsyncing frame %s", work->frame_id);
 132       uu_data = id3demux_ununsync_data (frame_data, &frame_data_size);
 133       frame_data = uu_data;
 134       GST_MEMDUMP ("ID3v2 frame (un-unsyced)", frame_data, frame_data_size);
 135     }
 136   }
 137
 138   work->parse_size = frame_data_size;
 139
 140   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 141 #ifdef HAVE_ZLIB
 142     uLongf destSize = work->parse_size;
 143     Bytef *dest, *src;
 144
 145     uncompressed_data = g_malloc (work->parse_size);
 146
 147     dest = (Bytef *) uncompressed_data;
 148     src = (Bytef *) frame_data;
 149
 150     if (uncompress (dest, &destSize, src, frame_data_size) != Z_OK) {
 151       g_free (uncompressed_data);
 152       g_free (uu_data);
 153       return FALSE;
 154     }
 155     if (destSize != work->parse_size) {
 156       GST_WARNING
 157           ("Decompressing ID3v2 frame %s did not produce expected size %d bytes (got %lu)",
 158           tag_name, work->parse_size, destSize);
 159       g_free (uncompressed_data);
 160       g_free (uu_data);
 161       return FALSE;
 162     }
 163     work->parse_data = uncompressed_data;
 164 #else
 165     GST_WARNING ("Compressed ID3v2 tag frame could not be decompressed"
 166         " because gstid3demux was compiled without zlib support");
 167     g_free (uu_data);
 168     return FALSE;
 169 #endif
 170   } else {
 171     work->parse_data = frame_data;
 172   }
 173
 174   if (work->frame_id[0] == 'T') {
 175     if (strcmp (work->frame_id, "TDAT") == 0) {
 176       parse_obsolete_tdat_frame (work);
 177       result = TRUE;
 178     } else if (strcmp (work->frame_id, "TXXX") == 0) {
 179       /* Handle user text frame */
 180       tag_str = parse_user_text_identification_frame (work, &tag_name);
 181     } else {
 182       /* Text identification frame */
 183       tag_fields = parse_text_identification_frame (work);
 184     }
 185   } else if (work->frame_id[0] == 'W' && strcmp (work->frame_id, "WXXX") != 0) {
 186     /* URL link frame: ISO-8859-1 encoded, one frame per tag */
 187     tag_str = parse_url_link_frame (work, &tag_name);
 188   } else if (!strcmp (work->frame_id, "COMM")) {
 189     /* Comment */
 190     result = parse_comment_frame (work);
 191   } else if (!strcmp (work->frame_id, "APIC")) {
 192     /* Attached picture */
 193     result = parse_picture_frame (work);
 194   } else if (!strcmp (work->frame_id, "RVA2")) {
 195     /* Relative volume */
 196     result = parse_relative_volume_adjustment_two (work);
 197   } else if (!strcmp (work->frame_id, "UFID")) {
 198     /* Unique file identifier */
 199     tag_str = parse_unique_file_identifier (work, &tag_name);
 200   }
 201 #ifdef HAVE_ZLIB
 202   if (work->frame_flags & ID3V2_FRAME_FORMAT_COMPRESSION) {
 203     g_free (uncompressed_data);
 204     uncompressed_data = NULL;
 205     work->parse_data = frame_data;
 206   }
 207 #endif
 208
 209   if (tag_str != NULL) {
 210     /* g_print ("Tag %s value %s\n", tag_name, tag_str); */
 211     result = id3v2_tag_to_taglist (work, tag_name, tag_str);
 212     g_free (tag_str);
 213   }
 214   if (tag_fields != NULL) {
 215     if (strcmp (work->frame_id, "TCON") == 0) {
 216       /* Genre strings need special treatment */
 217       result |= id3v2_genre_fields_to_taglist (work, tag_name, tag_fields);
 218     } else {
 219       gint t;
 220
 221       for (t = 0; t < tag_fields->len; t++) {
 222         tag_str = g_array_index (tag_fields, gchar *, t);
 223         if (tag_str != NULL && tag_str[0] != '\0')
 224           result |= id3v2_tag_to_taglist (work, tag_name, tag_str);
 225       }
 226     }
 227     free_tag_strings (tag_fields);
 228   }
 229
 230   g_free (uu_data);
 231
 232   return result;
 233 }
 234
 235 static gboolean
 236 parse_comment_frame (ID3TagsWorking * work)
 237 {
 238   guint dummy;
 239   guint8 encoding;
 240   gchar language[4];
 241   GArray *fields = NULL;
 242   gchar *description, *text;
 243
 244   if (work->parse_size < 6)
 245     return FALSE;
 246
 247   encoding = work->parse_data[0];
 248   language[0] = g_ascii_tolower (work->parse_data[1]);
 249   language[1] = g_ascii_tolower (work->parse_data[2]);
 250   language[2] = g_ascii_tolower (work->parse_data[3]);
 251   language[3] = '\0';
 252
 253   parse_split_strings (encoding, (gchar *) work->parse_data + 4,
 254       work->parse_size - 4, &fields);
 255
 256   if (fields == NULL || fields->len < 2) {
 257     GST_WARNING ("Failed to decode comment frame");
 258     goto fail;
 259   }
 260   description = g_array_index (fields, gchar *, 0);
 261   text = g_array_index (fields, gchar *, 1);
 262
 263   if (!g_utf8_validate (text, -1, NULL)) {
 264     GST_WARNING ("Converted string is not valid utf-8");
 265     goto fail;
 266   }
 267
 268   /* skip our own dummy descriptions (from id3v2mux) */
 269   if (strlen (description) > 0 && g_utf8_validate (description, -1, NULL) &&
 270       sscanf (description, "c%u", &dummy) != 1) {
 271     gchar *s;
 272
 273     /* must be either an ISO-639-1 or ISO-639-2 language code */
 274     if (language[0] != '\0' &&
 275         g_ascii_isalpha (language[0]) &&
 276         g_ascii_isalpha (language[1]) &&
 277         (g_ascii_isalpha (language[2]) || language[2] == '\0')) {
 278       s = g_strdup_printf ("%s[%s]=%s", description, language, text);
 279     } else {
 280       s = g_strdup_printf ("%s=%s", description, text);
 281     }
 282     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 283         GST_TAG_EXTENDED_COMMENT, s, NULL);
 284     g_free (s);
 285   } else if (text != NULL && *text != '\0') {
 286     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 287         GST_TAG_COMMENT, text, NULL);
 288   } else {
 289     goto fail;
 290   }
 291
 292   free_tag_strings (fields);
 293   return TRUE;
 294
 295 fail:
 296   {
 297     GST_WARNING ("failed to parse COMM frame");
 298     free_tag_strings (fields);
 299     return FALSE;
 300   }
 301 }
 302
 303 static GArray *
 304 parse_text_identification_frame (ID3TagsWorking * work)
 305 {
 306   guchar encoding;
 307   GArray *fields = NULL;
 308
 309   if (work->parse_size < 2)
 310     return NULL;
 311
 312   encoding = work->parse_data[0];
 313   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 314       work->parse_size - 1, &fields);
 315   if (fields) {
 316     if (fields->len > 0) {
 317       GST_LOG ("Read %d fields from Text ID frame of size %d with encoding %d"
 318           ". First is '%s'", fields->len, work->parse_size - 1, encoding,
 319           g_array_index (fields, gchar *, 0));
 320     } else {
 321       GST_LOG ("Read 0 fields from Text ID frame of size %d with encoding %d",
 322           work->parse_size - 1, encoding);
 323     }
 324   }
 325
 326   return fields;
 327 }
 328
 329 static gboolean
 330 link_is_known_license (const gchar * url)
 331 {
 332   return g_str_has_prefix (url, "http://creativecommons.org/licenses/");
 333 }
 334
 335 static gchar *
 336 parse_url_link_frame (ID3TagsWorking * work, const gchar ** tag_name)
 337 {
 338   gsize len;
 339   gchar *nul, *data, *link;
 340
 341   *tag_name = NULL;
 342
 343   if (work->parse_size == 0)
 344     return NULL;
 345
 346   data = (gchar *) work->parse_data;
 347   /* if there's more data then the string is long, we only want to parse the
 348    * data up to the terminating zero to g_convert and ignore the rest, as
 349    * per spec */
 350   nul = memchr (data, '\0', work->parse_size);
 351   if (nul != NULL) {
 352     len = (gsize) (nul - data);
 353   } else {
 354     len = work->parse_size;
 355   }
 356
 357   link = g_convert (data, len, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
 358
 359   if (link == NULL || !gst_uri_is_valid (link)) {
 360     GST_DEBUG ("Invalid URI in %s frame: %s", work->frame_id,
 361         GST_STR_NULL (link));
 362     g_free (link);
 363     return NULL;
 364   }
 365
 366   /* we don't know if it's a link to a page that explains the copyright
 367    * situation, or a link that points to/represents a license, the ID3 spec
 368    * does not separate those two things; for now only put known license URIs
 369    * into GST_TAG_LICENSE_URI and everything else into GST_TAG_COPYRIGHT_URI */
 370   if (strcmp (work->frame_id, "WCOP") == 0) {
 371     if (link_is_known_license (link))
 372       *tag_name = GST_TAG_LICENSE_URI;
 373     else
 374       *tag_name = GST_TAG_COPYRIGHT_URI;
 375   } else if (strcmp (work->frame_id, "WOAF") == 0) {
 376     /* can't be bothered to create a CONTACT_URI tag for this, so let's just
 377      * put into into GST_TAG_CONTACT, which is where it ends up when reading
 378      * the info from vorbis comments as well */
 379     *tag_name = GST_TAG_CONTACT;
 380   }
 381
 382   return link;
 383 }
 384
 385
 386 static gchar *
 387 parse_user_text_identification_frame (ID3TagsWorking * work,
 388     const gchar ** tag_name)
 389 {
 390   gchar *ret;
 391   guchar encoding;
 392   GArray *fields = NULL;
 393
 394   *tag_name = NULL;
 395
 396   if (work->parse_size < 2)
 397     return NULL;
 398
 399   encoding = work->parse_data[0];
 400
 401   parse_split_strings (encoding, (gchar *) work->parse_data + 1,
 402       work->parse_size - 1, &fields);
 403
 404   if (fields == NULL)
 405     return NULL;
 406
 407   if (fields->len != 2) {
 408     GST_WARNING ("Expected 2 fields in TXXX frame, but got %d", fields->len);
 409     free_tag_strings (fields);
 410     return NULL;
 411   }
 412
 413   *tag_name =
 414       gst_tag_from_id3_user_tag ("TXXX", g_array_index (fields, gchar *, 0));
 415
 416   GST_LOG ("TXXX frame of size %d. Mapped descriptor '%s' to GStreamer tag %s",
 417       work->parse_size - 1, g_array_index (fields, gchar *, 0),
 418       GST_STR_NULL (*tag_name));
 419
 420   if (*tag_name) {
 421     ret = g_strdup (g_array_index (fields, gchar *, 1));
 422     /* GST_LOG ("%s = %s", *tag_name, GST_STR_NULL (ret)); */
 423   } else {
 424     ret = NULL;
 425   }
 426
 427   free_tag_strings (fields);
 428   return ret;
 429 }
 430
 431 static gboolean
 432 parse_id_string (ID3TagsWorking * work, gchar ** p_str, gint * p_len,
 433     gint * p_datalen)
 434 {
 435   gint len, datalen;
 436
 437   if (work->parse_size < 2)
 438     return FALSE;
 439
 440   for (len = 0; len < work->parse_size - 1; ++len) {
 441     if (work->parse_data[len] == '\0')
 442       break;
 443   }
 444
 445   datalen = work->parse_size - (len + 1);
 446   if (len == 0 || datalen <= 0)
 447     return FALSE;
 448
 449   *p_str = g_strndup ((gchar *) work->parse_data, len);
 450   *p_len = len;
 451   *p_datalen = datalen;
 452
 453   return TRUE;
 454 }
 455
 456 static gchar *
 457 parse_unique_file_identifier (ID3TagsWorking * work, const gchar ** tag_name)
 458 {
 459   gint len, datalen;
 460   gchar *owner_id, *data, *ret = NULL;
 461
 462   GST_LOG ("parsing UFID frame of size %d", work->parse_size);
 463
 464   if (!parse_id_string (work, &owner_id, &len, &datalen))
 465     return NULL;
 466
 467   data = (gchar *) work->parse_data + len + 1;
 468   GST_LOG ("UFID owner ID: %s (+ %d bytes of data)", owner_id, datalen);
 469
 470   if (strcmp (owner_id, "http://musicbrainz.org") == 0 &&
 471       g_utf8_validate (data, datalen, NULL)) {
 472     *tag_name = GST_TAG_MUSICBRAINZ_TRACKID;
 473     ret = g_strndup (data, datalen);
 474   } else {
 475     GST_INFO ("Unknown UFID owner ID: %s", owner_id);
 476   }
 477   g_free (owner_id);
 478
 479   return ret;
 480 }
 481
 482 /* parse data and return length of the next string in the given encoding,
 483  * including the NUL terminator */
 484 static gint
 485 scan_encoded_string (guint8 encoding, gchar * data, gint data_size)
 486 {
 487   gint i;
 488
 489   switch (encoding) {
 490     case ID3V2_ENCODING_ISO8859:
 491     case ID3V2_ENCODING_UTF8:
 492       for (i = 0; i < data_size; ++i) {
 493         if (data[i] == '\0')
 494           return i + 1;
 495       }
 496       break;
 497     case ID3V2_ENCODING_UTF16:
 498     case ID3V2_ENCODING_UTF16BE:
 499       /* we don't care about BOMs here and treat them as part of the string */
 500       /* Find '\0\0' terminator */
 501       for (i = 0; i < data_size - 1; i += 2) {
 502         if (data[i] == '\0' && data[i + 1] == '\0')
 503           return i + 2;
 504       }
 505       break;
 506     default:
 507       break;
 508   }
 509
 510   return 0;
 511 }
 512
 513 static gboolean
 514 parse_picture_frame (ID3TagsWorking * work)
 515 {
 516   guint8 txt_encoding, pic_type;
 517   gchar *mime_str = NULL;
 518   gint len, datalen;
 519
 520   GST_LOG ("APIC frame (ID3v2.%u)", ID3V2_VER_MAJOR (work->hdr.version));
 521
 522   if (work->parse_size < 1 + 1 + 1 + 1 + 1)
 523     goto not_enough_data;
 524
 525   txt_encoding = work->parse_data[0];
 526   ++work->parse_data;
 527   --work->parse_size;
 528
 529   /* Read image format; in early ID3v2 versions this is a fixed-length
 530    * 3-character string without terminator; in later versions (>= 2.3.0)
 531    * this is a NUL-terminated string of variable length */
 532   if (ID3V2_VER_MAJOR (work->hdr.version) < 3) {
 533     if (work->parse_size < 3)
 534       goto not_enough_data;
 535
 536     mime_str = g_strndup ((gchar *) work->parse_data, 3);
 537     len = 3;
 538   } else {
 539     if (!parse_id_string (work, &mime_str, &len, &datalen))
 540       return FALSE;
 541     ++len;                      /* for string terminator */
 542   }
 543
 544   if (work->parse_size < len + 1 + 1 + 1)
 545     goto not_enough_data;
 546
 547   work->parse_data += len;
 548   work->parse_size -= len;
 549
 550   /* Read image type */
 551   pic_type = work->parse_data[0];
 552   ++work->parse_data;
 553   --work->parse_size;
 554
 555   GST_LOG ("APIC frame mime type    : %s", GST_STR_NULL (mime_str));
 556   GST_LOG ("APIC frame picture type : 0x%02x", (guint) pic_type);
 557
 558   if (work->parse_size < 1 + 1)
 559     goto not_enough_data;
 560
 561   len = scan_encoded_string (txt_encoding, (gchar *) work->parse_data,
 562       work->parse_size);
 563
 564   if (len < 1)
 565     goto error;
 566
 567   /* just skip the description string ... */
 568   GST_LOG ("Skipping description string (%d bytes in original coding)", len);
 569
 570   if (work->parse_size < len + 1)
 571     goto not_enough_data;
 572
 573   work->parse_data += len;
 574   work->parse_size -= len;
 575
 576   GST_DEBUG ("image data is %u bytes", work->parse_size);
 577
 578   if (work->parse_size <= 0)
 579     goto not_enough_data;
 580
 581   if (!gst_tag_list_add_id3_image (work->tags, (guint8 *) work->parse_data,
 582           work->parse_size, pic_type)) {
 583     goto error;
 584   }
 585
 586   g_free (mime_str);
 587   return TRUE;
 588
 589 not_enough_data:
 590   {
 591     GST_DEBUG ("not enough data, skipping APIC frame");
 592     /* fall through to error */
 593   }
 594 error:
 595   {
 596     GST_DEBUG ("problem parsing APIC frame, skipping");
 597     g_free (mime_str);
 598     return FALSE;
 599   }
 600 }
 601
 602 #define ID3V2_RVA2_CHANNEL_MASTER  1
 603
 604 static gboolean
 605 parse_relative_volume_adjustment_two (ID3TagsWorking * work)
 606 {
 607   const gchar *gain_tag_name = NULL;
 608   const gchar *peak_tag_name = NULL;
 609   gdouble gain_dB, peak_val;
 610   guint64 peak;
 611   guint8 *data, chan, peak_bits;
 612   gchar *id;
 613   gint len, datalen, i;
 614
 615   if (!parse_id_string (work, &id, &len, &datalen))
 616     return FALSE;
 617
 618   if (datalen < (1 + 2 + 1)) {
 619     GST_WARNING ("broken RVA2 frame, data size only %d bytes", datalen);
 620     g_free (id);
 621     return FALSE;
 622   }
 623
 624   data = work->parse_data + len + 1;
 625   chan = GST_READ_UINT8 (data);
 626   gain_dB = (gdouble) ((gint16) GST_READ_UINT16_BE (data + 1)) / 512.0;
 627   /* The meaning of the peak value is not defined in the ID3v2 spec. However,
 628    * the first/only implementation of this seems to have been in XMMS, and
 629    * other libs (like mutagen) seem to follow that implementation as well:
 630    * see http://bugs.xmms.org/attachment.cgi?id=113&action=view */
 631   peak_bits = GST_READ_UINT8 (data + 1 + 2);
 632   if (peak_bits > 64) {
 633     GST_WARNING ("silly peak precision of %d bits, ignoring", (gint) peak_bits);
 634     peak_bits = 0;
 635   }
 636   data += 1 + 2 + 1;
 637   datalen -= 1 + 2 + 1;
 638   if (peak_bits == 16) {
 639     peak = GST_READ_UINT16_BE (data);
 640   } else {
 641     peak = 0;
 642     for (i = 0; i < (GST_ROUND_UP_8 (peak_bits) / 8) && datalen > 0; ++i) {
 643       peak = peak << 8;
 644       peak |= GST_READ_UINT8 (data);
 645       ++data;
 646       --datalen;
 647     }
 648   }
 649
 650   peak = peak << (64 - GST_ROUND_UP_8 (peak_bits));
 651   peak_val =
 652       gst_guint64_to_gdouble (peak) / gst_util_guint64_to_gdouble (G_MAXINT64);
 653   GST_LOG ("RVA2 frame: id=%s, chan=%u, adj=%.2fdB, peak_bits=%u, peak=%.2f",
 654       id, chan, gain_dB, (guint) peak_bits, peak_val);
 655
 656   if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "track") == 0) {
 657     gain_tag_name = GST_TAG_TRACK_GAIN;
 658     peak_tag_name = GST_TAG_TRACK_PEAK;
 659   } else if (chan == ID3V2_RVA2_CHANNEL_MASTER && strcmp (id, "album") == 0) {
 660     gain_tag_name = GST_TAG_ALBUM_GAIN;
 661     peak_tag_name = GST_TAG_ALBUM_PEAK;
 662   } else {
 663     GST_INFO ("Unhandled RVA2 frame id '%s' for channel %d", id, chan);
 664   }
 665
 666   if (gain_tag_name) {
 667     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 668         gain_tag_name, gain_dB, NULL);
 669   }
 670   if (peak_tag_name && peak_bits > 0) {
 671     gst_tag_list_add (work->tags, GST_TAG_MERGE_APPEND,
 672         peak_tag_name, peak_val, NULL);
 673   }
 674
 675   g_free (id);
 676
 677   return (gain_tag_name != NULL || peak_tag_name != NULL);
 678 }
 679
 680 static void
 681 parse_obsolete_tdat_frame (ID3TagsWorking * work)
 682 {
 683   if (work->parse_size >= 5 &&
 684       work->parse_data[0] == ID3V2_ENCODING_ISO8859 &&
 685       g_ascii_isdigit (work->parse_data[1]) &&
 686       g_ascii_isdigit (work->parse_data[2]) &&
 687       g_ascii_isdigit (work->parse_data[3]) &&
 688       g_ascii_isdigit (work->parse_data[4])) {
 689     work->pending_day = (10 * g_ascii_digit_value (work->parse_data[1])) +
 690         g_ascii_digit_value (work->parse_data[2]);
 691     work->pending_month = (10 * g_ascii_digit_value (work->parse_data[3])) +
 692         g_ascii_digit_value (work->parse_data[4]);
 693     GST_LOG ("date (dd/mm) %02u/%02u", work->pending_day, work->pending_month);
 694   }
 695 }
 696
 697 static gboolean
 698 id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 699     const gchar * tag_str)
 700 {
 701   GType tag_type = gst_tag_get_type (tag_name);
 702   GstTagList *tag_list = work->tags;
 703
 704   if (tag_str == NULL)
 705     return FALSE;
 706
 707   switch (tag_type) {
 708     case G_TYPE_UINT:
 709     {
 710       gint current, total;
 711
 712       if (sscanf (tag_str, "%d/%d", &current, &total) == 2) {
 713         if (total <= 0) {
 714           GST_WARNING ("Ignoring invalid value for total %d in tag %s",
 715               total, tag_name);
 716         } else {
 717           if (strcmp (tag_name, GST_TAG_TRACK_NUMBER) == 0) {
 718             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 719                 GST_TAG_TRACK_COUNT, total, NULL);
 720           } else if (strcmp (tag_name, GST_TAG_ALBUM_VOLUME_NUMBER) == 0) {
 721             gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 722                 GST_TAG_ALBUM_VOLUME_COUNT, total, NULL);
 723           }
 724         }
 725       } else if (sscanf (tag_str, "%d", &current) != 1) {
 726         /* Not an integer in the string */
 727         GST_WARNING ("Tag string for tag %s does not contain an integer - "
 728             "ignoring", tag_name);
 729         break;
 730       }
 731
 732       if (current <= 0) {
 733         GST_WARNING ("Ignoring invalid value %d in tag %s", current, tag_name);
 734       } else {
 735         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND, tag_name, current,
 736             NULL);
 737       }
 738       break;
 739     }
 740     case G_TYPE_UINT64:
 741     {
 742       guint64 tmp;
 743
 744       g_assert (strcmp (tag_name, GST_TAG_DURATION) == 0);
 745       tmp = strtoul (tag_str, NULL, 10);
 746       if (tmp == 0) {
 747         break;
 748       }
 749       gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 750           GST_TAG_DURATION, tmp * 1000 * 1000, NULL);
 751       break;
 752     }
 753     case G_TYPE_STRING:{
 754       const GValue *val;
 755       guint i, num;
 756
 757       /* make sure we add each unique string only once per tag, we don't want
 758        * to have the same genre in the genre list multiple times, for example,
 759        * or the same DiscID in there twice just because it's contained in the
 760        * tag multiple times under different TXXX user tags */
 761       num = gst_tag_list_get_tag_size (tag_list, tag_name);
 762       for (i = 0; i < num; ++i) {
 763         val = gst_tag_list_get_value_index (tag_list, tag_name, i);
 764         if (val != NULL && strcmp (g_value_get_string (val), tag_str) == 0)
 765           break;
 766       }
 767       if (i == num) {
 768         gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
 769             tag_name, tag_str, NULL);
 770       }
 771       break;
 772     }
 773
 774     default:{
 775       gchar *tmp = NULL;
 776       GValue src = { 0, };
 777       GValue dest = { 0, };
 778
 779       /* Ensure that any date string is complete */
 780       if (tag_type == GST_TYPE_DATE) {
 781         guint year = 1901, month = 1, day = 1;
 782
 783         /* Dates can be yyyy-MM-dd, yyyy-MM or yyyy, but we need
 784          * the first type */
 785         if (sscanf (tag_str, "%04u-%02u-%02u", &year, &month, &day) == 0)
 786           break;
 787
 788         tmp = g_strdup_printf ("%04u-%02u-%02u", year, month, day);
 789         tag_str = tmp;
 790       }
 791
 792       /* handles anything else */
 793       g_value_init (&src, G_TYPE_STRING);
 794       g_value_set_string (&src, (const gchar *) tag_str);
 795       g_value_init (&dest, tag_type);
 796
 797       if (g_value_transform (&src, &dest)) {
 798         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_APPEND,
 799             tag_name, &dest, NULL);
 800       } else if (tag_type == G_TYPE_DOUBLE) {
 801         /* replaygain tags in TXXX frames ... */
 802         g_value_set_double (&dest, g_strtod (tag_str, NULL));
 803         gst_tag_list_add_values (tag_list, GST_TAG_MERGE_KEEP,
 804             tag_name, &dest, NULL);
 805         GST_LOG ("Converted string '%s' to double %f", tag_str,
 806             g_value_get_double (&dest));
 807       } else {
 808         GST_WARNING ("Failed to transform tag from string to type '%s'",
 809             g_type_name (tag_type));
 810       }
 811
 812       g_value_unset (&src);
 813       g_value_unset (&dest);
 814       g_free (tmp);
 815       break;
 816     }
 817   }
 818
 819   return TRUE;
 820 }
 821
 822 /* Check that an array of characters contains only digits */
 823 static gboolean
 824 id3v2_are_digits (const gchar * chars, gint size)
 825 {
 826   gint i;
 827
 828   for (i = 0; i < size; i++) {
 829     if (!g_ascii_isdigit (chars[i]))
 830       return FALSE;
 831   }
 832   return TRUE;
 833 }
 834
 835 static gboolean
 836 id3v2_genre_string_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 837     const gchar * tag_str, gint len)
 838 {
 839   g_return_val_if_fail (tag_str != NULL, FALSE);
 840
 841   /* If it's a number, it might be a defined genre */
 842   if (id3v2_are_digits (tag_str, len)) {
 843     tag_str = gst_tag_id3_genre_get (strtol (tag_str, NULL, 10));
 844     return id3v2_tag_to_taglist (work, tag_name, tag_str);
 845   }
 846   /* Otherwise it might be "RX" or "CR" */
 847   if (len == 2) {
 848     if (g_ascii_strncasecmp ("rx", tag_str, len) == 0)
 849       return id3v2_tag_to_taglist (work, tag_name, "Remix");
 850
 851     if (g_ascii_strncasecmp ("cr", tag_str, len) == 0)
 852       return id3v2_tag_to_taglist (work, tag_name, "Cover");
 853   }
 854
 855   /* Otherwise it's a string */
 856   return id3v2_tag_to_taglist (work, tag_name, tag_str);
 857 }
 858
 859 static gboolean
 860 id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
 861     GArray * tag_fields)
 862 {
 863   gchar *tag_str = NULL;
 864   gboolean result = FALSE;
 865   gint i;
 866
 867   for (i = 0; i < tag_fields->len; i++) {
 868     gint len;
 869
 870     tag_str = g_array_index (tag_fields, gchar *, 0);
 871     if (tag_str == NULL)
 872       continue;
 873
 874     len = strlen (tag_str);
 875     /* Only supposed to see '(n)' type numeric genre strings in ID3 <= 2.3.0
 876      * but apparently we see them in 2.4.0 sometimes too */
 877     if (TRUE || work->hdr.version <= 0x300) {   /* <= 2.3.0 */
 878       /* Check for genre numbers wrapped in parentheses, possibly
 879        * followed by a string */
 880       while (len >= 2) {
 881         gint pos;
 882         gboolean found = FALSE;
 883
 884         /* Double parenthesis ends the numeric genres, but we need
 885          * to swallow the first one so we actually output '(' */
 886         if (tag_str[0] == '(' && tag_str[1] == '(') {
 887           tag_str++;
 888           len--;
 889           break;
 890         }
 891
 892         /* If the first char is not a parenthesis, then stop
 893          * looking for parenthesised genre strings */
 894         if (tag_str[0] != '(')
 895           break;
 896
 897         for (pos = 1; pos < len; pos++) {
 898           if (tag_str[pos] == ')') {
 899             gchar *tmp_str;
 900
 901             tmp_str = g_strndup (tag_str + 1, pos - 1);
 902             result |=
 903                 id3v2_genre_string_to_taglist (work, tag_name, tmp_str,
 904                 pos - 1);
 905             g_free (tmp_str);
 906             tag_str += pos + 1;
 907             len -= pos + 1;
 908             found = TRUE;
 909             break;
 910           }
 911
 912           /* If we encounter a non-digit while searching for a closing
 913            * parenthesis, we should not try and interpret this as a
 914            * numeric genre string */
 915           if (!g_ascii_isdigit (tag_str[pos]))
 916             break;
 917         }
 918         if (!found)
 919           break;                /* There was no closing parenthesis */
 920       }
 921     }
 922
 923     if (len > 0 && tag_str != NULL)
 924       result |= id3v2_genre_string_to_taglist (work, tag_name, tag_str, len);
 925   }
 926   return result;
 927 }
 928
/* Charset names passed to g_convert() as the source encoding when decoding
 * UTF-16 text. Code in this file compares an encoding pointer against
 * utf16beenc by address, so refer to these arrays rather than repeating the
 * string literals. */
static const gchar utf16enc[] = "UTF-16";       /* byte order not specified */
static const gchar utf16leenc[] = "UTF-16LE";   /* little-endian */
static const gchar utf16beenc[] = "UTF-16BE";   /* big-endian */
 932
 933 static gboolean
 934 find_utf16_bom (gchar * data, const gchar ** p_in_encoding)
 935 {
 936   guint16 marker = (GST_READ_UINT8 (data) << 8) | GST_READ_UINT8 (data + 1);
 937
 938   switch (marker) {
 939     case 0xFFFE:
 940       *p_in_encoding = utf16leenc;
 941       return TRUE;
 942     case 0xFEFF:
 943       *p_in_encoding = utf16beenc;
 944       return TRUE;
 945     default:
 946       break;
 947   }
 948   return FALSE;
 949 }
 950
 951 static void *
 952 string_utf8_dup (const gchar * start, const guint size)
 953 {
 954   const gchar *env;
 955   gsize bytes_read;
 956   gchar *utf8;
 957
 958   /* Should we try the charsets specified
 959    * via environment variables FIRST ? */
 960   if (g_utf8_validate (start, size, NULL)) {
 961     utf8 = g_strndup (start, size);
 962     goto beach;
 963   }
 964
 965   env = g_getenv ("GST_ID3V1_TAG_ENCODING");
 966   if (!env || *env == '\0')
 967     env = g_getenv ("GST_ID3_TAG_ENCODING");
 968   if (!env || *env == '\0')
 969     env = g_getenv ("GST_TAG_ENCODING");
 970
 971   /* Try charsets specified via the environment */
 972   if (env && *env != '\0') {
 973     gchar **c, **csets;
 974
 975     csets = g_strsplit (env, G_SEARCHPATH_SEPARATOR_S, -1);
 976
 977     for (c = csets; c && *c; ++c) {
 978       if ((utf8 =
 979               g_convert (start, size, "UTF-8", *c, &bytes_read, NULL, NULL))) {
 980         if (bytes_read == size) {
 981           GST_DEBUG ("Using charset %s to interperate id3 tags\n", c);
 982           g_strfreev (csets);
 983           goto beach;
 984         }
 985         g_free (utf8);
 986         utf8 = NULL;
 987       }
 988     }
 989   }
 990   /* Try current locale (if not UTF-8) */
 991   if (!g_get_charset (&env)) {
 992     if ((utf8 = g_locale_to_utf8 (start, size, &bytes_read, NULL, NULL))) {
 993       if (bytes_read == size) {
 994         goto beach;
 995       }
 996       g_free (utf8);
 997       utf8 = NULL;
 998     }
 999   }
1000
1001   /* Try ISO-8859-1 */
1002   utf8 =
1003       g_convert (start, size, "UTF-8", "ISO-8859-1", &bytes_read, NULL, NULL);
1004   if (utf8 != NULL && bytes_read == size) {
1005     goto beach;
1006   }
1007
1008   g_free (utf8);
1009   return NULL;
1010
1011 beach:
1012
1013   g_strchomp (utf8);
1014
1015   return (utf8);
1016 }
1017
1018 static void
1019 parse_insert_string_field (guint8 encoding, gchar * data, gint data_size,
1020     GArray * fields)
1021 {
1022   gchar *field = NULL;
1023
1024   switch (encoding) {
1025     case ID3V2_ENCODING_UTF16:
1026     case ID3V2_ENCODING_UTF16BE:
1027     {
1028       const gchar *in_encode;
1029
1030       if (encoding == ID3V2_ENCODING_UTF16)
1031         in_encode = utf16enc;
1032       else
1033         in_encode = utf16beenc;
1034
1035       /* Sometimes we see strings with multiple BOM markers at the start.
1036        * In that case, we assume the innermost one is correct. If that fails
1037        * to produce valid UTF-8, we try the other endianness anyway */
1038       while (data_size > 2 && find_utf16_bom (data, &in_encode)) {
1039         data += 2;              /* skip BOM */
1040         data_size -= 2;
1041       }
1042
1043       field = g_convert (data, data_size, "UTF-8", in_encode, NULL, NULL, NULL);
1044
1045       if (field == NULL || g_utf8_validate (field, -1, NULL) == FALSE) {
1046         /* As a fallback, try interpreting UTF-16 in the other endianness */
1047         if (in_encode == utf16beenc)
1048           field = g_convert (data, data_size, "UTF-8", utf16leenc,
1049               NULL, NULL, NULL);
1050       }
1051     }
1052
1053       break;
1054     case ID3V2_ENCODING_ISO8859:
1055       if (g_utf8_validate (data, data_size, NULL))
1056         field = g_strndup (data, data_size);
1057       else
1058         /* field = g_convert (data, data_size, "UTF-8", "ISO-8859-1",
1059            NULL, NULL, NULL); */
1060         field = string_utf8_dup (data, data_size);
1061       break;
1062     default:
1063       field = g_strndup (data, data_size);
1064       break;
1065   }
1066
1067   if (field) {
1068     if (g_utf8_validate (field, -1, NULL)) {
1069       g_array_append_val (fields, field);
1070       return;
1071     }
1072
1073     GST_DEBUG ("%s was bad UTF-8 after conversion from encoding %d. Ignoring",
1074         field, encoding);
1075     g_free (field);
1076   }
1077 }
1078
1079 static void
1080 parse_split_strings (guint8 encoding, gchar * data, gint data_size,
1081     GArray ** out_fields)
1082 {
1083   GArray *fields = g_array_new (FALSE, TRUE, sizeof (gchar *));
1084   gint text_pos;
1085   gint prev = 0;
1086
1087   g_return_if_fail (out_fields != NULL);
1088
1089   switch (encoding) {
1090     case ID3V2_ENCODING_ISO8859:
1091       for (text_pos = 0; text_pos < data_size; text_pos++) {
1092         if (data[text_pos] == 0) {
1093           parse_insert_string_field (encoding, data + prev,
1094               text_pos - prev + 1, fields);
1095           prev = text_pos + 1;
1096         }
1097       }
1098       if (data_size - prev > 0 && data[prev] != 0x00) {
1099         parse_insert_string_field (encoding, data + prev,
1100             data_size - prev, fields);
1101       }
1102
1103       break;
1104     case ID3V2_ENCODING_UTF8:
1105       for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
1106         if (data[text_pos] == '\0') {
1107           parse_insert_string_field (encoding, data + prev,
1108               text_pos - prev + 1, fields);
1109           prev = text_pos + 1;
1110         }
1111       }
1112       if (data_size - prev > 0 && data[prev] != 0x00) {
1113         parse_insert_string_field (encoding, data + prev,
1114             data_size - prev, fields);
1115       }
1116       break;
1117     case ID3V2_ENCODING_UTF16:
1118     case ID3V2_ENCODING_UTF16BE:
1119     {
1120       /* Find '\0\0' terminator */
1121       for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
1122         if (data[text_pos] == '\0' && data[text_pos + 1] == '\0') {
1123           /* found a delimiter */
1124           parse_insert_string_field (encoding, data + prev,
1125               text_pos - prev + 2, fields);
1126           text_pos++;           /* Advance to the 2nd NULL terminator */
1127           prev = text_pos + 1;
1128           break;
1129         }
1130       }
1131       if (data_size - prev > 1 &&
1132           (data[prev] != 0x00 || data[prev + 1] != 0x00)) {
1133         /* There were 2 or more non-null chars left, convert those too */
1134         parse_insert_string_field (encoding, data + prev,
1135             data_size - prev, fields);
1136       }
1137       break;
1138     }
1139   }
1140   if (fields->len > 0)
1141     *out_fields = fields;
1142   else
1143     g_array_free (fields, TRUE);
1144 }
1145
1146 static void
1147 free_tag_strings (GArray * fields)
1148 {
1149   if (fields) {
1150     gint i;
1151     gchar *c;
1152
1153     for (i = 0; i < fields->len; i++) {
1154       c = g_array_index (fields, gchar *, i);
1155       g_free (c);
1156     }
1157     g_array_free (fields, TRUE);
1158   }
1159 }