src/unicode.c

   1 /**
   2  * \file unicode.c
   3  *
   4  * This file contains general Unicode string manipulation functions.
   5  * It mainly consists of functions for converting between UCS-2 (used on
   6  * the devices) and UTF-8 (used by several applications).
   7  *
   8  * For a deeper understanding of Unicode encoding formats see the
   9  * Wikipedia entries for
  10  * <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
  11  * and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
  12  *
  13  * Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se>
  14  *
  15  * This library is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU Lesser General Public
  17  * License as published by the Free Software Foundation; either
  18  * version 2 of the License, or (at your option) any later version.
  19  *
  20  * This library is distributed in the hope that it will be useful,
  21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23  * Lesser General Public License for more details.
  24  *
  25  * You should have received a copy of the GNU Lesser General Public
  26  * License along with this library; if not, write to the
  27  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  28  * Boston, MA 02111-1307, USA.
  29  *
  30  */
  31
  32 #include "config.h"
  33
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #ifdef HAVE_ICONV
  37 #include "iconv.h"
  38 #else
  39 #error "libmtp unicode.c needs fixing to work without iconv()!"
  40 #endif
  41 #include "libmtp.h"
  42 #include "unicode.h"
  43 #include "util.h"
  44 #include "ptp.h"
  45
   46 /**
   47  * Capacity (in characters) of the temporary stack buffers used by the conversion routines in this file; the output buffers are sized as multiples of it.
   48  */
   49 #define STRING_BUFFER_LENGTH 1024
  50
  51 /**
  52  * Gets the length (in characters, not bytes) of a unicode
  53  * UCS-2 string, eg a string which physically is 0x00 0x41 0x00 0x00
  54  * will return a value of 1.
  55  *
  56  * @param unicstr a UCS-2 Unicode string
  57  * @return the length of the string, in number of characters. If you
  58  *         want to know the length in bytes, multiply this by two and
  59  *         add two (for zero terminator).
  60  */
  61 int ucs2_strlen(uint16_t const * const unicstr)
  62 {
  63   int length;
  64
  65   /* Unicode strings are terminated with 2 * 0x00 */
  66   for(length = 0; unicstr[length] != 0x0000U; length ++);
  67   return length;
  68 }
  69
  70 /**
  71  * Converts a big-endian UTF-16 2-byte string
  72  * to a UTF-8 string. Actually just a UCS-2 internal conversion
  73  * routine that strips off the BOM if there is one.
  74  *
  75  * @param device a pointer to the current device.
  76  * @param unicstr the UTF-16 unicode string to convert
  77  * @return a UTF-8 string.
  78  */
  79 char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
  80 {
  81   PTPParams *params = (PTPParams *) device->params;
  82   char *stringp = (char *) unicstr;
  83   char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
  84   char *locp = loclstr;
  85   size_t nconv;
  86   size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
  87   size_t convmax = STRING_BUFFER_LENGTH*3;
  88
  89   loclstr[0]='\0';
  90   /* Do the conversion.  */
  91   nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
  92   if (nconv == (size_t) -1) {
  93     // Return partial string anyway.
  94     *locp = '\0';
  95   }
  96   loclstr[STRING_BUFFER_LENGTH*3] = '\0';
  97   // Strip off any BOM, it's totally useless...
  98   if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
  99     return strdup(loclstr+3);
 100   }
 101   return strdup(loclstr);
 102 }
 103
 104 /**
 105  * Converts a UTF-8 string to a big-endian UTF-16 2-byte string
 106  * Actually just a UCS-2 internal conversion.
 107  *
 108  * @param device a pointer to the current device.
 109  * @param localstr the UTF-8 unicode string to convert
 110  * @return a UTF-16 string.
 111  */
 112 uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr)
 113 {
 114   PTPParams *params = (PTPParams *) device->params;
 115   char *stringp = (char *) localstr; // cast away "const"
 116   char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
 117   char *unip = unicstr;
 118   size_t nconv = 0;
 119   size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
 120   size_t convmax = STRING_BUFFER_LENGTH*2;
 121
 122   unicstr[0]='\0';
 123   unicstr[1]='\0';
 124
 125   /* Do the conversion.  */
 126   nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);
 127
 128   if (nconv == (size_t) -1) {
 129     // Return partial string anyway.
 130     unip[0] = '\0';
 131     unip[1] = '\0';
 132   }
 133   // make sure the string is null terminated
 134   unicstr[STRING_BUFFER_LENGTH*2] = '\0';
 135   unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';
 136
 137   // allocate the string to be returned
 138   // Note: can't use strdup since every other byte is a null byte
 139   int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2;
 140   uint16_t* ret = malloc(ret_len);
 141   memcpy(ret,unicstr,(size_t)ret_len);
 142   return ret;
 143 }
 144
 145 /**
 146  * This helper function simply removes any consecutive chars
 147  * > 0x7F and replace then with an underscore. In UTF-8
 148  * consequtive chars > 0x7F represent one single character so
 149  * it has to be done like this (and it's elegant). It will only
 150  * shrink the string in size so no copying is needed.
 151  */
 152 void strip_7bit_from_utf8(char *str)
 153 {
 154   int i,j,k;
 155   i = 0;
 156   j = 0;
 157   k = strlen(str);
 158   while (i < k) {
 159     if ((uint8_t) str[i] > 0x7FU) {
 160       str[j] = '_';
 161       i++;
 162       // Skip over any consequtive > 0x7F chars.
 163       while((uint8_t) str[i] > 0x7FU) {
 164         i++;
 165       }
 166     } else {
 167       str[j] = str[i];
 168       i++;
 169     }
 170     j++;
 171   }
 172   // Terminate stripped string...
 173   str[j] = '\0';
 174 }