src/cairo-unicode.c

   1 /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
   2 /* cairo - a vector graphics library with display and print output
   3  *
   4  * The code in this file is derived from GLib's gutf8.c and
   5  *   ultimately from libunicode. It is relicensed under the
   6  *   dual LGPL/MPL with permission of the original authors.
   7  *
   8  * Copyright © 1999 Tom Tromey
   9  * Copyright © 2005 Red Hat, Inc
  10  *
  11  * This library is free software; you can redistribute it and/or
  12  * modify it either under the terms of the GNU Lesser General Public
  13  * License version 2.1 as published by the Free Software Foundation
  14  * (the "LGPL") or, at your option, under the terms of the Mozilla
  15  * Public License Version 1.1 (the "MPL"). If you do not alter this
  16  * notice, a recipient may use your version of this file under either
  17  * the MPL or the LGPL.
  18  *
  19  * You should have received a copy of the LGPL along with this library
  20  * in the file COPYING-LGPL-2.1; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
  22  * You should have received a copy of the MPL along with this library
  23  * in the file COPYING-MPL-1.1
  24  *
  25  * The contents of this file are subject to the Mozilla Public License
  26  * Version 1.1 (the "License"); you may not use this file except in
  27  * compliance with the License. You may obtain a copy of the License at
  28  * http://www.mozilla.org/MPL/
  29  *
  30  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
  31  * OF ANY KIND, either express or implied. See the LGPL or the MPL for
  32  * the specific language governing rights and limitations.
  33  *
  34  * The Original Code is the cairo graphics library.
  35  *
  36  * The Initial Developer of the Original Code is Tom Tromey.
  37  *  and Red Hat, Inc.
  38  *
  39  * Contributor(s):
  40  *      Owen Taylor <otaylor@redhat.com>
  41  */
  42
  43 #include "cairoint.h"
  44 #include "cairo-error-private.h"
  45
  46 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  47   if (Char < 128)                                                             \
  48     {                                                                         \
  49       Len = 1;                                                                \
  50       Mask = 0x7f;                                                            \
  51     }                                                                         \
  52   else if ((Char & 0xe0) == 0xc0)                                             \
  53     {                                                                         \
  54       Len = 2;                                                                \
  55       Mask = 0x1f;                                                            \
  56     }                                                                         \
  57   else if ((Char & 0xf0) == 0xe0)                                             \
  58     {                                                                         \
  59       Len = 3;                                                                \
  60       Mask = 0x0f;                                                            \
  61     }                                                                         \
  62   else if ((Char & 0xf8) == 0xf0)                                             \
  63     {                                                                         \
  64       Len = 4;                                                                \
  65       Mask = 0x07;                                                            \
  66     }                                                                         \
  67   else if ((Char & 0xfc) == 0xf8)                                             \
  68     {                                                                         \
  69       Len = 5;                                                                \
  70       Mask = 0x03;                                                            \
  71     }                                                                         \
  72   else if ((Char & 0xfe) == 0xfc)                                             \
  73     {                                                                         \
  74       Len = 6;                                                                \
  75       Mask = 0x01;                                                            \
  76     }                                                                         \
  77   else                                                                        \
  78     Len = -1;
  79
  80 #define UTF8_LENGTH(Char)              \
  81   ((Char) < 0x80 ? 1 :                 \
  82    ((Char) < 0x800 ? 2 :               \
  83     ((Char) < 0x10000 ? 3 :            \
  84      ((Char) < 0x200000 ? 4 :          \
  85       ((Char) < 0x4000000 ? 5 : 6)))))
  86
  87 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  88   (Result) = (Chars)[0] & (Mask);                                             \
  89   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
  90     {                                                                         \
  91       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
  92         {                                                                     \
  93           (Result) = -1;                                                      \
  94           break;                                                              \
  95         }                                                                     \
  96       (Result) <<= 6;                                                         \
  97       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
  98     }
  99
 100 #define UNICODE_VALID(Char)                   \
 101     ((Char) < 0x110000 &&                     \
 102      (((Char) & 0xFFFFF800) != 0xD800) &&     \
 103      ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
 104      ((Char) & 0xFFFE) != 0xFFFE)
 105
 106 static const char utf8_skip_data[256] = {
 107     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 108     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 109     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 110     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 111     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 112     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 113     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 114     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
 115 };
 116
 117 #define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(unsigned char *)(p)])
 118
 119 /* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 120  * If @p does not point to a valid UTF-8 encoded character, results are
 121  * undefined.
 122  **/
 123 static uint32_t
 124 _utf8_get_char (const unsigned char *p)
 125 {
 126     int i, mask = 0, len;
 127     uint32_t result;
 128     unsigned char c = (unsigned char) *p;
 129
 130     UTF8_COMPUTE (c, mask, len);
 131     if (len == -1)
 132         return (uint32_t)-1;
 133     UTF8_GET (result, p, i, mask, len);
 134
 135     return result;
 136 }
 137
 138 /* Like _utf8_get_char, but take a maximum length
 139  * and return (uint32_t)-2 on incomplete trailing character
 140  */
 141 static uint32_t
 142 _utf8_get_char_extended (const unsigned char *p,
 143                          long                 max_len)
 144 {
 145     int i, len;
 146     uint32_t wc = (unsigned char) *p;
 147
 148     if (wc < 0x80) {
 149         return wc;
 150     } else if (wc < 0xc0) {
 151         return (uint32_t)-1;
 152     } else if (wc < 0xe0) {
 153         len = 2;
 154         wc &= 0x1f;
 155     } else if (wc < 0xf0) {
 156         len = 3;
 157         wc &= 0x0f;
 158     } else if (wc < 0xf8) {
 159         len = 4;
 160         wc &= 0x07;
 161     } else if (wc < 0xfc) {
 162         len = 5;
 163         wc &= 0x03;
 164     } else if (wc < 0xfe) {
 165         len = 6;
 166         wc &= 0x01;
 167     } else {
 168         return (uint32_t)-1;
 169     }
 170
 171     if (max_len >= 0 && len > max_len) {
 172         for (i = 1; i < max_len; i++) {
 173             if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
 174                 return (uint32_t)-1;
 175         }
 176         return (uint32_t)-2;
 177     }
 178
 179     for (i = 1; i < len; ++i) {
 180         uint32_t ch = ((unsigned char *)p)[i];
 181
 182         if ((ch & 0xc0) != 0x80) {
 183             if (ch)
 184                 return (uint32_t)-1;
 185             else
 186                 return (uint32_t)-2;
 187         }
 188
 189         wc <<= 6;
 190         wc |= (ch & 0x3f);
 191     }
 192
 193     if (UTF8_LENGTH(wc) != len)
 194         return (uint32_t)-1;
 195
 196     return wc;
 197 }
 198
 199 /**
 200  * _cairo_utf8_get_char_validated:
 201  * @p: a UTF-8 string
 202  * @unicode: location to store one Unicode character
 203  *
 204  * Decodes the first character of a valid UTF-8 string, and returns
 205  * the number of bytes consumed.
 206  *
 207  * Note that the string should be valid.  Do not use this without
 208  * validating the string first.
 209  *
 210  * Returns: the number of bytes forming the character returned.
 211  **/
 212 int
 213 _cairo_utf8_get_char_validated (const char *p,
 214                                 uint32_t   *unicode)
 215 {
 216     int i, mask = 0, len;
 217     uint32_t result;
 218     unsigned char c = (unsigned char) *p;
 219
 220     UTF8_COMPUTE (c, mask, len);
 221     if (len == -1) {
 222         if (unicode)
 223             *unicode = (uint32_t)-1;
 224         return 1;
 225     }
 226     UTF8_GET (result, p, i, mask, len);
 227
 228     if (unicode)
 229         *unicode = result;
 230     return len;
 231 }
 232
 233 /**
 234  * _cairo_utf8_to_ucs4:
 235  * @str: an UTF-8 string
 236  * @len: length of @str in bytes, or -1 if it is nul-terminated.
 237  *   If @len is supplied and the string has an embedded nul
 238  *   byte, only the portion before the nul byte is converted.
 239  * @result: location to store a pointer to a newly allocated UTF-32
 240  *   string (always native endian), or %NULL. Free with free(). A 0
 241  *   word will be written after the last character.
 242  * @items_written: location to store number of 32-bit words
 243  *   written. (Not including the trailing 0)
 244  *
 245  * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
 246  * with 1 32-bit word per character. The string is validated to
 247  * consist entirely of valid Unicode characters.
 248  *
 249  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 250  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 251  *   invalid sequence was found.
 252  **/
 253 cairo_status_t
 254 _cairo_utf8_to_ucs4 (const char *str,
 255                      int         len,
 256                      uint32_t  **result,
 257                      int        *items_written)
 258 {
 259     uint32_t *str32 = NULL;
 260     int n_chars, i;
 261     const unsigned char *in;
 262     const unsigned char * const ustr = (const unsigned char *) str;
 263
 264     in = ustr;
 265     n_chars = 0;
 266     while ((len < 0 || ustr + len - in > 0) && *in)
 267     {
 268         uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
 269         if (wc & 0x80000000 || !UNICODE_VALID (wc))
 270             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
 271
 272         n_chars++;
 273         if (n_chars == INT_MAX)
 274             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
 275
 276         in = UTF8_NEXT_CHAR (in);
 277     }
 278
 279     if (result) {
 280         str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
 281         if (!str32)
 282             return _cairo_error (CAIRO_STATUS_NO_MEMORY);
 283
 284         in = ustr;
 285         for (i=0; i < n_chars; i++) {
 286             str32[i] = _utf8_get_char (in);
 287             in = UTF8_NEXT_CHAR (in);
 288         }
 289         str32[i] = 0;
 290
 291         *result = str32;
 292     }
 293
 294     if (items_written)
 295         *items_written = n_chars;
 296
 297     return CAIRO_STATUS_SUCCESS;
 298 }
 299
 300 /**
 301  * _cairo_ucs4_to_utf8:
 302  * @unicode: a UCS-4 character
 303  * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 304  * space available. Or %NULL.
 305  *
 306  * This space left intentionally blank.
 307  *
 308  * Return value: Number of bytes in the utf8 string or 0 if an invalid
 309  * unicode character
 310  **/
 311 int
 312 _cairo_ucs4_to_utf8 (uint32_t  unicode,
 313                      char     *utf8)
 314 {
 315     int bytes;
 316     char *p;
 317
 318     if (unicode < 0x80) {
 319         if (utf8)
 320             *utf8 = unicode;
 321         return 1;
 322     } else if (unicode < 0x800) {
 323         bytes = 2;
 324     } else if (unicode < 0x10000) {
 325         bytes = 3;
 326     } else if (unicode < 0x200000) {
 327         bytes = 4;
 328     } else {
 329         return 0;
 330     }
 331
 332     if (!utf8)
 333         return bytes;
 334
 335     p = utf8 + bytes;
 336     while (p > utf8) {
 337         *--p = 0x80 | (unicode & 0x3f);
 338         unicode >>= 6;
 339     }
 340     *p |= 0xf0 << (4 - bytes);
 341
 342     return bytes;
 343 }
 344
 345 #if CAIRO_HAS_UTF8_TO_UTF16
 346 /**
 347  * _cairo_utf8_to_utf16:
 348  * @str: an UTF-8 string
 349  * @len: length of @str in bytes, or -1 if it is nul-terminated.
 350  *   If @len is supplied and the string has an embedded nul
 351  *   byte, only the portion before the nul byte is converted.
 352  * @result: location to store a pointer to a newly allocated UTF-16
 353  *   string (always native endian). Free with free(). A 0
 354  *   word will be written after the last character.
 355  * @items_written: location to store number of 16-bit words
 356  *   written. (Not including the trailing 0)
 357  *
 358  * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 359  * where characters are represented either as a single 16-bit word, or
 360  * as a pair of 16-bit "surrogates". The string is validated to
 361  * consist entirely of valid Unicode characters.
 362  *
 363  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 364  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 365  *   an invalid sequence was found.
 366  **/
 367 cairo_status_t
 368 _cairo_utf8_to_utf16 (const char *str,
 369                       int         len,
 370                       uint16_t **result,
 371                       int       *items_written)
 372 {
 373     uint16_t *str16 = NULL;
 374     int n16, i;
 375     const unsigned char *in;
 376     const unsigned char * const ustr = (const unsigned char *) str;
 377
 378     in = ustr;
 379     n16 = 0;
 380     while ((len < 0 || ustr + len - in > 0) && *in) {
 381         uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
 382         if (wc & 0x80000000 || !UNICODE_VALID (wc))
 383             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
 384
 385         if (wc < 0x10000)
 386             n16 += 1;
 387         else
 388             n16 += 2;
 389
 390         if (n16 == INT_MAX - 1 || n16 == INT_MAX)
 391             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
 392
 393         in = UTF8_NEXT_CHAR (in);
 394     }
 395
 396     str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
 397     if (!str16)
 398         return _cairo_error (CAIRO_STATUS_NO_MEMORY);
 399
 400     in = ustr;
 401     for (i = 0; i < n16;) {
 402         uint32_t wc = _utf8_get_char (in);
 403
 404         if (wc < 0x10000) {
 405             str16[i++] = wc;
 406         } else {
 407             str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
 408             str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
 409         }
 410
 411         in = UTF8_NEXT_CHAR (in);
 412     }
 413
 414     str16[i] = 0;
 415
 416     *result = str16;
 417     if (items_written)
 418         *items_written = n16;
 419
 420     return CAIRO_STATUS_SUCCESS;
 421 }
 422 #endif