src/hb-icu.cc

   1 /*
   2  * Copyright © 2009  Red Hat, Inc.
   3  * Copyright © 2009  Keith Stribley
   4  * Copyright © 2011  Google, Inc.
   5  *
   6  *  This is part of HarfBuzz, a text shaping library.
   7  *
   8  * Permission is hereby granted, without written agreement and without
   9  * license or royalty fees, to use, copy, modify, and distribute this
  10  * software and its documentation for any purpose, provided that the
  11  * above copyright notice and the following two paragraphs appear in
  12  * all copies of this software.
  13  *
  14  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  15  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  16  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  17  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  18  * DAMAGE.
  19  *
  20  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  22  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  23  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  24  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  25  *
  26  * Red Hat Author(s): Behdad Esfahbod
  27  * Google Author(s): Behdad Esfahbod
  28  */
  29
  30 #include "hb-private.hh"
  31
  32 #include "hb-icu.h"
  33
  34 #include "hb-unicode-private.hh"
  35
  36 #include <unicode/uchar.h>
  37 #include <unicode/unorm.h>
  38 #include <unicode/ustring.h>
  39 #include <unicode/uversion.h>
  40
  41
  42 hb_script_t
  43 hb_icu_script_to_script (UScriptCode script)
  44 {
  45   if (unlikely (script == USCRIPT_INVALID_CODE))
  46     return HB_SCRIPT_INVALID;
  47
  48   return hb_script_from_string (uscript_getShortName (script), -1);
  49 }
  50
  51 UScriptCode
  52 hb_icu_script_from_script (hb_script_t script)
  53 {
  54   if (unlikely (script == HB_SCRIPT_INVALID))
  55     return USCRIPT_INVALID_CODE;
  56
  57   for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++)
  58     if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
  59       return (UScriptCode) i;
  60
  61   return USCRIPT_UNKNOWN;
  62 }
  63
  64
  65 static hb_unicode_combining_class_t
  66 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  67                                 hb_codepoint_t      unicode,
  68                                 void               *user_data HB_UNUSED)
  69
  70 {
  71   return (hb_unicode_combining_class_t) u_getCombiningClass (unicode);
  72 }
  73
  74 static unsigned int
  75 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  76                                 hb_codepoint_t      unicode,
  77                                 void               *user_data HB_UNUSED)
  78 {
  79   switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))
  80   {
  81   case U_EA_WIDE:
  82   case U_EA_FULLWIDTH:
  83     return 2;
  84   case U_EA_NEUTRAL:
  85   case U_EA_AMBIGUOUS:
  86   case U_EA_HALFWIDTH:
  87   case U_EA_NARROW:
  88     return 1;
  89   }
  90   return 1;
  91 }
  92
  93 static hb_unicode_general_category_t
  94 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  95                                  hb_codepoint_t      unicode,
  96                                  void               *user_data HB_UNUSED)
  97 {
  98   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
  99   {
 100   case U_UNASSIGNED:                    return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 101
 102   case U_UPPERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
 103   case U_LOWERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
 104   case U_TITLECASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
 105   case U_MODIFIER_LETTER:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
 106   case U_OTHER_LETTER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
 107
 108   case U_NON_SPACING_MARK:              return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
 109   case U_ENCLOSING_MARK:                return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
 110   case U_COMBINING_SPACING_MARK:        return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
 111
 112   case U_DECIMAL_DIGIT_NUMBER:          return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
 113   case U_LETTER_NUMBER:                 return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
 114   case U_OTHER_NUMBER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
 115
 116   case U_SPACE_SEPARATOR:               return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
 117   case U_LINE_SEPARATOR:                return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
 118   case U_PARAGRAPH_SEPARATOR:           return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
 119
 120   case U_CONTROL_CHAR:                  return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
 121   case U_FORMAT_CHAR:                   return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
 122   case U_PRIVATE_USE_CHAR:              return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
 123   case U_SURROGATE:                     return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
 124
 125
 126   case U_DASH_PUNCTUATION:              return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
 127   case U_START_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
 128   case U_END_PUNCTUATION:               return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
 129   case U_CONNECTOR_PUNCTUATION:         return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
 130   case U_OTHER_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
 131
 132   case U_MATH_SYMBOL:                   return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
 133   case U_CURRENCY_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
 134   case U_MODIFIER_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
 135   case U_OTHER_SYMBOL:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
 136
 137   case U_INITIAL_PUNCTUATION:           return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
 138   case U_FINAL_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
 139   }
 140
 141   return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 142 }
 143
 144 static hb_codepoint_t
 145 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 146                           hb_codepoint_t      unicode,
 147                           void               *user_data HB_UNUSED)
 148 {
 149   return u_charMirror(unicode);
 150 }
 151
 152 static hb_script_t
 153 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 154                        hb_codepoint_t      unicode,
 155                        void               *user_data HB_UNUSED)
 156 {
 157   UErrorCode status = U_ZERO_ERROR;
 158   UScriptCode scriptCode = uscript_getScript(unicode, &status);
 159
 160   if (unlikely (U_FAILURE (status)))
 161     return HB_SCRIPT_UNKNOWN;
 162
 163   return hb_icu_script_to_script (scriptCode);
 164 }
 165
 166 #if U_ICU_VERSION_MAJOR_NUM >= 49
 167 static const UNormalizer2 *normalizer;
 168 #endif
 169
 170 static hb_bool_t
 171 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 172                         hb_codepoint_t      a,
 173                         hb_codepoint_t      b,
 174                         hb_codepoint_t     *ab,
 175                         void               *user_data HB_UNUSED)
 176 {
 177 #if U_ICU_VERSION_MAJOR_NUM >= 49
 178   {
 179     UChar32 ret = unorm2_composePair (normalizer, a, b);
 180     if (ret < 0) return false;
 181     *ab = ret;
 182     return true;
 183   }
 184 #endif
 185
 186   /* We don't ifdef-out the fallback code such that compiler always
 187    * sees it and makes sure it's compilable. */
 188
 189   UChar utf16[4], normalized[5];
 190   unsigned int len;
 191   hb_bool_t ret, err;
 192   UErrorCode icu_err;
 193
 194   len = 0;
 195   err = false;
 196   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
 197   if (err) return false;
 198   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
 199   if (err) return false;
 200
 201   icu_err = U_ZERO_ERROR;
 202   len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 203   if (U_FAILURE (icu_err))
 204     return false;
 205   if (u_countChar32 (normalized, len) == 1) {
 206     U16_GET_UNSAFE (normalized, 0, *ab);
 207     ret = true;
 208   } else {
 209     ret = false;
 210   }
 211
 212   return ret;
 213 }
 214
 215 static hb_bool_t
 216 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 217                           hb_codepoint_t      ab,
 218                           hb_codepoint_t     *a,
 219                           hb_codepoint_t     *b,
 220                           void               *user_data HB_UNUSED)
 221 {
 222 #if U_ICU_VERSION_MAJOR_NUM >= 49
 223   {
 224     UChar decomposed[4];
 225     int len;
 226     UErrorCode icu_err = U_ZERO_ERROR;
 227     len = unorm2_getRawDecomposition (normalizer, ab, decomposed,
 228                                       ARRAY_LENGTH (decomposed), &icu_err);
 229     if (U_FAILURE (icu_err) || len < 0) return false;
 230
 231     len = u_countChar32 (decomposed, len);
 232     if (len == 1) {
 233       U16_GET_UNSAFE (decomposed, 0, *a);
 234       *b = 0;
 235       return *a != ab;
 236     } else if (len == 2) {
 237       len =0;
 238       U16_NEXT_UNSAFE (decomposed, len, *a);
 239       U16_NEXT_UNSAFE (decomposed, len, *b);
 240     }
 241     return true;
 242   }
 243 #endif
 244
 245   /* We don't ifdef-out the fallback code such that compiler always
 246    * sees it and makes sure it's compilable. */
 247
 248   UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
 249   unsigned int len;
 250   hb_bool_t ret, err;
 251   UErrorCode icu_err;
 252
 253   /* This function is a monster! Maybe it wasn't a good idea adding a
 254    * pairwise decompose API... */
 255   /* Watchout for the dragons.  Err, watchout for macros changing len. */
 256
 257   len = 0;
 258   err = false;
 259   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
 260   if (err) return false;
 261
 262   icu_err = U_ZERO_ERROR;
 263   len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 264   if (U_FAILURE (icu_err))
 265     return false;
 266
 267   len = u_countChar32 (normalized, len);
 268
 269   if (len == 1) {
 270     U16_GET_UNSAFE (normalized, 0, *a);
 271     *b = 0;
 272     ret = *a != ab;
 273   } else if (len == 2) {
 274     len =0;
 275     U16_NEXT_UNSAFE (normalized, len, *a);
 276     U16_NEXT_UNSAFE (normalized, len, *b);
 277
 278     /* Here's the ugly part: if ab decomposes to a single character and
 279      * that character decomposes again, we have to detect that and undo
 280      * the second part :-(. */
 281     UChar recomposed[20];
 282     icu_err = U_ZERO_ERROR;
 283     unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 284     if (U_FAILURE (icu_err))
 285       return false;
 286     hb_codepoint_t c;
 287     U16_GET_UNSAFE (recomposed, 0, c);
 288     if (c != *a && c != ab) {
 289       *a = c;
 290       *b = 0;
 291     }
 292     ret = true;
 293   } else {
 294     /* If decomposed to more than two characters, take the last one,
 295      * and recompose the rest to get the first component. */
 296     U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */
 297     UChar recomposed[18 * 2];
 298     icu_err = U_ZERO_ERROR;
 299     len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 300     if (U_FAILURE (icu_err))
 301       return false;
 302     /* We expect that recomposed has exactly one character now. */
 303     if (unlikely (u_countChar32 (recomposed, len) != 1))
 304       return false;
 305     U16_GET_UNSAFE (recomposed, 0, *a);
 306     ret = true;
 307   }
 308
 309   return ret;
 310 }
 311
 312 static unsigned int
 313 hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 314                                         hb_codepoint_t      u,
 315                                         hb_codepoint_t     *decomposed,
 316                                         void               *user_data HB_UNUSED)
 317 {
 318   UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
 319   unsigned int len;
 320   int32_t utf32_len;
 321   hb_bool_t err;
 322   UErrorCode icu_err;
 323
 324   /* Copy @u into a UTF-16 array to be passed to ICU. */
 325   len = 0;
 326   err = FALSE;
 327   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err);
 328   if (err)
 329     return 0;
 330
 331   /* Normalise the codepoint using NFKD mode. */
 332   icu_err = U_ZERO_ERROR;
 333   len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 334   if (icu_err)
 335     return 0;
 336
 337   /* Convert the decomposed form from UTF-16 to UTF-32. */
 338   icu_err = U_ZERO_ERROR;
 339   u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err);
 340   if (icu_err)
 341     return 0;
 342
 343   return utf32_len;
 344 }
 345
 346
 347 hb_unicode_funcs_t *
 348 hb_icu_get_unicode_funcs (void)
 349 {
 350   static const hb_unicode_funcs_t _hb_icu_unicode_funcs = {
 351     HB_OBJECT_HEADER_STATIC,
 352
 353     NULL, /* parent */
 354     true, /* immutable */
 355     {
 356 #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name,
 357       HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
 358 #undef HB_UNICODE_FUNC_IMPLEMENT
 359     }
 360   };
 361
 362 #if U_ICU_VERSION_MAJOR_NUM >= 49
 363   if (!hb_atomic_ptr_get (&normalizer)) {
 364     UErrorCode icu_err = U_ZERO_ERROR;
 365     /* We ignore failure in getNFCInstace(). */
 366     hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err));
 367   }
 368 #endif
 369   return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs);
 370 }
 371
 372