src/hb-icu.cc

   1 /*
   2  * Copyright © 2009  Red Hat, Inc.
   3  * Copyright © 2009  Keith Stribley
   4  * Copyright © 2011  Google, Inc.
   5  *
   6  *  This is part of HarfBuzz, a text shaping library.
   7  *
   8  * Permission is hereby granted, without written agreement and without
   9  * license or royalty fees, to use, copy, modify, and distribute this
  10  * software and its documentation for any purpose, provided that the
  11  * above copyright notice and the following two paragraphs appear in
  12  * all copies of this software.
  13  *
  14  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  15  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  16  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  17  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  18  * DAMAGE.
  19  *
  20  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  22  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  23  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  24  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  25  *
  26  * Red Hat Author(s): Behdad Esfahbod
  27  * Google Author(s): Behdad Esfahbod
  28  */
  29
  30 #include "hb-private.hh"
  31
  32 #include "hb-icu.h"
  33
  34 #include "hb-unicode-private.hh"
  35
  36 #include <unicode/uchar.h>
  37 #include <unicode/unorm.h>
  38 #include <unicode/ustring.h>
  39 #include <unicode/utf16.h>
  40 #include <unicode/uversion.h>
  41
  42
  43 hb_script_t
  44 hb_icu_script_to_script (UScriptCode script)
  45 {
  46   if (unlikely (script == USCRIPT_INVALID_CODE))
  47     return HB_SCRIPT_INVALID;
  48
  49   return hb_script_from_string (uscript_getShortName (script), -1);
  50 }
  51
  52 UScriptCode
  53 hb_icu_script_from_script (hb_script_t script)
  54 {
  55   if (unlikely (script == HB_SCRIPT_INVALID))
  56     return USCRIPT_INVALID_CODE;
  57
  58   for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++)
  59     if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
  60       return (UScriptCode) i;
  61
  62   return USCRIPT_UNKNOWN;
  63 }
  64
  65
  66 static hb_unicode_combining_class_t
  67 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  68                                 hb_codepoint_t      unicode,
  69                                 void               *user_data HB_UNUSED)
  70
  71 {
  72   return (hb_unicode_combining_class_t) u_getCombiningClass (unicode);
  73 }
  74
  75 static unsigned int
  76 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  77                                 hb_codepoint_t      unicode,
  78                                 void               *user_data HB_UNUSED)
  79 {
  80   switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))
  81   {
  82   case U_EA_WIDE:
  83   case U_EA_FULLWIDTH:
  84     return 2;
  85   case U_EA_NEUTRAL:
  86   case U_EA_AMBIGUOUS:
  87   case U_EA_HALFWIDTH:
  88   case U_EA_NARROW:
  89     return 1;
  90   }
  91   return 1;
  92 }
  93
  94 static hb_unicode_general_category_t
  95 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  96                                  hb_codepoint_t      unicode,
  97                                  void               *user_data HB_UNUSED)
  98 {
  99   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
 100   {
 101   case U_UNASSIGNED:                    return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 102
 103   case U_UPPERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
 104   case U_LOWERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
 105   case U_TITLECASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
 106   case U_MODIFIER_LETTER:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
 107   case U_OTHER_LETTER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
 108
 109   case U_NON_SPACING_MARK:              return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
 110   case U_ENCLOSING_MARK:                return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
 111   case U_COMBINING_SPACING_MARK:        return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
 112
 113   case U_DECIMAL_DIGIT_NUMBER:          return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
 114   case U_LETTER_NUMBER:                 return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
 115   case U_OTHER_NUMBER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
 116
 117   case U_SPACE_SEPARATOR:               return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
 118   case U_LINE_SEPARATOR:                return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
 119   case U_PARAGRAPH_SEPARATOR:           return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
 120
 121   case U_CONTROL_CHAR:                  return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
 122   case U_FORMAT_CHAR:                   return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
 123   case U_PRIVATE_USE_CHAR:              return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
 124   case U_SURROGATE:                     return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
 125
 126
 127   case U_DASH_PUNCTUATION:              return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
 128   case U_START_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
 129   case U_END_PUNCTUATION:               return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
 130   case U_CONNECTOR_PUNCTUATION:         return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
 131   case U_OTHER_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
 132
 133   case U_MATH_SYMBOL:                   return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
 134   case U_CURRENCY_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
 135   case U_MODIFIER_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
 136   case U_OTHER_SYMBOL:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
 137
 138   case U_INITIAL_PUNCTUATION:           return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
 139   case U_FINAL_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
 140   }
 141
 142   return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 143 }
 144
 145 static hb_codepoint_t
 146 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 147                           hb_codepoint_t      unicode,
 148                           void               *user_data HB_UNUSED)
 149 {
 150   return u_charMirror(unicode);
 151 }
 152
 153 static hb_script_t
 154 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 155                        hb_codepoint_t      unicode,
 156                        void               *user_data HB_UNUSED)
 157 {
 158   UErrorCode status = U_ZERO_ERROR;
 159   UScriptCode scriptCode = uscript_getScript(unicode, &status);
 160
 161   if (unlikely (U_FAILURE (status)))
 162     return HB_SCRIPT_UNKNOWN;
 163
 164   return hb_icu_script_to_script (scriptCode);
 165 }
 166
 167 #if U_ICU_VERSION_MAJOR_NUM >= 49
 168 static const UNormalizer2 *normalizer;
 169 #endif
 170
 171 static hb_bool_t
 172 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 173                         hb_codepoint_t      a,
 174                         hb_codepoint_t      b,
 175                         hb_codepoint_t     *ab,
 176                         void               *user_data HB_UNUSED)
 177 {
 178 #if U_ICU_VERSION_MAJOR_NUM >= 49
 179   {
 180     UChar32 ret = unorm2_composePair (normalizer, a, b);
 181     if (ret < 0) return false;
 182     *ab = ret;
 183     return true;
 184   }
 185 #endif
 186
 187   /* We don't ifdef-out the fallback code such that compiler always
 188    * sees it and makes sure it's compilable. */
 189
 190   UChar utf16[4], normalized[5];
 191   unsigned int len;
 192   hb_bool_t ret, err;
 193   UErrorCode icu_err;
 194
 195   len = 0;
 196   err = false;
 197   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
 198   if (err) return false;
 199   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
 200   if (err) return false;
 201
 202   icu_err = U_ZERO_ERROR;
 203   len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 204   if (U_FAILURE (icu_err))
 205     return false;
 206   if (u_countChar32 (normalized, len) == 1) {
 207     U16_GET_UNSAFE (normalized, 0, *ab);
 208     ret = true;
 209   } else {
 210     ret = false;
 211   }
 212
 213   return ret;
 214 }
 215
 216 static hb_bool_t
 217 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 218                           hb_codepoint_t      ab,
 219                           hb_codepoint_t     *a,
 220                           hb_codepoint_t     *b,
 221                           void               *user_data HB_UNUSED)
 222 {
 223 #if U_ICU_VERSION_MAJOR_NUM >= 49
 224   {
 225     UChar decomposed[4];
 226     int len;
 227     UErrorCode icu_err = U_ZERO_ERROR;
 228     len = unorm2_getRawDecomposition (normalizer, ab, decomposed,
 229                                       ARRAY_LENGTH (decomposed), &icu_err);
 230     if (U_FAILURE (icu_err) || len < 0) return false;
 231
 232     len = u_countChar32 (decomposed, len);
 233     if (len == 1) {
 234       U16_GET_UNSAFE (decomposed, 0, *a);
 235       *b = 0;
 236       return *a != ab;
 237     } else if (len == 2) {
 238       len =0;
 239       U16_NEXT_UNSAFE (decomposed, len, *a);
 240       U16_NEXT_UNSAFE (decomposed, len, *b);
 241     }
 242     return true;
 243   }
 244 #endif
 245
 246   /* We don't ifdef-out the fallback code such that compiler always
 247    * sees it and makes sure it's compilable. */
 248
 249   UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
 250   unsigned int len;
 251   hb_bool_t ret, err;
 252   UErrorCode icu_err;
 253
 254   /* This function is a monster! Maybe it wasn't a good idea adding a
 255    * pairwise decompose API... */
 256   /* Watchout for the dragons.  Err, watchout for macros changing len. */
 257
 258   len = 0;
 259   err = false;
 260   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
 261   if (err) return false;
 262
 263   icu_err = U_ZERO_ERROR;
 264   len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 265   if (U_FAILURE (icu_err))
 266     return false;
 267
 268   len = u_countChar32 (normalized, len);
 269
 270   if (len == 1) {
 271     U16_GET_UNSAFE (normalized, 0, *a);
 272     *b = 0;
 273     ret = *a != ab;
 274   } else if (len == 2) {
 275     len =0;
 276     U16_NEXT_UNSAFE (normalized, len, *a);
 277     U16_NEXT_UNSAFE (normalized, len, *b);
 278
 279     /* Here's the ugly part: if ab decomposes to a single character and
 280      * that character decomposes again, we have to detect that and undo
 281      * the second part :-(. */
 282     UChar recomposed[20];
 283     icu_err = U_ZERO_ERROR;
 284     unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 285     if (U_FAILURE (icu_err))
 286       return false;
 287     hb_codepoint_t c;
 288     U16_GET_UNSAFE (recomposed, 0, c);
 289     if (c != *a && c != ab) {
 290       *a = c;
 291       *b = 0;
 292     }
 293     ret = true;
 294   } else {
 295     /* If decomposed to more than two characters, take the last one,
 296      * and recompose the rest to get the first component. */
 297     U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */
 298     UChar recomposed[18 * 2];
 299     icu_err = U_ZERO_ERROR;
 300     len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 301     if (U_FAILURE (icu_err))
 302       return false;
 303     /* We expect that recomposed has exactly one character now. */
 304     if (unlikely (u_countChar32 (recomposed, len) != 1))
 305       return false;
 306     U16_GET_UNSAFE (recomposed, 0, *a);
 307     ret = true;
 308   }
 309
 310   return ret;
 311 }
 312
 313 static unsigned int
 314 hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 315                                         hb_codepoint_t      u,
 316                                         hb_codepoint_t     *decomposed,
 317                                         void               *user_data HB_UNUSED)
 318 {
 319   UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
 320   unsigned int len;
 321   int32_t utf32_len;
 322   hb_bool_t err;
 323   UErrorCode icu_err;
 324
 325   /* Copy @u into a UTF-16 array to be passed to ICU. */
 326   len = 0;
 327   err = false;
 328   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err);
 329   if (err)
 330     return 0;
 331
 332   /* Normalise the codepoint using NFKD mode. */
 333   icu_err = U_ZERO_ERROR;
 334   len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 335   if (icu_err)
 336     return 0;
 337
 338   /* Convert the decomposed form from UTF-16 to UTF-32. */
 339   icu_err = U_ZERO_ERROR;
 340   u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err);
 341   if (icu_err)
 342     return 0;
 343
 344   return utf32_len;
 345 }
 346
 347
 348 hb_unicode_funcs_t *
 349 hb_icu_get_unicode_funcs (void)
 350 {
 351   static const hb_unicode_funcs_t _hb_icu_unicode_funcs = {
 352     HB_OBJECT_HEADER_STATIC,
 353
 354     NULL, /* parent */
 355     true, /* immutable */
 356     {
 357 #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name,
 358       HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
 359 #undef HB_UNICODE_FUNC_IMPLEMENT
 360     }
 361   };
 362
 363 #if U_ICU_VERSION_MAJOR_NUM >= 49
 364   if (!hb_atomic_ptr_get (&normalizer)) {
 365     UErrorCode icu_err = U_ZERO_ERROR;
 366     /* We ignore failure in getNFCInstace(). */
 367     (void) hb_atomic_ptr_cmpexch (&normalizer, NULL, unorm2_getNFCInstance (&icu_err));
 368   }
 369 #endif
 370   return const_cast<hb_unicode_funcs_t *> (&_hb_icu_unicode_funcs);
 371 }