src/hb-icu.cc

   1 /*
   2  * Copyright © 2009  Red Hat, Inc.
   3  * Copyright © 2009  Keith Stribley
   4  * Copyright © 2011  Google, Inc.
   5  *
   6  *  This is part of HarfBuzz, a text shaping library.
   7  *
   8  * Permission is hereby granted, without written agreement and without
   9  * license or royalty fees, to use, copy, modify, and distribute this
  10  * software and its documentation for any purpose, provided that the
  11  * above copyright notice and the following two paragraphs appear in
  12  * all copies of this software.
  13  *
  14  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  15  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  16  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  17  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  18  * DAMAGE.
  19  *
  20  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  22  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  23  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  24  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  25  *
  26  * Red Hat Author(s): Behdad Esfahbod
  27  * Google Author(s): Behdad Esfahbod
  28  */
  29
  30 #include "hb.hh"
  31
  32 #ifdef HAVE_ICU
  33
  34 #include "hb-icu.h"
  35
  36 #include "hb-machinery.hh"
  37
  38 #include <unicode/uchar.h>
  39 #include <unicode/unorm2.h>
  40 #include <unicode/ustring.h>
  41 #include <unicode/utf16.h>
  42 #include <unicode/uversion.h>
  43
  44 /* ICU extra semicolon, fixed since 65, https://github.com/unicode-org/icu/commit/480bec3 */
  45 #if U_ICU_VERSION_MAJOR_NUM < 65 && (defined(__GNUC__) || defined(__clang__))
  46 #define HB_ICU_EXTRA_SEMI_IGNORED
  47 #pragma GCC diagnostic push
  48 #pragma GCC diagnostic ignored "-Wextra-semi-stmt"
  49 #endif
  50
  51 /**
  52  * SECTION:hb-icu
  53  * @title: hb-icu
  54  * @short_description: ICU integration
  55  * @include: hb-icu.h
  56  *
  57  * Functions for using HarfBuzz with the International Components for Unicode
  58  * (ICU) library. HarfBuzz supports using ICU to provide Unicode data, by attaching
  59  * ICU functions to the virtual methods in a #hb_unicode_funcs_t function
  60  * structure.
  61  **/
  62
  63 /**
  64  * hb_icu_script_to_script:
  65  * @script: The UScriptCode identifier to query
  66  *
  67  * Fetches the #hb_script_t script that corresponds to the
  68  * specified UScriptCode identifier.
  69  *
  70  * Return value: the #hb_script_t script found
  71  *
  72  **/
  73
  74 hb_script_t
  75 hb_icu_script_to_script (UScriptCode script)
  76 {
  77   if (unlikely (script == USCRIPT_INVALID_CODE))
  78     return HB_SCRIPT_INVALID;
  79
  80   return hb_script_from_string (uscript_getShortName (script), -1);
  81 }
  82
  83 /**
  84  * hb_icu_script_from_script:
  85  * @script: The #hb_script_t script to query
  86  *
  87  * Fetches the UScriptCode identifier that corresponds to the
  88  * specified #hb_script_t script.
  89  *
  90  * Return value: the UScriptCode identifier found
  91  *
  92  **/
  93 UScriptCode
  94 hb_icu_script_from_script (hb_script_t script)
  95 {
  96   if (unlikely (script == HB_SCRIPT_INVALID))
  97     return USCRIPT_INVALID_CODE;
  98
  99   unsigned int numScriptCode = 1 + u_getIntPropertyMaxValue (UCHAR_SCRIPT);
 100   for (unsigned int i = 0; i < numScriptCode; i++)
 101     if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
 102       return (UScriptCode) i;
 103
 104   return USCRIPT_UNKNOWN;
 105 }
 106
 107
 108 static hb_unicode_combining_class_t
 109 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 110                                 hb_codepoint_t      unicode,
 111                                 void               *user_data HB_UNUSED)
 112
 113 {
 114   return (hb_unicode_combining_class_t) u_getCombiningClass (unicode);
 115 }
 116
 117 static hb_unicode_general_category_t
 118 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 119                                  hb_codepoint_t      unicode,
 120                                  void               *user_data HB_UNUSED)
 121 {
 122   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
 123   {
 124   case U_UNASSIGNED:                    return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 125
 126   case U_UPPERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
 127   case U_LOWERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
 128   case U_TITLECASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
 129   case U_MODIFIER_LETTER:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
 130   case U_OTHER_LETTER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
 131
 132   case U_NON_SPACING_MARK:              return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
 133   case U_ENCLOSING_MARK:                return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
 134   case U_COMBINING_SPACING_MARK:        return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
 135
 136   case U_DECIMAL_DIGIT_NUMBER:          return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
 137   case U_LETTER_NUMBER:                 return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
 138   case U_OTHER_NUMBER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
 139
 140   case U_SPACE_SEPARATOR:               return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
 141   case U_LINE_SEPARATOR:                return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
 142   case U_PARAGRAPH_SEPARATOR:           return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
 143
 144   case U_CONTROL_CHAR:                  return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
 145   case U_FORMAT_CHAR:                   return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
 146   case U_PRIVATE_USE_CHAR:              return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
 147   case U_SURROGATE:                     return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
 148
 149
 150   case U_DASH_PUNCTUATION:              return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
 151   case U_START_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
 152   case U_END_PUNCTUATION:               return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
 153   case U_CONNECTOR_PUNCTUATION:         return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
 154   case U_OTHER_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
 155
 156   case U_MATH_SYMBOL:                   return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
 157   case U_CURRENCY_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
 158   case U_MODIFIER_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
 159   case U_OTHER_SYMBOL:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
 160
 161   case U_INITIAL_PUNCTUATION:           return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
 162   case U_FINAL_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
 163   }
 164
 165   return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 166 }
 167
 168 static hb_codepoint_t
 169 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 170                           hb_codepoint_t      unicode,
 171                           void               *user_data HB_UNUSED)
 172 {
 173   return u_charMirror(unicode);
 174 }
 175
 176 static hb_script_t
 177 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 178                        hb_codepoint_t      unicode,
 179                        void               *user_data HB_UNUSED)
 180 {
 181   UErrorCode status = U_ZERO_ERROR;
 182   UScriptCode scriptCode = uscript_getScript(unicode, &status);
 183
 184   if (unlikely (U_FAILURE (status)))
 185     return HB_SCRIPT_UNKNOWN;
 186
 187   return hb_icu_script_to_script (scriptCode);
 188 }
 189
 190 static hb_bool_t
 191 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 192                         hb_codepoint_t      a,
 193                         hb_codepoint_t      b,
 194                         hb_codepoint_t     *ab,
 195                         void               *user_data HB_UNUSED)
 196 {
 197 #if U_ICU_VERSION_MAJOR_NUM >= 49
 198   {
 199     const UNormalizer2 *normalizer = (const UNormalizer2 *) user_data;
 200     UChar32 ret = unorm2_composePair (normalizer, a, b);
 201     if (ret < 0) return false;
 202     *ab = ret;
 203     return true;
 204   }
 205 #endif
 206
 207   /* We don't ifdef-out the fallback code such that compiler always
 208    * sees it and makes sure it's compilable. */
 209
 210   UChar utf16[4], normalized[5];
 211   unsigned int len;
 212   hb_bool_t ret, err;
 213   UErrorCode icu_err;
 214
 215   len = 0;
 216   err = false;
 217   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
 218   if (err) return false;
 219   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
 220   if (err) return false;
 221
 222   icu_err = U_ZERO_ERROR;
 223   len = unorm2_normalize (unorm2_getNFCInstance (&icu_err), utf16, len, normalized, ARRAY_LENGTH (normalized), &icu_err);
 224   if (U_FAILURE (icu_err))
 225     return false;
 226   if (u_countChar32 (normalized, len) == 1) {
 227     U16_GET_UNSAFE (normalized, 0, *ab);
 228     ret = true;
 229   } else {
 230     ret = false;
 231   }
 232
 233   return ret;
 234 }
 235
 236 static hb_bool_t
 237 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 238                           hb_codepoint_t      ab,
 239                           hb_codepoint_t     *a,
 240                           hb_codepoint_t     *b,
 241                           void               *user_data HB_UNUSED)
 242 {
 243 #if U_ICU_VERSION_MAJOR_NUM >= 49
 244   {
 245     const UNormalizer2 *normalizer = (const UNormalizer2 *) user_data;
 246     UChar decomposed[4];
 247     int len;
 248     UErrorCode icu_err = U_ZERO_ERROR;
 249     len = unorm2_getRawDecomposition (normalizer, ab, decomposed,
 250                                       ARRAY_LENGTH (decomposed), &icu_err);
 251     if (U_FAILURE (icu_err) || len < 0) return false;
 252
 253     len = u_countChar32 (decomposed, len);
 254     if (len == 1) {
 255       U16_GET_UNSAFE (decomposed, 0, *a);
 256       *b = 0;
 257       return *a != ab;
 258     } else if (len == 2) {
 259       len = 0;
 260       U16_NEXT_UNSAFE (decomposed, len, *a);
 261       U16_NEXT_UNSAFE (decomposed, len, *b);
 262     }
 263     return true;
 264   }
 265 #endif
 266
 267   /* We don't ifdef-out the fallback code such that compiler always
 268    * sees it and makes sure it's compilable. */
 269
 270   UChar utf16[2], normalized[2 * 19/*HB_UNICODE_MAX_DECOMPOSITION_LEN*/ + 1];
 271   unsigned int len;
 272   hb_bool_t ret, err;
 273   UErrorCode icu_err;
 274
 275   /* This function is a monster! Maybe it wasn't a good idea adding a
 276    * pairwise decompose API... */
 277   /* Watchout for the dragons.  Err, watchout for macros changing len. */
 278
 279   len = 0;
 280   err = false;
 281   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
 282   if (err) return false;
 283
 284   icu_err = U_ZERO_ERROR;
 285   len = unorm2_normalize (unorm2_getNFDInstance (&icu_err), utf16, len, normalized, ARRAY_LENGTH (normalized), &icu_err);
 286   if (U_FAILURE (icu_err))
 287     return false;
 288
 289   len = u_countChar32 (normalized, len);
 290
 291   if (len == 1) {
 292     U16_GET_UNSAFE (normalized, 0, *a);
 293     *b = 0;
 294     ret = *a != ab;
 295   } else if (len == 2) {
 296     len = 0;
 297     U16_NEXT_UNSAFE (normalized, len, *a);
 298     U16_NEXT_UNSAFE (normalized, len, *b);
 299
 300     /* Here's the ugly part: if ab decomposes to a single character and
 301      * that character decomposes again, we have to detect that and undo
 302      * the second part :-(. */
 303     UChar recomposed[20];
 304     icu_err = U_ZERO_ERROR;
 305     unorm2_normalize (unorm2_getNFCInstance (&icu_err), normalized, len, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 306     if (U_FAILURE (icu_err))
 307       return false;
 308     hb_codepoint_t c;
 309     U16_GET_UNSAFE (recomposed, 0, c);
 310     if (c != *a && c != ab) {
 311       *a = c;
 312       *b = 0;
 313     }
 314     ret = true;
 315   } else {
 316     /* If decomposed to more than two characters, take the last one,
 317      * and recompose the rest to get the first component. */
 318     U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */
 319     UChar recomposed[18 * 2];
 320     icu_err = U_ZERO_ERROR;
 321     len = unorm2_normalize (unorm2_getNFCInstance (&icu_err), normalized, len, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 322     if (U_FAILURE (icu_err))
 323       return false;
 324     /* We expect that recomposed has exactly one character now. */
 325     if (unlikely (u_countChar32 (recomposed, len) != 1))
 326       return false;
 327     U16_GET_UNSAFE (recomposed, 0, *a);
 328     ret = true;
 329   }
 330
 331   return ret;
 332 }
 333
 334
 335 #if HB_USE_ATEXIT
 336 static void free_static_icu_funcs ();
 337 #endif
 338
 339 static struct hb_icu_unicode_funcs_lazy_loader_t : hb_unicode_funcs_lazy_loader_t<hb_icu_unicode_funcs_lazy_loader_t>
 340 {
 341   static hb_unicode_funcs_t *create ()
 342   {
 343     void *user_data = nullptr;
 344 #if U_ICU_VERSION_MAJOR_NUM >= 49
 345     UErrorCode icu_err = U_ZERO_ERROR;
 346     user_data = (void *) unorm2_getNFCInstance (&icu_err);
 347     assert (user_data);
 348 #endif
 349
 350     hb_unicode_funcs_t *funcs = hb_unicode_funcs_create (nullptr);
 351
 352     hb_unicode_funcs_set_combining_class_func (funcs, hb_icu_unicode_combining_class, nullptr, nullptr);
 353     hb_unicode_funcs_set_general_category_func (funcs, hb_icu_unicode_general_category, nullptr, nullptr);
 354     hb_unicode_funcs_set_mirroring_func (funcs, hb_icu_unicode_mirroring, nullptr, nullptr);
 355     hb_unicode_funcs_set_script_func (funcs, hb_icu_unicode_script, nullptr, nullptr);
 356     hb_unicode_funcs_set_compose_func (funcs, hb_icu_unicode_compose, user_data, nullptr);
 357     hb_unicode_funcs_set_decompose_func (funcs, hb_icu_unicode_decompose, user_data, nullptr);
 358
 359     hb_unicode_funcs_make_immutable (funcs);
 360
 361 #if HB_USE_ATEXIT
 362     atexit (free_static_icu_funcs);
 363 #endif
 364
 365     return funcs;
 366   }
 367 } static_icu_funcs;
 368
 369 #if HB_USE_ATEXIT
 370 static
 371 void free_static_icu_funcs ()
 372 {
 373   static_icu_funcs.free_instance ();
 374 }
 375 #endif
 376
 377 /**
 378  * hb_icu_get_unicode_funcs:
 379  *
 380  * Fetches a Unicode-functions structure that is populated
 381  * with the appropriate ICU function for each method.
 382  *
 383  * Return value: (transfer none): a pointer to the #hb_unicode_funcs_t Unicode-functions structure
 384  *
 385  * Since: 0.9.38
 386  **/
 387 hb_unicode_funcs_t *
 388 hb_icu_get_unicode_funcs ()
 389 {
 390   return static_icu_funcs.get_unconst ();
 391 }
 392
 393 #ifdef HB_ICU_EXTRA_SEMI_IGNORED
 394 #pragma GCC diagnostic pop
 395 #endif
 396
 397 #endif