src/hb-icu.cc

   1 /*
   2  * Copyright © 2009  Red Hat, Inc.
   3  * Copyright © 2009  Keith Stribley
   4  * Copyright © 2011  Google, Inc.
   5  *
   6  *  This is part of HarfBuzz, a text shaping library.
   7  *
   8  * Permission is hereby granted, without written agreement and without
   9  * license or royalty fees, to use, copy, modify, and distribute this
  10  * software and its documentation for any purpose, provided that the
  11  * above copyright notice and the following two paragraphs appear in
  12  * all copies of this software.
  13  *
  14  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  15  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  16  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  17  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  18  * DAMAGE.
  19  *
  20  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  22  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  23  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  24  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  25  *
  26  * Red Hat Author(s): Behdad Esfahbod
  27  * Google Author(s): Behdad Esfahbod
  28  */
  29
  30 #include "hb.hh"
  31
  32 #include "hb-icu.h"
  33
  34 #include "hb-machinery.hh"
  35
  36 #include <unicode/uchar.h>
  37 #include <unicode/unorm2.h>
  38 #include <unicode/ustring.h>
  39 #include <unicode/utf16.h>
  40 #include <unicode/uversion.h>
  41
  42
  43 /**
  44  * SECTION:hb-icu
  45  * @title: hb-icu
  46  * @short_description: ICU integration
  47  * @include: hb-icu.h
  48  *
  49  * Functions for using HarfBuzz with the ICU library to provide Unicode data.
  50  **/
  51
  52
  53 hb_script_t
  54 hb_icu_script_to_script (UScriptCode script)
  55 {
  56   if (unlikely (script == USCRIPT_INVALID_CODE))
  57     return HB_SCRIPT_INVALID;
  58
  59   return hb_script_from_string (uscript_getShortName (script), -1);
  60 }
  61
  62 UScriptCode
  63 hb_icu_script_from_script (hb_script_t script)
  64 {
  65   if (unlikely (script == HB_SCRIPT_INVALID))
  66     return USCRIPT_INVALID_CODE;
  67
  68   unsigned int numScriptCode = 1 + u_getIntPropertyMaxValue (UCHAR_SCRIPT);
  69   for (unsigned int i = 0; i < numScriptCode; i++)
  70     if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
  71       return (UScriptCode) i;
  72
  73   return USCRIPT_UNKNOWN;
  74 }
  75
  76
  77 static hb_unicode_combining_class_t
  78 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  79                                 hb_codepoint_t      unicode,
  80                                 void               *user_data HB_UNUSED)
  81
  82 {
  83   return (hb_unicode_combining_class_t) u_getCombiningClass (unicode);
  84 }
  85
  86 static hb_unicode_general_category_t
  87 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  88                                  hb_codepoint_t      unicode,
  89                                  void               *user_data HB_UNUSED)
  90 {
  91   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
  92   {
  93   case U_UNASSIGNED:                    return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
  94
  95   case U_UPPERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
  96   case U_LOWERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
  97   case U_TITLECASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
  98   case U_MODIFIER_LETTER:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
  99   case U_OTHER_LETTER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
 100
 101   case U_NON_SPACING_MARK:              return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
 102   case U_ENCLOSING_MARK:                return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
 103   case U_COMBINING_SPACING_MARK:        return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
 104
 105   case U_DECIMAL_DIGIT_NUMBER:          return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
 106   case U_LETTER_NUMBER:                 return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
 107   case U_OTHER_NUMBER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
 108
 109   case U_SPACE_SEPARATOR:               return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
 110   case U_LINE_SEPARATOR:                return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
 111   case U_PARAGRAPH_SEPARATOR:           return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
 112
 113   case U_CONTROL_CHAR:                  return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
 114   case U_FORMAT_CHAR:                   return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
 115   case U_PRIVATE_USE_CHAR:              return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
 116   case U_SURROGATE:                     return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
 117
 118
 119   case U_DASH_PUNCTUATION:              return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
 120   case U_START_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
 121   case U_END_PUNCTUATION:               return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
 122   case U_CONNECTOR_PUNCTUATION:         return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
 123   case U_OTHER_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
 124
 125   case U_MATH_SYMBOL:                   return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
 126   case U_CURRENCY_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
 127   case U_MODIFIER_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
 128   case U_OTHER_SYMBOL:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
 129
 130   case U_INITIAL_PUNCTUATION:           return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
 131   case U_FINAL_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
 132   }
 133
 134   return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 135 }
 136
 137 static hb_codepoint_t
 138 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 139                           hb_codepoint_t      unicode,
 140                           void               *user_data HB_UNUSED)
 141 {
 142   return u_charMirror(unicode);
 143 }
 144
 145 static hb_script_t
 146 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 147                        hb_codepoint_t      unicode,
 148                        void               *user_data HB_UNUSED)
 149 {
 150   UErrorCode status = U_ZERO_ERROR;
 151   UScriptCode scriptCode = uscript_getScript(unicode, &status);
 152
 153   if (unlikely (U_FAILURE (status)))
 154     return HB_SCRIPT_UNKNOWN;
 155
 156   return hb_icu_script_to_script (scriptCode);
 157 }
 158
 159 static hb_bool_t
 160 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 161                         hb_codepoint_t      a,
 162                         hb_codepoint_t      b,
 163                         hb_codepoint_t     *ab,
 164                         void               *user_data HB_UNUSED)
 165 {
 166 #if U_ICU_VERSION_MAJOR_NUM >= 49
 167   {
 168     const UNormalizer2 *normalizer = (const UNormalizer2 *) user_data;
 169     UChar32 ret = unorm2_composePair (normalizer, a, b);
 170     if (ret < 0) return false;
 171     *ab = ret;
 172     return true;
 173   }
 174 #endif
 175
 176   /* We don't ifdef-out the fallback code such that compiler always
 177    * sees it and makes sure it's compilable. */
 178
 179   UChar utf16[4], normalized[5];
 180   unsigned int len;
 181   hb_bool_t ret, err;
 182   UErrorCode icu_err;
 183
 184   len = 0;
 185   err = false;
 186   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
 187   if (err) return false;
 188   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
 189   if (err) return false;
 190
 191   icu_err = U_ZERO_ERROR;
 192   len = unorm2_normalize (unorm2_getNFCInstance (&icu_err), utf16, len, normalized, ARRAY_LENGTH (normalized), &icu_err);
 193   if (U_FAILURE (icu_err))
 194     return false;
 195   if (u_countChar32 (normalized, len) == 1) {
 196     U16_GET_UNSAFE (normalized, 0, *ab);
 197     ret = true;
 198   } else {
 199     ret = false;
 200   }
 201
 202   return ret;
 203 }
 204
 205 static hb_bool_t
 206 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 207                           hb_codepoint_t      ab,
 208                           hb_codepoint_t     *a,
 209                           hb_codepoint_t     *b,
 210                           void               *user_data HB_UNUSED)
 211 {
 212 #if U_ICU_VERSION_MAJOR_NUM >= 49
 213   {
 214     const UNormalizer2 *normalizer = (const UNormalizer2 *) user_data;
 215     UChar decomposed[4];
 216     int len;
 217     UErrorCode icu_err = U_ZERO_ERROR;
 218     len = unorm2_getRawDecomposition (normalizer, ab, decomposed,
 219                                       ARRAY_LENGTH (decomposed), &icu_err);
 220     if (U_FAILURE (icu_err) || len < 0) return false;
 221
 222     len = u_countChar32 (decomposed, len);
 223     if (len == 1) {
 224       U16_GET_UNSAFE (decomposed, 0, *a);
 225       *b = 0;
 226       return *a != ab;
 227     } else if (len == 2) {
 228       len =0;
 229       U16_NEXT_UNSAFE (decomposed, len, *a);
 230       U16_NEXT_UNSAFE (decomposed, len, *b);
 231     }
 232     return true;
 233   }
 234 #endif
 235
 236   /* We don't ifdef-out the fallback code such that compiler always
 237    * sees it and makes sure it's compilable. */
 238
 239   UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
 240   unsigned int len;
 241   hb_bool_t ret, err;
 242   UErrorCode icu_err;
 243
 244   /* This function is a monster! Maybe it wasn't a good idea adding a
 245    * pairwise decompose API... */
 246   /* Watchout for the dragons.  Err, watchout for macros changing len. */
 247
 248   len = 0;
 249   err = false;
 250   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
 251   if (err) return false;
 252
 253   icu_err = U_ZERO_ERROR;
 254   len = unorm2_normalize (unorm2_getNFDInstance (&icu_err), utf16, len, normalized, ARRAY_LENGTH (normalized), &icu_err);
 255   if (U_FAILURE (icu_err))
 256     return false;
 257
 258   len = u_countChar32 (normalized, len);
 259
 260   if (len == 1) {
 261     U16_GET_UNSAFE (normalized, 0, *a);
 262     *b = 0;
 263     ret = *a != ab;
 264   } else if (len == 2) {
 265     len =0;
 266     U16_NEXT_UNSAFE (normalized, len, *a);
 267     U16_NEXT_UNSAFE (normalized, len, *b);
 268
 269     /* Here's the ugly part: if ab decomposes to a single character and
 270      * that character decomposes again, we have to detect that and undo
 271      * the second part :-(. */
 272     UChar recomposed[20];
 273     icu_err = U_ZERO_ERROR;
 274     unorm2_normalize (unorm2_getNFCInstance (&icu_err), normalized, len, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 275     if (U_FAILURE (icu_err))
 276       return false;
 277     hb_codepoint_t c;
 278     U16_GET_UNSAFE (recomposed, 0, c);
 279     if (c != *a && c != ab) {
 280       *a = c;
 281       *b = 0;
 282     }
 283     ret = true;
 284   } else {
 285     /* If decomposed to more than two characters, take the last one,
 286      * and recompose the rest to get the first component. */
 287     U16_PREV_UNSAFE (normalized, len, *b); /* Changes len in-place. */
 288     UChar recomposed[18 * 2];
 289     icu_err = U_ZERO_ERROR;
 290     len = unorm2_normalize (unorm2_getNFCInstance (&icu_err), normalized, len, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 291     if (U_FAILURE (icu_err))
 292       return false;
 293     /* We expect that recomposed has exactly one character now. */
 294     if (unlikely (u_countChar32 (recomposed, len) != 1))
 295       return false;
 296     U16_GET_UNSAFE (recomposed, 0, *a);
 297     ret = true;
 298   }
 299
 300   return ret;
 301 }
 302
 303
 304 #if HB_USE_ATEXIT
 305 static void free_static_icu_funcs ();
 306 #endif
 307
 308 static struct hb_icu_unicode_funcs_lazy_loader_t : hb_unicode_funcs_lazy_loader_t<hb_icu_unicode_funcs_lazy_loader_t>
 309 {
 310   static hb_unicode_funcs_t *create ()
 311   {
 312     void *user_data = nullptr;
 313 #if U_ICU_VERSION_MAJOR_NUM >= 49
 314     UErrorCode icu_err = U_ZERO_ERROR;
 315     user_data = (void *) unorm2_getNFCInstance (&icu_err);
 316     assert (user_data);
 317 #endif
 318
 319     hb_unicode_funcs_t *funcs = hb_unicode_funcs_create (nullptr);
 320
 321     hb_unicode_funcs_set_combining_class_func (funcs, hb_icu_unicode_combining_class, nullptr, nullptr);
 322     hb_unicode_funcs_set_general_category_func (funcs, hb_icu_unicode_general_category, nullptr, nullptr);
 323     hb_unicode_funcs_set_mirroring_func (funcs, hb_icu_unicode_mirroring, nullptr, nullptr);
 324     hb_unicode_funcs_set_script_func (funcs, hb_icu_unicode_script, nullptr, nullptr);
 325     hb_unicode_funcs_set_compose_func (funcs, hb_icu_unicode_compose, user_data, nullptr);
 326     hb_unicode_funcs_set_decompose_func (funcs, hb_icu_unicode_decompose, user_data, nullptr);
 327
 328     hb_unicode_funcs_make_immutable (funcs);
 329
 330 #if HB_USE_ATEXIT
 331     atexit (free_static_icu_funcs);
 332 #endif
 333
 334     return funcs;
 335   }
 336 } static_icu_funcs;
 337
 338 #if HB_USE_ATEXIT
 339 static
 340 void free_static_icu_funcs ()
 341 {
 342   static_icu_funcs.free_instance ();
 343 }
 344 #endif
 345
 346 hb_unicode_funcs_t *
 347 hb_icu_get_unicode_funcs ()
 348 {
 349   return static_icu_funcs.get_unconst ();
 350 }