src/hb-icu.cc

   1 /*
   2  * Copyright © 2009  Red Hat, Inc.
   3  * Copyright © 2009  Keith Stribley
   4  * Copyright © 2011  Google, Inc.
   5  *
   6  *  This is part of HarfBuzz, a text shaping library.
   7  *
   8  * Permission is hereby granted, without written agreement and without
   9  * license or royalty fees, to use, copy, modify, and distribute this
  10  * software and its documentation for any purpose, provided that the
  11  * above copyright notice and the following two paragraphs appear in
  12  * all copies of this software.
  13  *
  14  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  15  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  16  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  17  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  18  * DAMAGE.
  19  *
  20  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  22  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  23  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  24  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  25  *
  26  * Red Hat Author(s): Behdad Esfahbod
  27  * Google Author(s): Behdad Esfahbod
  28  */
  29
  30 #include "hb-private.hh"
  31
  32 #include "hb-icu.h"
  33
  34 #include "hb-unicode-private.hh"
  35
  36 #include <unicode/uversion.h>
  37 #include <unicode/uchar.h>
  38 #include <unicode/unorm.h>
  39 #include <unicode/unistr.h>
  40
  41 HB_BEGIN_DECLS
  42
  43
  44 hb_script_t
  45 hb_icu_script_to_script (UScriptCode script)
  46 {
  47   if (unlikely (script == USCRIPT_INVALID_CODE))
  48     return HB_SCRIPT_INVALID;
  49
  50   return hb_script_from_string (uscript_getShortName (script));
  51 }
  52
  53 UScriptCode
  54 hb_icu_script_from_script (hb_script_t script)
  55 {
  56   if (unlikely (script == HB_SCRIPT_INVALID))
  57     return USCRIPT_INVALID_CODE;
  58
  59   for (unsigned int i = 0; i < USCRIPT_CODE_LIMIT; i++)
  60     if (unlikely (hb_icu_script_to_script ((UScriptCode) i) == script))
  61       return (UScriptCode) i;
  62
  63   return USCRIPT_UNKNOWN;
  64 }
  65
  66
  67 static unsigned int
  68 hb_icu_unicode_combining_class (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  69                                 hb_codepoint_t      unicode,
  70                                 void               *user_data HB_UNUSED)
  71
  72 {
  73   return u_getCombiningClass (unicode);
  74 }
  75
  76 static unsigned int
  77 hb_icu_unicode_eastasian_width (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  78                                 hb_codepoint_t      unicode,
  79                                 void               *user_data HB_UNUSED)
  80 {
  81   switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH))
  82   {
  83   case U_EA_WIDE:
  84   case U_EA_FULLWIDTH:
  85     return 2;
  86   case U_EA_NEUTRAL:
  87   case U_EA_AMBIGUOUS:
  88   case U_EA_HALFWIDTH:
  89   case U_EA_NARROW:
  90     return 1;
  91   }
  92   return 1;
  93 }
  94
  95 static hb_unicode_general_category_t
  96 hb_icu_unicode_general_category (hb_unicode_funcs_t *ufuncs HB_UNUSED,
  97                                  hb_codepoint_t      unicode,
  98                                  void               *user_data HB_UNUSED)
  99 {
 100   switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
 101   {
 102   case U_UNASSIGNED:                    return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 103
 104   case U_UPPERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
 105   case U_LOWERCASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
 106   case U_TITLECASE_LETTER:              return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
 107   case U_MODIFIER_LETTER:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
 108   case U_OTHER_LETTER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
 109
 110   case U_NON_SPACING_MARK:              return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
 111   case U_ENCLOSING_MARK:                return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
 112   case U_COMBINING_SPACING_MARK:        return HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK;
 113
 114   case U_DECIMAL_DIGIT_NUMBER:          return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
 115   case U_LETTER_NUMBER:                 return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
 116   case U_OTHER_NUMBER:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
 117
 118   case U_SPACE_SEPARATOR:               return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
 119   case U_LINE_SEPARATOR:                return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
 120   case U_PARAGRAPH_SEPARATOR:           return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
 121
 122   case U_CONTROL_CHAR:                  return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
 123   case U_FORMAT_CHAR:                   return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
 124   case U_PRIVATE_USE_CHAR:              return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
 125   case U_SURROGATE:                     return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
 126
 127
 128   case U_DASH_PUNCTUATION:              return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
 129   case U_START_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
 130   case U_END_PUNCTUATION:               return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
 131   case U_CONNECTOR_PUNCTUATION:         return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
 132   case U_OTHER_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
 133
 134   case U_MATH_SYMBOL:                   return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
 135   case U_CURRENCY_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
 136   case U_MODIFIER_SYMBOL:               return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
 137   case U_OTHER_SYMBOL:                  return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
 138
 139   case U_INITIAL_PUNCTUATION:           return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
 140   case U_FINAL_PUNCTUATION:             return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
 141   }
 142
 143   return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
 144 }
 145
 146 static hb_codepoint_t
 147 hb_icu_unicode_mirroring (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 148                           hb_codepoint_t      unicode,
 149                           void               *user_data HB_UNUSED)
 150 {
 151   return u_charMirror(unicode);
 152 }
 153
 154 static hb_script_t
 155 hb_icu_unicode_script (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 156                        hb_codepoint_t      unicode,
 157                        void               *user_data HB_UNUSED)
 158 {
 159   UErrorCode status = U_ZERO_ERROR;
 160   UScriptCode scriptCode = uscript_getScript(unicode, &status);
 161
 162   if (unlikely (status != U_ZERO_ERROR))
 163     return HB_SCRIPT_UNKNOWN;
 164
 165   return hb_icu_script_to_script (scriptCode);
 166 }
 167
 168 static hb_bool_t
 169 hb_icu_unicode_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 170                         hb_codepoint_t      a,
 171                         hb_codepoint_t      b,
 172                         hb_codepoint_t     *ab,
 173                         void               *user_data HB_UNUSED)
 174 {
 175   if (!a || !b)
 176     return FALSE;
 177
 178   UChar utf16[4], normalized[5];
 179   gint len;
 180   hb_bool_t ret, err;
 181   UErrorCode icu_err;
 182
 183   len = 0;
 184   err = FALSE;
 185   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), a, err);
 186   if (err) return FALSE;
 187   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), b, err);
 188   if (err) return FALSE;
 189
 190   icu_err = U_ZERO_ERROR;
 191   len = unorm_normalize (utf16, len, UNORM_NFC, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 192   if (icu_err)
 193     return FALSE;
 194   normalized[len] = 0;
 195   if (u_strlen (normalized) == 1) {
 196     U16_GET_UNSAFE (normalized, 0, *ab);
 197     ret = TRUE;
 198   } else {
 199     ret = FALSE;
 200   }
 201
 202   return ret;
 203 }
 204
 205 static hb_bool_t
 206 hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 207                           hb_codepoint_t      ab,
 208                           hb_codepoint_t     *a,
 209                           hb_codepoint_t     *b,
 210                           void               *user_data HB_UNUSED)
 211 {
 212   UChar utf16[2], normalized[20];
 213   gint len;
 214   hb_bool_t ret, err;
 215   UErrorCode icu_err;
 216
 217   len = 0;
 218   err = FALSE;
 219   U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err);
 220   if (err) return FALSE;
 221
 222   icu_err = U_ZERO_ERROR;
 223   len = unorm_normalize (utf16, len, UNORM_NFD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
 224   if (icu_err)
 225     return FALSE;
 226
 227   normalized[len] = 0;
 228   len = u_strlen (normalized);
 229
 230   if (len == 1) {
 231     U16_GET_UNSAFE (normalized, 0, *a);
 232     *b = 0;
 233     ret = *a != ab;
 234   } else if (len == 2) {
 235     /* Here's the ugly part: if ab decomposes to a single character and
 236      * that character decomposes again, we have to detect that and undo
 237      * the second part :-(. */
 238     UChar recomposed[20];
 239     icu_err = U_ZERO_ERROR;
 240     len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 241     if (icu_err)
 242       return FALSE;
 243     U16_GET_UNSAFE (recomposed, 0, *a);
 244     if (*a != ab) {
 245       *b = 0;
 246     } else {
 247       len =0;
 248       U16_NEXT_UNSAFE (normalized, len, *a);
 249       U16_GET_UNSAFE (normalized, len, *b);
 250     }
 251     ret = TRUE;
 252   } else {
 253     /* If decomposed to more than two characters, take the last one,
 254      * and recompose the rest to get the first component. */
 255     U16_PREV_UNSAFE (normalized, len, *b);
 256     UChar recomposed[20];
 257     icu_err = U_ZERO_ERROR;
 258     len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err);
 259     if (icu_err)
 260       return FALSE;
 261     /* We expect that recomposed has exactly one character now. */
 262     U16_GET_UNSAFE (recomposed, 0, *a);
 263     ret = TRUE;
 264   }
 265
 266   return ret;
 267 }
 268
 269 extern HB_INTERNAL hb_unicode_funcs_t _hb_unicode_funcs_icu;
 270 hb_unicode_funcs_t _hb_icu_unicode_funcs = {
 271   HB_OBJECT_HEADER_STATIC,
 272
 273   NULL, /* parent */
 274   TRUE, /* immutable */
 275   {
 276 #define HB_UNICODE_FUNC_IMPLEMENT(name) hb_icu_unicode_##name,
 277     HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
 278 #undef HB_UNICODE_FUNC_IMPLEMENT
 279   }
 280 };
 281
 282 hb_unicode_funcs_t *
 283 hb_icu_get_unicode_funcs (void)
 284 {
 285   return &_hb_icu_unicode_funcs;
 286 }
 287
 288
 289 HB_END_DECLS