src/hb-ot-shape-complex-indic-private.hh

   1 /*
   2  * Copyright © 2012  Google, Inc.
   3  *
   4  *  This is part of HarfBuzz, a text shaping library.
   5  *
   6  * Permission is hereby granted, without written agreement and without
   7  * license or royalty fees, to use, copy, modify, and distribute this
   8  * software and its documentation for any purpose, provided that the
   9  * above copyright notice and the following two paragraphs appear in
  10  * all copies of this software.
  11  *
  12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  16  * DAMAGE.
  17  *
  18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  23  *
  24  * Google Author(s): Behdad Esfahbod
  25  */
  26
  27 #ifndef HB_OT_SHAPE_COMPLEX_INDIC_PRIVATE_HH
  28 #define HB_OT_SHAPE_COMPLEX_INDIC_PRIVATE_HH
  29
  30 #include "hb-private.hh"
  31
  32
  33 #include "hb-ot-shape-complex-private.hh"
  34 #include "hb-ot-shape-private.hh" /* XXX Remove */
  35
  36
/* buffer var allocations: per-glyph scratch slots, filled in by set_indic_properties(). */
#define indic_category() complex_var_u8_0() /* indic_category_t */
#define indic_position() complex_var_u8_1() /* indic_position_t */
  40
  41
/* Eight bits are enough for a value packed by INDIC_COMBINE_CATEGORIES(): both
 * of its halves are statically checked to be below 16.
 * NOTE(review): presumably the element type of the table defined in
 * hb-ot-shape-complex-indic-table.hh -- confirm against that header. */
#define INDIC_TABLE_ELEMENT_TYPE uint8_t
  43
  44 /* Cateories used in the OpenType spec:
  45  * https://www.microsoft.com/typography/otfntdev/devanot/shaping.aspx
  46  */
  47 /* Note: This enum is duplicated in the -machine.rl source file.
  48  * Not sure how to avoid duplication. */
  49 enum indic_category_t {
  50   OT_X = 0,
  51   OT_C,
  52   OT_V,
  53   OT_N,
  54   OT_H,
  55   OT_ZWNJ,
  56   OT_ZWJ,
  57   OT_M,
  58   OT_SM,
  59   OT_VD,
  60   OT_A,
  61   OT_NBSP,
  62   OT_DOTTEDCIRCLE, /* Not in the spec, but special in Uniscribe. /Very very/ special! */
  63   OT_RS, /* Register Shifter, used in Khmer OT spec */
  64   OT_Coeng,
  65   OT_Repha,
  66   OT_Ra /* Not explicitly listed in the OT spec, but used in the grammar. */
  67 };
  68
  69 /* Visual positions in a syllable from left to right. */
  70 enum indic_position_t {
  71   POS_START,
  72
  73   POS_RA_TO_BECOME_REPH,
  74   POS_PRE_M,
  75   POS_PRE_C,
  76
  77   POS_BASE_C,
  78   POS_AFTER_MAIN,
  79
  80   POS_ABOVE_C,
  81
  82   POS_BEFORE_SUB,
  83   POS_BELOW_C,
  84   POS_AFTER_SUB,
  85
  86   POS_BEFORE_POST,
  87   POS_POST_C,
  88   POS_AFTER_POST,
  89
  90   POS_FINAL_C,
  91   POS_SMVD,
  92
  93   POS_END
  94 };
  95
  96 /* Categories used in IndicSyllabicCategory.txt from UCD. */
  97 enum indic_syllabic_category_t {
  98   INDIC_SYLLABIC_CATEGORY_OTHER                 = OT_X,
  99
 100   INDIC_SYLLABIC_CATEGORY_AVAGRAHA              = OT_X,
 101   INDIC_SYLLABIC_CATEGORY_BINDU                 = OT_SM,
 102   INDIC_SYLLABIC_CATEGORY_CONSONANT             = OT_C,
 103   INDIC_SYLLABIC_CATEGORY_CONSONANT_DEAD        = OT_C,
 104   INDIC_SYLLABIC_CATEGORY_CONSONANT_FINAL       = OT_C,
 105   INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER = OT_C,
 106   INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL      = OT_C,
 107   INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER = OT_NBSP,
 108   INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED   = OT_C,
 109   INDIC_SYLLABIC_CATEGORY_CONSONANT_REPHA       = OT_Repha,
 110   INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER      = OT_X,
 111   INDIC_SYLLABIC_CATEGORY_NUKTA                 = OT_N,
 112   INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER      = OT_RS,
 113   INDIC_SYLLABIC_CATEGORY_TONE_LETTER           = OT_X,
 114   INDIC_SYLLABIC_CATEGORY_TONE_MARK             = OT_X,
 115   INDIC_SYLLABIC_CATEGORY_VIRAMA                = OT_H,
 116   INDIC_SYLLABIC_CATEGORY_VISARGA               = OT_SM,
 117   INDIC_SYLLABIC_CATEGORY_VOWEL                 = OT_V,
 118   INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT       = OT_M,
 119   INDIC_SYLLABIC_CATEGORY_VOWEL_INDEPENDENT     = OT_V
 120 };
 121
 122 /* Categories used in IndicSMatraCategory.txt from UCD */
 123 enum indic_matra_category_t {
 124   INDIC_MATRA_CATEGORY_NOT_APPLICABLE           = POS_END,
 125
 126   INDIC_MATRA_CATEGORY_LEFT                     = POS_PRE_C,
 127   INDIC_MATRA_CATEGORY_TOP                      = POS_ABOVE_C,
 128   INDIC_MATRA_CATEGORY_BOTTOM                   = POS_BELOW_C,
 129   INDIC_MATRA_CATEGORY_RIGHT                    = POS_POST_C,
 130
 131   /* These should resolve to the position of the last part of the split sequence. */
 132   INDIC_MATRA_CATEGORY_BOTTOM_AND_RIGHT         = INDIC_MATRA_CATEGORY_RIGHT,
 133   INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT           = INDIC_MATRA_CATEGORY_RIGHT,
 134   INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM           = INDIC_MATRA_CATEGORY_BOTTOM,
 135   INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_RIGHT = INDIC_MATRA_CATEGORY_RIGHT,
 136   INDIC_MATRA_CATEGORY_TOP_AND_LEFT             = INDIC_MATRA_CATEGORY_TOP,
 137   INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT   = INDIC_MATRA_CATEGORY_RIGHT,
 138   INDIC_MATRA_CATEGORY_TOP_AND_RIGHT            = INDIC_MATRA_CATEGORY_RIGHT,
 139
 140   INDIC_MATRA_CATEGORY_INVISIBLE                = INDIC_MATRA_CATEGORY_NOT_APPLICABLE,
 141   INDIC_MATRA_CATEGORY_OVERSTRUCK               = INDIC_MATRA_CATEGORY_NOT_APPLICABLE,
 142   INDIC_MATRA_CATEGORY_VISUAL_ORDER_LEFT        = INDIC_MATRA_CATEGORY_NOT_APPLICABLE
 143 };
 144
 145 /* Note: We use ASSERT_STATIC_EXPR_ZERO() instead of ASSERT_STATIC_EXPR() and the comma operation
 146  * because gcc fails to optimize the latter and fills the table in at runtime. */
 147 #define INDIC_COMBINE_CATEGORIES(S,M) \
 148   (ASSERT_STATIC_EXPR_ZERO (M == INDIC_MATRA_CATEGORY_NOT_APPLICABLE || (S == INDIC_SYLLABIC_CATEGORY_VIRAMA || S == INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT)) + \
 149    ASSERT_STATIC_EXPR_ZERO (S < 16 && M < 16) + \
 150    ((M << 4) | S))
 151
 152
 153 #include "hb-ot-shape-complex-indic-table.hh"
 154
 155
 156 #define IN_HALF_BLOCK(u, Base) (((u) & ~0x7F) == (Base))
 157
 158 #define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900))
 159 #define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980))
 160 #define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00))
 161 #define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80))
 162 #define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00))
 163 #define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80))
 164 #define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00))
 165 #define IS_KNDA(u) (IN_HALF_BLOCK (u, 0x0C80))
 166 #define IS_MLYM(u) (IN_HALF_BLOCK (u, 0x0D00))
 167 #define IS_SINH(u) (IN_HALF_BLOCK (u, 0x0D80))
 168 #define IS_KHMR(u) (IN_HALF_BLOCK (u, 0x1780))
 169
 170
 171 #define MATRA_POS_LEFT(u)       POS_PRE_M
 172 #define MATRA_POS_RIGHT(u)      ( \
 173                                   IS_DEVA(u) ? POS_AFTER_SUB  : \
 174                                   IS_BENG(u) ? POS_AFTER_POST : \
 175                                   IS_GURU(u) ? POS_AFTER_POST : \
 176                                   IS_GUJR(u) ? POS_AFTER_POST : \
 177                                   IS_ORYA(u) ? POS_AFTER_POST : \
 178                                   IS_TAML(u) ? POS_AFTER_POST : \
 179                                   IS_TELU(u) ? (u <= 0x0C42 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
 180                                   IS_KNDA(u) ? (u < 0x0CC3 || u > 0xCD6 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
 181                                   IS_MLYM(u) ? POS_AFTER_POST : \
 182                                   IS_SINH(u) ? POS_AFTER_SUB  : \
 183                                   IS_KHMR(u) ? POS_AFTER_POST : \
 184                                   /*default*/  POS_AFTER_SUB    \
 185                                 )
 186 #define MATRA_POS_TOP(u)        ( /* BENG and MLYM don't have top matras. */ \
 187                                   IS_DEVA(u) ? POS_AFTER_SUB  : \
 188                                   IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \
 189                                   IS_GUJR(u) ? POS_AFTER_SUB  : \
 190                                   IS_ORYA(u) ? POS_AFTER_MAIN : \
 191                                   IS_TAML(u) ? POS_AFTER_SUB  : \
 192                                   IS_TELU(u) ? POS_BEFORE_SUB : \
 193                                   IS_KNDA(u) ? POS_BEFORE_SUB : \
 194                                   IS_SINH(u) ? POS_AFTER_SUB  : \
 195                                   IS_KHMR(u) ? POS_AFTER_POST : \
 196                                   /*default*/  POS_AFTER_SUB    \
 197                                 )
 198 #define MATRA_POS_BOTTOM(u)     ( \
 199                                   IS_DEVA(u) ? POS_AFTER_SUB  : \
 200                                   IS_BENG(u) ? POS_AFTER_SUB  : \
 201                                   IS_GURU(u) ? POS_AFTER_POST : \
 202                                   IS_GUJR(u) ? POS_AFTER_POST : \
 203                                   IS_ORYA(u) ? POS_AFTER_SUB  : \
 204                                   IS_TAML(u) ? POS_AFTER_POST : \
 205                                   IS_TELU(u) ? POS_BEFORE_SUB : \
 206                                   IS_KNDA(u) ? POS_BEFORE_SUB : \
 207                                   IS_MLYM(u) ? POS_AFTER_POST : \
 208                                   IS_SINH(u) ? POS_AFTER_SUB  : \
 209                                   IS_KHMR(u) ? POS_AFTER_POST : \
 210                                   /*default*/  POS_AFTER_SUB    \
 211                                 )
 212
 213
 214 static inline indic_position_t
 215 matra_position (hb_codepoint_t u, indic_position_t side)
 216 {
 217   switch ((int) side)
 218   {
 219     case POS_PRE_C:     return MATRA_POS_LEFT (u);
 220     case POS_POST_C:    return MATRA_POS_RIGHT (u);
 221     case POS_ABOVE_C:   return MATRA_POS_TOP (u);
 222     case POS_BELOW_C:   return MATRA_POS_BOTTOM (u);
 223   };
 224   abort ();
 225 }
 226
 227
 228
 229 /* XXX
 230  * This is a hack for now.  We should move this data into the main Indic table.
 231  * Or completely remove it and just check in the tables.
 232  */
 233 static const hb_codepoint_t ra_chars[] = {
 234   0x0930, /* Devanagari */
 235   0x09B0, /* Bengali */
 236   0x09F0, /* Bengali */
 237   0x0A30, /* Gurmukhi */        /* No Reph */
 238   0x0AB0, /* Gujarati */
 239   0x0B30, /* Oriya */
 240   0x0BB0, /* Tamil */           /* No Reph */
 241   0x0C30, /* Telugu */          /* Reph formed only with ZWJ */
 242   0x0CB0, /* Kannada */
 243   0x0D30, /* Malayalam */       /* No Reph, Logical Repha */
 244
 245   0x0DBB, /* Sinhala */         /* Reph formed only with ZWJ */
 246
 247   0x179A, /* Khmer */           /* No Reph, Visual Repha */
 248 };
 249
 250 static inline indic_position_t
 251 consonant_position (hb_codepoint_t  u)
 252 {
 253   if ((u & ~0x007F) == 0x1780)
 254     return POS_BELOW_C; /* In Khmer coeng model, post and below forms should not be reordered. */
 255   return POS_BASE_C; /* Will recategorize later based on font lookups. */
 256 }
 257
 258 static inline bool
 259 is_ra (hb_codepoint_t u)
 260 {
 261   for (unsigned int i = 0; i < ARRAY_LENGTH (ra_chars); i++)
 262     if (u == ra_chars[i])
 263       return true;
 264   return false;
 265 }
 266
 267
 268 static inline bool
 269 is_one_of (const hb_glyph_info_t &info, unsigned int flags)
 270 {
 271   /* If it ligated, all bets are off. */
 272   if (is_a_ligature (info)) return false;
 273   return !!(FLAG (info.indic_category()) & flags);
 274 }
 275
 276 #define JOINER_FLAGS (FLAG (OT_ZWJ) | FLAG (OT_ZWNJ))
 277 static inline bool
 278 is_joiner (const hb_glyph_info_t &info)
 279 {
 280   return is_one_of (info, JOINER_FLAGS);
 281 }
 282
 283 /* Note:
 284  *
 285  * We treat Vowels and placeholders as if they were consonants.  This is safe because Vowels
 286  * cannot happen in a consonant syllable.  The plus side however is, we can call the
 287  * consonant syllable logic from the vowel syllable function and get it all right! */
 288 #define CONSONANT_FLAGS (FLAG (OT_C) | FLAG (OT_Ra) | FLAG (OT_V) | FLAG (OT_NBSP) | FLAG (OT_DOTTEDCIRCLE))
 289 static inline bool
 290 is_consonant (const hb_glyph_info_t &info)
 291 {
 292   return is_one_of (info, CONSONANT_FLAGS);
 293 }
 294
 295 #define HALANT_OR_COENG_FLAGS (FLAG (OT_H) | FLAG (OT_Coeng))
 296 static inline bool
 297 is_halant_or_coeng (const hb_glyph_info_t &info)
 298 {
 299   return is_one_of (info, HALANT_OR_COENG_FLAGS);
 300 }
 301
 302 static inline void
 303 set_indic_properties (hb_glyph_info_t   &info)
 304 {
 305   hb_codepoint_t u = info.codepoint;
 306   unsigned int type = get_indic_categories (u);
 307   indic_category_t cat = (indic_category_t) (type & 0x0F);
 308   indic_position_t pos = (indic_position_t) (type >> 4);
 309
 310
 311   /*
 312    * Re-assign category
 313    */
 314
 315
 316   /* The spec says U+0952 is OT_A.  However, testing shows that Uniscribe
 317    * treats U+0951..U+0952 all as OT_VD.
 318    * TESTS:
 319    * U+092E,U+0947,U+0952
 320    * U+092E,U+0952,U+0947
 321    * U+092E,U+0947,U+0951
 322    * U+092E,U+0951,U+0947
 323    * */
 324   if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x0951, 0x0954)))
 325     cat = OT_VD;
 326
 327   if (unlikely (u == 0x17D1))
 328     cat = OT_X;
 329   if (cat == OT_X &&
 330       unlikely (hb_in_range<hb_codepoint_t> (u, 0x17CB, 0x17D3))) /* Khmer Various signs */
 331   {
 332     /* These are like Top Matras. */
 333     cat = OT_M;
 334     pos = POS_ABOVE_C;
 335   }
 336   if (u == 0x17C6) /* Khmer Bindu doesn't like to be repositioned. */
 337     cat = OT_N;
 338
 339   if (unlikely (u == 0x17D2)) cat = OT_Coeng; /* Khmer coeng */
 340   else if (unlikely (u == 0x200C)) cat = OT_ZWNJ;
 341   else if (unlikely (u == 0x200D)) cat = OT_ZWJ;
 342   else if (unlikely (u == 0x25CC)) cat = OT_DOTTEDCIRCLE;
 343   else if (unlikely (u == 0x0A71)) cat = OT_SM; /* GURMUKHI ADDAK.  More like consonant medial. like 0A75. */
 344
 345   if (cat == OT_Repha) {
 346     /* There are two kinds of characters marked as Repha:
 347      * - The ones that are GenCat=Mn are already positioned visually, ie. after base. (eg. Khmer)
 348      * - The ones that are GenCat=Lo is encoded logically, ie. beginning of syllable. (eg. Malayalam)
 349      *
 350      * We recategorize the first kind to look like a Nukta and attached to the base directly.
 351      */
 352     if (_hb_glyph_info_get_general_category (&info) == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)
 353       cat = OT_N;
 354   }
 355
 356
 357
 358   /*
 359    * Re-assign position.
 360    */
 361
 362   if ((FLAG (cat) & CONSONANT_FLAGS))
 363   {
 364     pos = consonant_position (u);
 365     if (is_ra (u))
 366       cat = OT_Ra;
 367   }
 368   else if (cat == OT_M)
 369   {
 370     pos = matra_position (u, pos);
 371   }
 372   else if (cat == OT_SM || cat == OT_VD)
 373   {
 374     pos = POS_SMVD;
 375   }
 376
 377   if (unlikely (u == 0x0B01)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */
 378
 379
 380
 381   info.indic_category() = cat;
 382   info.indic_position() = pos;
 383 }
 384
 385
 386
 387 #endif /* HB_OT_SHAPE_COMPLEX_INDIC_PRIVATE_HH */