Imported Upstream version 0.9.35
[platform/upstream/harfbuzz.git] / src / hb-ot-shape-complex-indic-private.hh
index e36090e..d8dfc65 100644 (file)
 #include "hb-ot-shape-private.hh" /* XXX Remove */
 
 
-/* buffer var allocations */
-#define indic_category() complex_var_u8_0() /* indic_category_t */
-#define indic_position() complex_var_u8_1() /* indic_matra_category_t */
-
-
 #define INDIC_TABLE_ELEMENT_TYPE uint16_t
 
 /* Cateories used in the OpenType spec:
  * Not sure how to avoid duplication. */
 enum indic_category_t {
   OT_X = 0,
-  OT_C,
-  OT_V,
-  OT_N,
-  OT_H,
-  OT_ZWNJ,
-  OT_ZWJ,
-  OT_M,
-  OT_SM,
-  OT_VD,
-  OT_A,
-  OT_NBSP,
-  OT_DOTTEDCIRCLE, /* Not in the spec, but special in Uniscribe. /Very very/ special! */
-  OT_RS, /* Register Shifter, used in Khmer OT spec */
-  OT_Coeng,
-  OT_Repha,
-  OT_Ra, /* Not explicitly listed in the OT spec, but used in the grammar. */
-  OT_CM
+  OT_C = 1,
+  OT_V = 2,
+  OT_N = 3,
+  OT_H = 4,
+  OT_ZWNJ = 5,
+  OT_ZWJ = 6,
+  OT_M = 7,
+  OT_SM = 8,
+  OT_VD = 9,
+  OT_A = 10,
+  OT_PLACEHOLDER = 11,
+  OT_DOTTEDCIRCLE = 12,
+  OT_RS = 13, /* Register Shifter, used in Khmer OT spec. */
+  OT_Coeng = 14, /* Khmer-style Virama. */
+  OT_Repha = 15, /* Atomically-encoded logical or visual repha. */
+  OT_Ra = 16,
+  OT_CM = 17,  /* Consonant-Medial. */
+  OT_Symbol = 18, /* Avagraha, etc that take marks (SM,A,VD). */
+  OT_CM2 = 31 /* Consonant-Medial, second slot. */
 };
 
+#define MEDIAL_FLAGS (FLAG (OT_CM) | FLAG (OT_CM2))
+
+/* Note:
+ *
+ * We treat Vowels and placeholders as if they were consonants.  This is safe because Vowels
+ * cannot happen in a consonant syllable.  The plus side however is, we can call the
+ * consonant syllable logic from the vowel syllable function and get it all right! */
+#define CONSONANT_FLAGS (FLAG (OT_C) | FLAG (OT_Ra) | MEDIAL_FLAGS | FLAG (OT_V) | FLAG (OT_PLACEHOLDER) | FLAG (OT_DOTTEDCIRCLE))
+#define JOINER_FLAGS (FLAG (OT_ZWJ) | FLAG (OT_ZWNJ))
+#define HALANT_OR_COENG_FLAGS (FLAG (OT_H) | FLAG (OT_Coeng))
+
+
 /* Visual positions in a syllable from left to right. */
 enum indic_position_t {
   POS_START,
@@ -96,293 +105,78 @@ enum indic_position_t {
 
 /* Categories used in IndicSyllabicCategory.txt from UCD. */
 enum indic_syllabic_category_t {
-  INDIC_SYLLABIC_CATEGORY_OTHER                        = OT_X,
-
-  INDIC_SYLLABIC_CATEGORY_AVAGRAHA             = OT_X,
-  INDIC_SYLLABIC_CATEGORY_BINDU                        = OT_SM,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT            = OT_C,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT_DEAD       = OT_C,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT_FINAL      = OT_C,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER        = OT_C,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL     = OT_CM,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER        = OT_NBSP,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED  = OT_C,
-  INDIC_SYLLABIC_CATEGORY_CONSONANT_REPHA      = OT_Repha,
-  INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER     = OT_X,
-  INDIC_SYLLABIC_CATEGORY_NUKTA                        = OT_N,
-  INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER     = OT_RS,
-  INDIC_SYLLABIC_CATEGORY_TONE_LETTER          = OT_X,
-  INDIC_SYLLABIC_CATEGORY_TONE_MARK            = OT_N,
-  INDIC_SYLLABIC_CATEGORY_VIRAMA               = OT_H,
-  INDIC_SYLLABIC_CATEGORY_VISARGA              = OT_SM,
-  INDIC_SYLLABIC_CATEGORY_VOWEL                        = OT_V,
-  INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT      = OT_M,
-  INDIC_SYLLABIC_CATEGORY_VOWEL_INDEPENDENT    = OT_V
+  INDIC_SYLLABIC_CATEGORY_OTHER                                = OT_X,
+
+  INDIC_SYLLABIC_CATEGORY_AVAGRAHA                     = OT_Symbol,
+  INDIC_SYLLABIC_CATEGORY_BINDU                                = OT_SM,
+  INDIC_SYLLABIC_CATEGORY_BRAHMI_JOINING_NUMBER                = OT_PLACEHOLDER, /* TODO */
+  INDIC_SYLLABIC_CATEGORY_CANTILLATION_MARK            = OT_A,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT                    = OT_C,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_DEAD               = OT_C,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_FINAL              = OT_CM,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER                = OT_C,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL             = OT_CM,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER                = OT_PLACEHOLDER,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA    = OT_Repha,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED          = OT_CM,
+  INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA   = OT_N,
+  INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK              = OT_SM,
+  INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER            = OT_H, /* TODO */
+  INDIC_SYLLABIC_CATEGORY_JOINER                       = OT_ZWJ,
+  INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER             = OT_X,
+  INDIC_SYLLABIC_CATEGORY_NON_JOINER                   = OT_ZWNJ,
+  INDIC_SYLLABIC_CATEGORY_NUKTA                                = OT_N,
+  INDIC_SYLLABIC_CATEGORY_NUMBER                       = OT_PLACEHOLDER,
+  INDIC_SYLLABIC_CATEGORY_NUMBER_JOINER                        = OT_PLACEHOLDER, /* TODO */
+  INDIC_SYLLABIC_CATEGORY_PURE_KILLER                  = OT_H, /* TODO */
+  INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER             = OT_RS,
+  INDIC_SYLLABIC_CATEGORY_TONE_LETTER                  = OT_X,
+  INDIC_SYLLABIC_CATEGORY_TONE_MARK                    = OT_N,
+  INDIC_SYLLABIC_CATEGORY_VIRAMA                       = OT_H,
+  INDIC_SYLLABIC_CATEGORY_VISARGA                      = OT_SM,
+  INDIC_SYLLABIC_CATEGORY_VOWEL                                = OT_V,
+  INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT              = OT_M,
+  INDIC_SYLLABIC_CATEGORY_VOWEL_INDEPENDENT            = OT_V
 };
 
 /* Categories used in IndicSMatraCategory.txt from UCD */
 enum indic_matra_category_t {
-  INDIC_MATRA_CATEGORY_NOT_APPLICABLE          = POS_END,
+  INDIC_MATRA_CATEGORY_NOT_APPLICABLE                  = POS_END,
 
-  INDIC_MATRA_CATEGORY_LEFT                    = POS_PRE_C,
-  INDIC_MATRA_CATEGORY_TOP                     = POS_ABOVE_C,
-  INDIC_MATRA_CATEGORY_BOTTOM                  = POS_BELOW_C,
-  INDIC_MATRA_CATEGORY_RIGHT                   = POS_POST_C,
+  INDIC_MATRA_CATEGORY_LEFT                            = POS_PRE_C,
+  INDIC_MATRA_CATEGORY_TOP                             = POS_ABOVE_C,
+  INDIC_MATRA_CATEGORY_BOTTOM                          = POS_BELOW_C,
+  INDIC_MATRA_CATEGORY_RIGHT                           = POS_POST_C,
 
   /* These should resolve to the position of the last part of the split sequence. */
-  INDIC_MATRA_CATEGORY_BOTTOM_AND_RIGHT                = INDIC_MATRA_CATEGORY_RIGHT,
-  INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT          = INDIC_MATRA_CATEGORY_RIGHT,
-  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM          = INDIC_MATRA_CATEGORY_BOTTOM,
-  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_RIGHT        = INDIC_MATRA_CATEGORY_RIGHT,
-  INDIC_MATRA_CATEGORY_TOP_AND_LEFT            = INDIC_MATRA_CATEGORY_TOP,
-  INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT  = INDIC_MATRA_CATEGORY_RIGHT,
-  INDIC_MATRA_CATEGORY_TOP_AND_RIGHT           = INDIC_MATRA_CATEGORY_RIGHT,
-
-  INDIC_MATRA_CATEGORY_INVISIBLE               = INDIC_MATRA_CATEGORY_NOT_APPLICABLE,
-  INDIC_MATRA_CATEGORY_OVERSTRUCK              = POS_AFTER_MAIN,
-  INDIC_MATRA_CATEGORY_VISUAL_ORDER_LEFT       = POS_PRE_M
+  INDIC_MATRA_CATEGORY_BOTTOM_AND_RIGHT                        = INDIC_MATRA_CATEGORY_RIGHT,
+  INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT                  = INDIC_MATRA_CATEGORY_RIGHT,
+  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM                  = INDIC_MATRA_CATEGORY_BOTTOM,
+  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_RIGHT                = INDIC_MATRA_CATEGORY_RIGHT,
+  INDIC_MATRA_CATEGORY_TOP_AND_LEFT                    = INDIC_MATRA_CATEGORY_TOP,
+  INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT          = INDIC_MATRA_CATEGORY_RIGHT,
+  INDIC_MATRA_CATEGORY_TOP_AND_RIGHT                   = INDIC_MATRA_CATEGORY_RIGHT,
+
+  INDIC_MATRA_CATEGORY_OVERSTRUCK                      = POS_AFTER_MAIN,
+  INDIC_MATRA_CATEGORY_VISUAL_ORDER_LEFT               = POS_PRE_M
 };
 
 /* Note: We use ASSERT_STATIC_EXPR_ZERO() instead of ASSERT_STATIC_EXPR() and the comma operation
  * because gcc fails to optimize the latter and fills the table in at runtime. */
 #define INDIC_COMBINE_CATEGORIES(S,M) \
-  (ASSERT_STATIC_EXPR_ZERO (M == INDIC_MATRA_CATEGORY_NOT_APPLICABLE || (S == INDIC_SYLLABIC_CATEGORY_VIRAMA || S == INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT)) + \
+  (ASSERT_STATIC_EXPR_ZERO (M == INDIC_MATRA_CATEGORY_NOT_APPLICABLE || \
+                           ( \
+                            S == INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL || \
+                            S == INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK || \
+                            S == INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER || \
+                            S == INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA || \
+                            S == INDIC_SYLLABIC_CATEGORY_VIRAMA || \
+                            S == INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT || \
+                            false)) + \
    ASSERT_STATIC_EXPR_ZERO (S < 255 && M < 255) + \
    ((M << 8) | S))
 
-
-#include "hb-ot-shape-complex-indic-table.hh"
-
-
-#define IN_HALF_BLOCK(u, Base) (((u) & ~0x7F) == (Base))
-
-#define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900))
-#define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980))
-#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00))
-#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80))
-#define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00))
-#define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80))
-#define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00))
-#define IS_KNDA(u) (IN_HALF_BLOCK (u, 0x0C80))
-#define IS_MLYM(u) (IN_HALF_BLOCK (u, 0x0D00))
-#define IS_SINH(u) (IN_HALF_BLOCK (u, 0x0D80))
-#define IS_KHMR(u) (IN_HALF_BLOCK (u, 0x1780))
-
-
-#define MATRA_POS_LEFT(u)      POS_PRE_M
-#define MATRA_POS_RIGHT(u)     ( \
-                                 IS_DEVA(u) ? POS_AFTER_SUB  : \
-                                 IS_BENG(u) ? POS_AFTER_POST : \
-                                 IS_GURU(u) ? POS_AFTER_POST : \
-                                 IS_GUJR(u) ? POS_AFTER_POST : \
-                                 IS_ORYA(u) ? POS_AFTER_POST : \
-                                 IS_TAML(u) ? POS_AFTER_POST : \
-                                 IS_TELU(u) ? (u <= 0x0C42 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
-                                 IS_KNDA(u) ? (u < 0x0CC3 || u > 0xCD6 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
-                                 IS_MLYM(u) ? POS_AFTER_POST : \
-                                 IS_SINH(u) ? POS_AFTER_SUB  : \
-                                 IS_KHMR(u) ? POS_AFTER_POST : \
-                                 /*default*/  POS_AFTER_SUB    \
-                               )
-#define MATRA_POS_TOP(u)       ( /* BENG and MLYM don't have top matras. */ \
-                                 IS_DEVA(u) ? POS_AFTER_SUB  : \
-                                 IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \
-                                 IS_GUJR(u) ? POS_AFTER_SUB  : \
-                                 IS_ORYA(u) ? POS_AFTER_MAIN : \
-                                 IS_TAML(u) ? POS_AFTER_SUB  : \
-                                 IS_TELU(u) ? POS_BEFORE_SUB : \
-                                 IS_KNDA(u) ? POS_BEFORE_SUB : \
-                                 IS_SINH(u) ? POS_AFTER_SUB  : \
-                                 IS_KHMR(u) ? POS_AFTER_POST : \
-                                 /*default*/  POS_AFTER_SUB    \
-                               )
-#define MATRA_POS_BOTTOM(u)    ( \
-                                 IS_DEVA(u) ? POS_AFTER_SUB  : \
-                                 IS_BENG(u) ? POS_AFTER_SUB  : \
-                                 IS_GURU(u) ? POS_AFTER_POST : \
-                                 IS_GUJR(u) ? POS_AFTER_POST : \
-                                 IS_ORYA(u) ? POS_AFTER_SUB  : \
-                                 IS_TAML(u) ? POS_AFTER_POST : \
-                                 IS_TELU(u) ? POS_BEFORE_SUB : \
-                                 IS_KNDA(u) ? POS_BEFORE_SUB : \
-                                 IS_MLYM(u) ? POS_AFTER_POST : \
-                                 IS_SINH(u) ? POS_AFTER_SUB  : \
-                                 IS_KHMR(u) ? POS_AFTER_POST : \
-                                 /*default*/  POS_AFTER_SUB    \
-                               )
-
-
-static inline indic_position_t
-matra_position (hb_codepoint_t u, indic_position_t side)
-{
-  switch ((int) side)
-  {
-    case POS_PRE_C:    return MATRA_POS_LEFT (u);
-    case POS_POST_C:   return MATRA_POS_RIGHT (u);
-    case POS_ABOVE_C:  return MATRA_POS_TOP (u);
-    case POS_BELOW_C:  return MATRA_POS_BOTTOM (u);
-  };
-  return side;
-}
-
-
-
-/* XXX
- * This is a hack for now.  We should move this data into the main Indic table.
- * Or completely remove it and just check in the tables.
- */
-static const hb_codepoint_t ra_chars[] = {
-  0x0930, /* Devanagari */
-  0x09B0, /* Bengali */
-  0x09F0, /* Bengali */
-  0x0A30, /* Gurmukhi */       /* No Reph */
-  0x0AB0, /* Gujarati */
-  0x0B30, /* Oriya */
-  0x0BB0, /* Tamil */          /* No Reph */
-  0x0C30, /* Telugu */         /* Reph formed only with ZWJ */
-  0x0CB0, /* Kannada */
-  0x0D30, /* Malayalam */      /* No Reph, Logical Repha */
-
-  0x0DBB, /* Sinhala */                /* Reph formed only with ZWJ */
-
-  0x179A, /* Khmer */          /* No Reph, Visual Repha */
-};
-
-static inline indic_position_t
-consonant_position (hb_codepoint_t  u)
-{
-  if ((u & ~0x007F) == 0x1780)
-    return POS_BELOW_C; /* In Khmer coeng model, post and below forms should not be reordered. */
-  return POS_BASE_C; /* Will recategorize later based on font lookups. */
-}
-
-static inline bool
-is_ra (hb_codepoint_t u)
-{
-  for (unsigned int i = 0; i < ARRAY_LENGTH (ra_chars); i++)
-    if (u == ra_chars[i])
-      return true;
-  return false;
-}
-
-
-static inline bool
-is_one_of (const hb_glyph_info_t &info, unsigned int flags)
-{
-  /* If it ligated, all bets are off. */
-  if (is_a_ligature (info)) return false;
-  return !!(FLAG (info.indic_category()) & flags);
-}
-
-#define JOINER_FLAGS (FLAG (OT_ZWJ) | FLAG (OT_ZWNJ))
-static inline bool
-is_joiner (const hb_glyph_info_t &info)
-{
-  return is_one_of (info, JOINER_FLAGS);
-}
-
-/* Note:
- *
- * We treat Vowels and placeholders as if they were consonants.  This is safe because Vowels
- * cannot happen in a consonant syllable.  The plus side however is, we can call the
- * consonant syllable logic from the vowel syllable function and get it all right! */
-#define CONSONANT_FLAGS (FLAG (OT_C) | FLAG (OT_CM) | FLAG (OT_Ra) | FLAG (OT_V) | FLAG (OT_NBSP) | FLAG (OT_DOTTEDCIRCLE))
-static inline bool
-is_consonant (const hb_glyph_info_t &info)
-{
-  return is_one_of (info, CONSONANT_FLAGS);
-}
-
-#define HALANT_OR_COENG_FLAGS (FLAG (OT_H) | FLAG (OT_Coeng))
-static inline bool
-is_halant_or_coeng (const hb_glyph_info_t &info)
-{
-  return is_one_of (info, HALANT_OR_COENG_FLAGS);
-}
-
-static inline void
-set_indic_properties (hb_glyph_info_t &info)
-{
-  hb_codepoint_t u = info.codepoint;
-  unsigned int type = get_indic_categories (u);
-  indic_category_t cat = (indic_category_t) (type & 0x7F);
-  indic_position_t pos = (indic_position_t) (type >> 8);
-
-
-  /*
-   * Re-assign category
-   */
-
-
-  /* The spec says U+0952 is OT_A.  However, testing shows that Uniscribe
-   * treats U+0951..U+0952 all as OT_VD.
-   * TESTS:
-   * U+092E,U+0947,U+0952
-   * U+092E,U+0952,U+0947
-   * U+092E,U+0947,U+0951
-   * U+092E,U+0951,U+0947
-   * */
-  if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x0951, 0x0954)))
-    cat = OT_VD;
-
-  if (unlikely (u == 0x17D1))
-    cat = OT_X;
-  if (cat == OT_X &&
-      unlikely (hb_in_range<hb_codepoint_t> (u, 0x17CB, 0x17D3))) /* Khmer Various signs */
-  {
-    /* These are like Top Matras. */
-    cat = OT_M;
-    pos = POS_ABOVE_C;
-  }
-  if (u == 0x17C6) /* Khmer Bindu doesn't like to be repositioned. */
-    cat = OT_N;
-
-  if (unlikely (u == 0x17D2)) cat = OT_Coeng; /* Khmer coeng */
-  else if (unlikely (u == 0x200C)) cat = OT_ZWNJ;
-  else if (unlikely (u == 0x200D)) cat = OT_ZWJ;
-  else if (unlikely (u == 0x25CC)) cat = OT_DOTTEDCIRCLE;
-  else if (unlikely (u == 0x0A71)) cat = OT_SM; /* GURMUKHI ADDAK.  More like consonant medial. like 0A75. */
-
-  if (cat == OT_Repha) {
-    /* There are two kinds of characters marked as Repha:
-     * - The ones that are GenCat=Mn are already positioned visually, ie. after base. (eg. Khmer)
-     * - The ones that are GenCat=Lo is encoded logically, ie. beginning of syllable. (eg. Malayalam)
-     *
-     * We recategorize the first kind to look like a Nukta and attached to the base directly.
-     */
-    if (_hb_glyph_info_get_general_category (&info) == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)
-      cat = OT_N;
-  }
-
-
-
-  /*
-   * Re-assign position.
-   */
-
-  if ((FLAG (cat) & CONSONANT_FLAGS))
-  {
-    pos = consonant_position (u);
-    if (is_ra (u))
-      cat = OT_Ra;
-  }
-  else if (cat == OT_M)
-  {
-    pos = matra_position (u, pos);
-  }
-  else if (cat == OT_SM || cat == OT_VD)
-  {
-    pos = POS_SMVD;
-  }
-
-  if (unlikely (u == 0x0B01)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */
-
-
-
-  info.indic_category() = cat;
-  info.indic_position() = pos;
-}
-
-
+HB_INTERNAL INDIC_TABLE_ELEMENT_TYPE
+hb_indic_get_categories (hb_codepoint_t u);
 
 #endif /* HB_OT_SHAPE_COMPLEX_INDIC_PRIVATE_HH */