text/dali/internal/libunibreak/linebreakdef.h

   1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Line breaking in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
   8  * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
   9  *
  10  * This software is provided 'as-is', without any express or implied
  11  * warranty.  In no event will the author be held liable for any damages
  12  * arising from the use of this software.
  13  *
  14  * Permission is granted to anyone to use this software for any purpose,
  15  * including commercial applications, and to alter it and redistribute
  16  * it freely, subject to the following restrictions:
  17  *
  18  * 1. The origin of this software must not be misrepresented; you must
  19  *    not claim that you wrote the original software.  If you use this
  20  *    software in a product, an acknowledgement in the product
  21  *    documentation would be appreciated but is not required.
  22  * 2. Altered source versions must be plainly marked as such, and must
  23  *    not be misrepresented as being the original software.
  24  * 3. This notice may not be removed or altered from any source
  25  *    distribution.
  26  *
  27  * The main reference is Unicode Standard Annex 14 (UAX #14):
  28  *      <URL:http://www.unicode.org/reports/tr14/>
  29  *
  30  * When this library was designed, this annex was at Revision 19, for
  31  * Unicode 5.0.0:
  32  *      <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
  33  *
  34  * This library has been updated according to Revision 30, for
  35  * Unicode 6.2.0:
  36  *      <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
  37  *
  38  * The Unicode Terms of Use are available at
  39  *      <URL:http://www.unicode.org/copyright.html>
  40  */
  41
  42 /**
  43  * @file    linebreakdef.h
  44  *
  45  * Definitions of internal data structures, declarations of global
  46  * variables, and function prototypes for the line breaking algorithm.
  47  *
  48  * @version 2.4, 2013/11/10
  49  * @author  Wu Yongwei
  50  * @author  Petr Filipsky
  51  */
  52
  53 /**
  54  * Sentinel value marking the end of the string.  0xFFFFFFFF lies above
  55  * U+10FFFF, so it is not a valid Unicode character.
  56  */
  57 #define EOS 0xFFFFFFFF
  58
  59 /**
  60  * Line break classes.  This is a direct mapping of Table 1 of Unicode
  61  * Standard Annex 14, Revision 30 (the revision this library tracks).
  62  */
  63 enum LineBreakClass
  64 {
  65     /* This is used to signal an error condition. */
  66     LBP_Undefined,  /**< Undefined */
  67
  68     /* The following break classes are treated in the pair table. */
  69     LBP_OP,         /**< Opening punctuation */
  70     LBP_CL,         /**< Closing punctuation */
  71     LBP_CP,         /**< Closing parenthesis */
  72     LBP_QU,         /**< Ambiguous quotation */
  73     LBP_GL,         /**< Glue */
  74     LBP_NS,         /**< Non-starters */
  75     LBP_EX,         /**< Exclamation/Interrogation */
  76     LBP_SY,         /**< Symbols allowing break after */
  77     LBP_IS,         /**< Infix separator */
  78     LBP_PR,         /**< Prefix */
  79     LBP_PO,         /**< Postfix */
  80     LBP_NU,         /**< Numeric */
  81     LBP_AL,         /**< Alphabetic */
  82     LBP_HL,         /**< Hebrew letter */
  83     LBP_ID,         /**< Ideographic */
  84     LBP_IN,         /**< Inseparable characters */
  85     LBP_HY,         /**< Hyphen */
  86     LBP_BA,         /**< Break after */
  87     LBP_BB,         /**< Break before */
  88     LBP_B2,         /**< Break on either side (but not pair) */
  89     LBP_ZW,         /**< Zero-width space */
  90     LBP_CM,         /**< Combining marks */
  91     LBP_WJ,         /**< Word joiner */
  92     LBP_H2,         /**< Hangul LV */
  93     LBP_H3,         /**< Hangul LVT */
  94     LBP_JL,         /**< Hangul L Jamo */
  95     LBP_JV,         /**< Hangul V Jamo */
  96     LBP_JT,         /**< Hangul T Jamo */
  97     LBP_RI,         /**< Regional indicator */
  98
  99     /* The following break classes are not treated in the pair table. */
 100     LBP_AI,         /**< Ambiguous (alphabetic or ideograph) */
 101     LBP_BK,         /**< Break (mandatory) */
 102     LBP_CB,         /**< Contingent break */
 103     LBP_CJ,         /**< Conditional Japanese starter */
 104     LBP_CR,         /**< Carriage return */
 105     LBP_LF,         /**< Line feed */
 106     LBP_NL,         /**< Next line */
 107     LBP_SA,         /**< South-East Asian */
 108     LBP_SG,         /**< Surrogates */
 109     LBP_SP,         /**< Space */
 110     LBP_XX          /**< Unknown */
 111 };
 112
 113 /**
 114  * Struct for entries of line break properties: a code point range and
 115  * its line breaking class.  The array of the entries \e must be sorted.
 116  */
 117 struct LineBreakProperties
 118 {
 119     utf32_t start;              /**< Starting code point */
 120     utf32_t end;                /**< End code point */
 121     enum LineBreakClass prop;   /**< The line breaking property */
 122 };
 123
 124 /**
 125  * Struct associating a language name with its language-specific line
 126  * breaking properties.
 127  */
 128 struct LineBreakPropertiesLang
 129 {
 130     const char *lang;                   /**< Language name */
 131     size_t namelen;                     /**< Length of name to match */
 132     struct LineBreakProperties *lbp;    /**< Pointer to associated properties */
 133 };
 134
 135 /**
 136  * Internal state of the line breaking algorithm, passed by pointer to
 137  * #lb_init_break_context and #lb_process_next_char for incremental analysis.
 138  */
 139 struct LineBreakContext
 140 {
 141     const char *lang;               /**< Language name */
 142     struct LineBreakProperties *lbpLang;/**< Pointer to LineBreakProperties */
 143     enum LineBreakClass lbcCur;     /**< Breaking class of current codepoint */
 144     enum LineBreakClass lbcNew;     /**< Breaking class of next codepoint */
 145     enum LineBreakClass lbcLast;    /**< Breaking class of last codepoint */
 146 };
 147
 148 /**
 149  * Common function pointer type for #lb_get_next_char_utf8,
 150  * #lb_get_next_char_utf16, and #lb_get_next_char_utf32 (args: s, len, ip).
 151  */
 152 typedef utf32_t (*get_next_char_t)(const void *, size_t, size_t *);
 153
 154 /* Declarations of the property tables; the definitions live elsewhere. */
 155 extern struct LineBreakProperties lb_prop_default[];
 156 extern struct LineBreakPropertiesLang lb_prop_lang_map[];
 157
 158 /* Function prototypes: per-encoding readers, context API, and the driver. */
 159 utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip);
 160 utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip);
 161 utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip);
 162 void lb_init_break_context(
 163         struct LineBreakContext* lbpCtx,
 164         utf32_t ch,
 165         const char* lang);
 166 int lb_process_next_char(
 167         struct LineBreakContext* lbpCtx,
 168         utf32_t ch);
 169 void set_linebreaks(
 170         const void *s,
 171         size_t len,
 172         const char *lang,
 173         char *brks,
 174         get_next_char_t get_next_char);