1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
7 * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
8 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the author be held liable for any damages
12 * arising from the use of this software.
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute
16 * it freely, subject to the following restrictions:
18 * 1. The origin of this software must not be misrepresented; you must
19 * not claim that you wrote the original software. If you use this
20 * software in a product, an acknowledgement in the product
21 * documentation would be appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must
23 * not be misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source distribution.
27 * The main reference is Unicode Standard Annex 14 (UAX #14):
28 * <URL:http://www.unicode.org/reports/tr14/>
30 * When this library was designed, this annex was at Revision 19, for Unicode 5.0.0:
32 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
34 * This library has been updated according to Revision 30, for Unicode 6.2.0:
36 * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
38 * The Unicode Terms of Use are available at
39 * <URL:http://www.unicode.org/copyright.html>
43 * @file linebreakdef.h
45 * Definitions of internal data structures, declarations of global
46 * variables, and function prototypes for the line breaking algorithm.
48 * @version 2.4, 2013/11/10
50 * @author Petr Filipsky
/**
 * Constant value to mark the end of string.  It is not a valid Unicode
 * character.
 */
#define EOS 0xFFFFFFFF
/**
 * Line break classes.  This is a direct mapping of Table 1 of Unicode
 * Standard Annex 14, Revision 26.
 *
 * The enumerators have no explicit values, so they are numbered
 * consecutively from 0 in declaration order; do not reorder them.
 *
 * NOTE(review): the listing this definition was recovered from skips a
 * line between LBP_QU and LBP_NS.  UAX #14 also defines a GL
 * (non-breaking glue) class, so an enumerator may be missing at that
 * position -- confirm against the upstream header before relying on the
 * numeric values.
 */
enum LineBreakClass
{
    /* This is used to signal an error condition. */
    LBP_Undefined,  /**< Undefined */

    /* The following break classes are treated in the pair table. */
    LBP_OP,         /**< Opening punctuation */
    LBP_CL,         /**< Closing punctuation */
    LBP_CP,         /**< Closing parenthesis */
    LBP_QU,         /**< Ambiguous quotation */
    LBP_NS,         /**< Non-starters */
    LBP_EX,         /**< Exclamation/Interrogation */
    LBP_SY,         /**< Symbols allowing break after */
    LBP_IS,         /**< Infix separator */
    LBP_PR,         /**< Prefix */
    LBP_PO,         /**< Postfix */
    LBP_NU,         /**< Numeric */
    LBP_AL,         /**< Alphabetic */
    LBP_HL,         /**< Hebrew letter */
    LBP_ID,         /**< Ideographic */
    LBP_IN,         /**< Inseparable characters */
    LBP_HY,         /**< Hyphen */
    LBP_BA,         /**< Break after */
    LBP_BB,         /**< Break before */
    LBP_B2,         /**< Break on either side (but not pair) */
    LBP_ZW,         /**< Zero-width space */
    LBP_CM,         /**< Combining marks */
    LBP_WJ,         /**< Word joiner */
    LBP_H2,         /**< Hangul LV */
    LBP_H3,         /**< Hangul LVT */
    LBP_JL,         /**< Hangul L Jamo */
    LBP_JV,         /**< Hangul V Jamo */
    LBP_JT,         /**< Hangul T Jamo */
    LBP_RI,         /**< Regional indicator */

    /* The following break classes are not treated in the pair table */
    LBP_AI,         /**< Ambiguous (alphabetic or ideograph) */
    LBP_BK,         /**< Break (mandatory) */
    LBP_CB,         /**< Contingent break */
    LBP_CJ,         /**< Conditional Japanese starter */
    LBP_CR,         /**< Carriage return */
    LBP_LF,         /**< Line feed */
    LBP_NL,         /**< Next line */
    LBP_SA,         /**< South-East Asian */
    LBP_SG,         /**< Surrogates */
    LBP_SP,         /**< Space */
    LBP_XX          /**< Unknown */
};
114 * Struct for entries of line break properties. The array of the
115 * entries \e must be sorted.
117 struct LineBreakProperties
119 utf32_t start; /**< Starting coding point */
120 utf32_t end; /**< End coding point */
121 enum LineBreakClass prop; /**< The line breaking property */
/**
 * Struct for association of language-specific line breaking properties
 * with language names.
 */
struct LineBreakPropertiesLang
{
    const char *lang;                   /**< Language name */
    size_t namelen;                     /**< Length of name to match */
    struct LineBreakProperties *lbp;    /**< Pointer to associated data */
};
136 * Context representing internal state of the line breaking algorithm.
137 * This is useful to callers if incremental analysis is wanted.
139 struct LineBreakContext
141 const char *lang; /**< Language name */
142 struct LineBreakProperties *lbpLang;/**< Pointer to LineBreakProperties */
143 enum LineBreakClass lbcCur; /**< Breaking class of current codepoint */
144 enum LineBreakClass lbcNew; /**< Breaking class of next codepoint */
145 enum LineBreakClass lbcLast; /**< Breaking class of last codepoint */
149 * Abstract function interface for #lb_get_next_char_utf8,
150 * #lb_get_next_char_utf16, and #lb_get_next_char_utf32.
152 typedef utf32_t (*get_next_char_t)(const void *, size_t, size_t *);
155 extern struct LineBreakProperties lb_prop_default[];
156 extern struct LineBreakPropertiesLang lb_prop_lang_map[];
158 /* Function Prototype */
159 utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip);
160 utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip);
161 utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip);
/* NOTE(review): the text below appears to be incomplete.  The embedded
 * listing numbers jump from 163 to 166 and from 167 to 174, so trailing
 * parameters (and possibly an entire declaration) seem to be missing.
 * Restore the full prototypes from the upstream header before relying on
 * these declarations. */
162 void lb_init_break_context(
163 struct LineBreakContext* lbpCtx,
166 int lb_process_next_char(
167 struct LineBreakContext* lbpCtx,
174 get_next_char_t get_next_char);