1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
7 * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
8 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the author be held liable for any damages
12 * arising from the use of this software.
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute
16 * it freely, subject to the following restrictions:
18 * 1. The origin of this software must not be misrepresented; you must
19 * not claim that you wrote the original software. If you use this
20 * software in a product, an acknowledgement in the product
21 * documentation would be appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must
23 * not be misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source
27 * The main reference is Unicode Standard Annex 14 (UAX #14):
28 * <URL:http://www.unicode.org/reports/tr14/>
30 * When this library was designed, this annex was at Revision 19, for
32 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
34 * This library has been updated according to Revision 30, for
36 * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
38 * The Unicode Terms of Use are available at
39 * <URL:http://www.unicode.org/copyright.html>
45 * Implementation of the line breaking algorithm as described in Unicode
48 * @version 2.5, 2013/11/14
50 * @author Petr Filipsky
56 #include "linebreak.h"
57 #include "linebreakdef.h"
60 * Special value used internally to indicate an undefined break result.
62 #define LINEBREAK_UNDEFINED -1
65 * Size of the second-level index to the line breaking properties.
67 #define LINEBREAK_INDEX_SIZE 40
70 * Version number of the library.
72 const int linebreak_version = LINEBREAK_VERSION;
75 * Enumeration of break actions. They are used in the break action
80 DIR_BRK, /**< Direct break opportunity */
81 IND_BRK, /**< Indirect break opportunity */
82 CMI_BRK, /**< Indirect break opportunity for combining marks */
83 CMP_BRK, /**< Prohibited break for combining marks */
84 PRH_BRK /**< Prohibited break */
88 * Break action pair table. This is a direct mapping of Table 2 of
89 * Unicode Standard Annex 14, Revision 30.
91 static enum BreakAction baTable[LBP_RI][LBP_RI] = {
93 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
94 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
95 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
96 CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
99 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
100 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
101 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
102 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
105 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
106 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
107 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
108 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
111 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
112 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
113 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
114 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
117 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
118 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
119 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
120 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
123 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
124 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
125 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
126 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
129 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
130 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
131 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
132 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
135 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
136 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
137 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
138 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
141 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
142 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
143 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
144 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
147 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
148 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
149 IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
150 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
153 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
154 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
155 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
156 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
159 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
160 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
161 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
162 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
165 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
166 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
167 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
168 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
171 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
172 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
173 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
174 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
177 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
178 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
179 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
180 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
183 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
184 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
185 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
186 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
189 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
190 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
191 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
192 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
195 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
196 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
197 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
198 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
201 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
202 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
203 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
204 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
207 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
208 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
209 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
210 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
213 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
214 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
215 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
216 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
219 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
220 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
221 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
222 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
225 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
226 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
227 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
228 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
231 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
232 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
233 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
234 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
237 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
238 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
239 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
240 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
243 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
244 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
245 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
246 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
249 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
250 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
251 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
252 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
255 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
256 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
257 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
258 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
261 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
262 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
263 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
264 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
269 * Struct for the second-level index to the line breaking properties.
271 struct LineBreakPropertiesIndex
273 utf32_t end; /**< End coding point */
274 struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
278 * Second-level index to the line breaking properties.
280 static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
282 { 0xFFFFFFFF, lb_prop_default }
286 * Initializes the second-level index to the line breaking properties.
287 * If it is not called, the performance of #get_char_lb_class_lang (and
288 * thus the main functionality) can be pretty bad, especially for big
289 * code points like those of Chinese.
291 void init_linebreak(void)
299 while (lb_prop_default[len].prop != LBP_Undefined)
301 step = len / LINEBREAK_INDEX_SIZE;
303 for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
305 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
306 iPropDefault += step;
307 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
309 lb_prop_index[--i].end = 0xFFFFFFFF;
313 * Gets the language-specific line breaking properties.
315 * @param lang language of the text
316 * @return pointer to the language-specific line breaking
317 * properties array if found; \c NULL otherwise
319 static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
321 struct LineBreakPropertiesLang *lbplIter;
324 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
326 if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
328 return lbplIter->lbp;
336 * Gets the line breaking class of a character from a line breaking
339 * @param ch character to check
340 * @param lbp pointer to the line breaking properties array
341 * @return the line breaking class if found; \c LBP_XX otherwise
343 static enum LineBreakClass get_char_lb_class(
345 struct LineBreakProperties *lbp)
347 while (lbp->prop != LBP_Undefined && ch >= lbp->start)
357 * Gets the line breaking class of a character from the default line
358 * breaking properties array.
360 * @param ch character to check
361 * @return the line breaking class if found; \c LBP_XX otherwise
363 static enum LineBreakClass get_char_lb_class_default(
367 while (ch > lb_prop_index[i].end)
369 assert(i < LINEBREAK_INDEX_SIZE);
370 return get_char_lb_class(ch, lb_prop_index[i].lbp);
374 * Gets the line breaking class of a character for a specific
375 * language. This function will check the language-specific data first,
376 * and then the default data if there is no language-specific property
377 * available for the character.
379 * @param ch character to check
380 * @param lbpLang pointer to the language-specific line breaking
382 * @return the line breaking class if found; \c LBP_XX
385 static enum LineBreakClass get_char_lb_class_lang(
387 struct LineBreakProperties *lbpLang)
389 enum LineBreakClass lbcResult;
391 /* Find the language-specific line breaking class for a character */
394 lbcResult = get_char_lb_class(ch, lbpLang);
395 if (lbcResult != LBP_XX)
399 /* Find the default (language-independent) line breaking class, if no
400 * language context is provided, or language-specific data are not
401 * available for the specific character in the specified language */
402 return get_char_lb_class_default(ch);
406 * Resolves the line breaking class for certain ambiguous or complicated
407 * characters. They are treated in a simplistic way in this
410 * @param lbc line breaking class to resolve
411 * @param lang language of the text
412 * @return the resolved line breaking class
414 static enum LineBreakClass resolve_lb_class(
415 enum LineBreakClass lbc,
422 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
423 strncmp(lang, "ja", 2) == 0 || /* Japanese */
424 strncmp(lang, "ko", 2) == 0)) /* Korean */
433 /* Simplified for `normal' line breaking. See
434 * <URL:http://www.unicode.org/reports/tr14/tr14-30.html#CJ>
447 * Applies special treatment to the first character in a line.
449 * @param[in,out] lbpCtx pointer to the line breaking context
450 * @pre \a lbpCtx->lbcCur has a valid line break class
451 * @post \a lbpCtx->lbcCur has the updated line break class
453 static void treat_first_char(
454 struct LineBreakContext* lbpCtx)
456 switch (lbpCtx->lbcCur)
460 lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */
463 lbpCtx->lbcCur = LBP_BA; /* Rule LB20 */
466 lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
474 * Tries telling the line break opportunity by simple rules.
476 * @param[in,out] lbpCtx pointer to the line breaking context
477 * @pre \a lbpCtx->lbcCur has the current line break
478 * class; and \a lbpCtx->lbcNew has the line
479 * break class for the next character
480 * @post \a lbpCtx->lbcCur has the updated line break
482 * @return break result, one of #LINEBREAK_MUSTBREAK,
483 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
484 * if identified; or #LINEBREAK_UNDEFINED if
485 * table lookup is needed
487 static int get_lb_result_simple(
488 struct LineBreakContext* lbpCtx)
490 if (lbpCtx->lbcCur == LBP_BK
491 || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
493 return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */
496 switch (lbpCtx->lbcNew)
499 return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */
503 lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */
504 return LINEBREAK_NOBREAK; /* Rule LB6 */
506 lbpCtx->lbcCur = LBP_CR;
507 return LINEBREAK_NOBREAK; /* Rule LB6 */
509 lbpCtx->lbcCur = LBP_BA;
510 return LINEBREAK_ALLOWBREAK; /* Rule LB20 */
512 return LINEBREAK_UNDEFINED; /* Table lookup is needed */
517 * Tells the line break opportunity by table lookup.
519 * @param[in,out] lbpCtx pointer to the line breaking context
520 * @pre \a lbpCtx->lbcCur has the current line break
521 * class; \a lbpCtx->lbcLast has the line break
522 * class for the last character; and \a
523 * lbpCtx->lbcNew has the line break class for
525 * @post \a lbpCtx->lbcCur has the updated line break
527 * @return break result, one of #LINEBREAK_MUSTBREAK,
528 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
530 static int get_lb_result_lookup(
531 struct LineBreakContext* lbpCtx)
533 /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
534 * yet implemented below. */
535 int brk = LINEBREAK_UNDEFINED;
536 assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI));
537 assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI));
539 enum LineBreakClass lbcCur, lbcNew;
541 switch (lbpCtx->lbcCur)
543 case LBP_H2: /**< Hangul LV */
544 case LBP_H3: /**< Hangul LVT */
545 case LBP_JL: /**< Hangul L Jamo */
546 case LBP_JV: /**< Hangul V Jamo */
547 case LBP_JT: /**< Hangul T Jamo */
551 lbcCur = lbpCtx->lbcCur;
555 switch (lbpCtx->lbcNew)
557 case LBP_H2: /**< Hangul LV */
558 case LBP_H3: /**< Hangul LVT */
559 case LBP_JL: /**< Hangul L Jamo */
560 case LBP_JV: /**< Hangul V Jamo */
561 case LBP_JT: /**< Hangul T Jamo */
565 lbcNew = lbpCtx->lbcNew;
569 switch (baTable[lbcCur - 1][lbcNew - 1])
572 brk = LINEBREAK_ALLOWBREAK;
576 brk = (lbpCtx->lbcLast == LBP_SP)
577 ? LINEBREAK_ALLOWBREAK
581 brk = LINEBREAK_NOBREAK;
582 if (lbpCtx->lbcLast != LBP_SP)
583 return brk; /* Do not update lbcCur */
586 brk = LINEBREAK_NOBREAK;
590 lbpCtx->lbcCur = lbpCtx->lbcNew;
595 * Initializes line breaking context for a given language.
597 * @param[in,out] lbpCtx pointer to the line breaking context
598 * @param[in] ch the first character to process
599 * @param[in] lang language of the input
600 * @post the line breaking context is initialized
602 void lb_init_break_context(
603 struct LineBreakContext* lbpCtx,
608 lbpCtx->lbpLang = get_lb_prop_lang(lang);
609 lbpCtx->lbcLast = LBP_Undefined;
610 lbpCtx->lbcNew = LBP_Undefined;
611 lbpCtx->lbcCur = resolve_lb_class(
612 get_char_lb_class_lang(ch, lbpCtx->lbpLang),
614 treat_first_char(lbpCtx);
618 * Updates LineBreakingContext for the next code point and returns
619 * the detected break.
621 * @param[in,out] lbpCtx pointer to the line breaking context
622 * @param[in] ch Unicode code point
623 * @return break result, one of #LINEBREAK_MUSTBREAK,
624 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
625 * @post the line breaking context is updated
627 int lb_process_next_char(
628 struct LineBreakContext* lbpCtx,
633 lbpCtx->lbcLast = lbpCtx->lbcNew;
634 lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
635 brk = get_lb_result_simple(lbpCtx);
638 case LINEBREAK_MUSTBREAK:
639 lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
640 treat_first_char(lbpCtx);
642 case LINEBREAK_UNDEFINED:
643 lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
644 brk = get_lb_result_lookup(lbpCtx);
653 * Gets the next Unicode character in a UTF-8 sequence. The index will
654 * be advanced to the next complete character, unless the end of string
655 * is reached in the middle of a UTF-8 sequence.
657 * @param[in] s input UTF-8 string
658 * @param[in] len length of the string in bytes
659 * @param[in,out] ip pointer to the index
660 * @return the Unicode character beginning at the index; or
661 * #EOS if end of input is encountered
663 utf32_t lb_get_next_char_utf8(
676 if (ch < 0xC2 || ch > 0xF4)
677 { /* One-byte sequence, tail (should not occur), or invalid */
682 { /* Two-byte sequence */
685 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
690 { /* Three-byte sequence */
693 res = ((ch & 0x0F) << 12) +
694 ((s[*ip + 1] & 0x3F) << 6) +
695 ((s[*ip + 2] & 0x3F));
700 { /* Four-byte sequence */
703 res = ((ch & 0x07) << 18) +
704 ((s[*ip + 1] & 0x3F) << 12) +
705 ((s[*ip + 2] & 0x3F) << 6) +
706 ((s[*ip + 3] & 0x3F));
713 * Gets the next Unicode character in a UTF-16 sequence. The index will
714 * be advanced to the next complete character, unless the end of string
715 * is reached in the middle of a UTF-16 surrogate pair.
717 * @param[in] s input UTF-16 string
718 * @param[in] len length of the string in words
719 * @param[in,out] ip pointer to the index
720 * @return the Unicode character beginning at the index; or
721 * #EOS if end of input is encountered
723 utf32_t lb_get_next_char_utf16(
735 if (ch < 0xD800 || ch > 0xDBFF)
736 { /* If the character is not a high surrogate */
740 { /* If the input ends here (an error) */
744 if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
745 { /* If the next character is not the low surrogate (an error) */
748 /* Return the constructed character and advance the index again */
749 return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
753 * Gets the next Unicode character in a UTF-32 sequence. The index will
754 * be advanced to the next character.
756 * @param[in] s input UTF-32 string
757 * @param[in] len length of the string in dwords
758 * @param[in,out] ip pointer to the index
759 * @return the Unicode character beginning at the index; or
760 * #EOS if end of input is encountered
762 utf32_t lb_get_next_char_utf32(
774 * Sets the line breaking information for a generic input string.
776 * @param[in] s input string
777 * @param[in] len length of the input
778 * @param[in] lang language of the input
779 * @param[out] brks pointer to the output breaking data,
780 * containing #LINEBREAK_MUSTBREAK,
781 * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
782 * or #LINEBREAK_INSIDEACHAR
783 * @param[in] get_next_char function to get the next UTF-32 character
790 get_next_char_t get_next_char)
793 struct LineBreakContext lbCtx;
797 --posLast; /* To be ++'d later */
798 ch = get_next_char(s, len, &posCur);
801 lb_init_break_context(&lbCtx, ch, lang);
803 /* Process a line till an explicit break or end of string */
806 for (++posLast; posLast < posCur - 1; ++posLast)
808 brks[posLast] = LINEBREAK_INSIDEACHAR;
810 assert(posLast == posCur - 1);
811 ch = get_next_char(s, len, &posCur);
814 brks[posLast] = lb_process_next_char(&lbCtx, ch);
817 assert(posLast == posCur - 1 && posCur <= len);
818 /* Break after the last character */
819 brks[posLast] = LINEBREAK_MUSTBREAK;
820 /* When the input contains incomplete sequences */
823 brks[posCur++] = LINEBREAK_INSIDEACHAR;
828 * Sets the line breaking information for a UTF-8 input string.
830 * @param[in] s input UTF-8 string
831 * @param[in] len length of the input
832 * @param[in] lang language of the input
833 * @param[out] brks pointer to the output breaking data, containing
834 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
835 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
837 void set_linebreaks_utf8(
843 set_linebreaks(s, len, lang, brks,
844 (get_next_char_t)lb_get_next_char_utf8);
848 * Sets the line breaking information for a UTF-16 input string.
850 * @param[in] s input UTF-16 string
851 * @param[in] len length of the input
852 * @param[in] lang language of the input
853 * @param[out] brks pointer to the output breaking data, containing
854 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
855 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
857 void set_linebreaks_utf16(
863 set_linebreaks(s, len, lang, brks,
864 (get_next_char_t)lb_get_next_char_utf16);
868 * Sets the line breaking information for a UTF-32 input string.
870 * @param[in] s input UTF-32 string
871 * @param[in] len length of the input
872 * @param[in] lang language of the input
873 * @param[out] brks pointer to the output breaking data, containing
874 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
875 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
877 void set_linebreaks_utf32(
883 set_linebreaks(s, len, lang, brks,
884 (get_next_char_t)lb_get_next_char_utf32);
888 * Tells whether a line break can occur between two Unicode characters.
889 * This is a wrapper function to expose a simple interface. Generally
890 * speaking, it is better to use #set_linebreaks_utf32 instead, since
891 * complicated cases involving combining marks, spaces, etc. cannot be
892 * correctly processed.
894 * @param char1 the first Unicode character
895 * @param char2 the second Unicode character
896 * @param lang language of the input
897 * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
898 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
900 int is_line_breakable(
909 set_linebreaks_utf32(s, 2, lang, brks);