1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
7 * Copyright (C) 2008-2015 Wu Yongwei <wuyongwei at gmail dot com>
8 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the author be held liable for any damages
12 * arising from the use of this software.
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute
16 * it freely, subject to the following restrictions:
18 * 1. The origin of this software must not be misrepresented; you must
19 * not claim that you wrote the original software. If you use this
20 * software in a product, an acknowledgement in the product
21 * documentation would be appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must
23 * not be misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source
27 * The main reference is Unicode Standard Annex 14 (UAX #14):
28 * <URL:http://www.unicode.org/reports/tr14/>
30 * When this library was designed, this annex was at Revision 19, for
32 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
34 * This library has been updated according to Revision 33, for
36 * <URL:http://www.unicode.org/reports/tr14/tr14-33.html>
38 * The Unicode Terms of Use are available at
39 * <URL:http://www.unicode.org/copyright.html>
45 * Implementation of the line breaking algorithm as described in Unicode
48 * @version 2.7, 2015/04/18
50 * @author Petr Filipsky
56 #include "linebreak.h"
57 #include "linebreakdef.h"
60 * Special value used internally to indicate an undefined break result.
62 #define LINEBREAK_UNDEFINED -1
65 * Size of the second-level index to the line breaking properties.
67 #define LINEBREAK_INDEX_SIZE 40
70 * Enumeration of break actions. They are used in the break action
75 DIR_BRK, /**< Direct break opportunity */
76 IND_BRK, /**< Indirect break opportunity */
77 CMI_BRK, /**< Indirect break opportunity for combining marks */
78 CMP_BRK, /**< Prohibited break for combining marks */
79 PRH_BRK /**< Prohibited break */
83 * Break action pair table. This is a direct mapping of Table 2 of
84 * Unicode Standard Annex 14, Revision 30.
86 static enum BreakAction baTable[LBP_RI][LBP_RI] = {
88 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
89 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
90 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
91 CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
94 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
95 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
96 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
97 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
100 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
101 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
102 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
103 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
106 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
107 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
108 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
109 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
112 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
113 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
114 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
115 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
118 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
119 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
120 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
121 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
124 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
125 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
126 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
127 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
130 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
131 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
132 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
133 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
136 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
137 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
138 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
139 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
142 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
143 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
144 IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
145 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
148 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
149 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
150 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
151 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
154 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
155 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
156 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
157 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
160 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
161 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
162 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
163 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
166 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
167 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
168 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
169 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
172 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
173 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
174 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
175 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
178 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
179 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
180 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
181 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
184 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
185 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
186 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
187 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
190 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
191 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
192 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
193 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
196 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
197 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
198 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
199 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
202 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
203 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
204 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
205 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
208 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
209 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
210 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
211 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
214 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
215 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
216 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
217 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
220 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
221 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
222 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
223 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
226 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
227 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
228 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
229 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
232 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
233 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
234 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
235 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
238 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
239 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
240 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
241 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
244 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
245 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
246 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
247 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
250 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
251 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
252 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
253 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
256 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
257 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
258 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
259 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
264 * Struct for the second-level index to the line breaking properties.
266 struct LineBreakPropertiesIndex
268 utf32_t end; /**< End coding point */
269 struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
273 * Second-level index to the line breaking properties.
275 static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
277 { 0xFFFFFFFF, lb_prop_default }
281 * Initializes the second-level index to the line breaking properties.
282 * If it is not called, the performance of #get_char_lb_class_lang (and
283 * thus the main functionality) can be pretty bad, especially for big
284 * code points like those of Chinese.
286 void init_linebreak(void)
294 while (lb_prop_default[len].prop != LBP_Undefined)
296 step = len / LINEBREAK_INDEX_SIZE;
298 for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
300 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
301 iPropDefault += step;
302 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
304 lb_prop_index[--i].end = 0xFFFFFFFF;
308 * Gets the language-specific line breaking properties.
310 * @param lang language of the text
311 * @return pointer to the language-specific line breaking
312 * properties array if found; \c NULL otherwise
314 static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
316 struct LineBreakPropertiesLang *lbplIter;
319 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
321 if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
323 return lbplIter->lbp;
331 * Gets the line breaking class of a character from a line breaking
334 * @param ch character to check
335 * @param lbp pointer to the line breaking properties array
336 * @return the line breaking class if found; \c LBP_XX otherwise
338 static enum LineBreakClass get_char_lb_class(
340 struct LineBreakProperties *lbp)
342 while (lbp->prop != LBP_Undefined && ch >= lbp->start)
352 * Gets the line breaking class of a character from the default line
353 * breaking properties array.
355 * @param ch character to check
356 * @return the line breaking class if found; \c LBP_XX otherwise
358 static enum LineBreakClass get_char_lb_class_default(
362 while (ch > lb_prop_index[i].end)
364 assert(i < LINEBREAK_INDEX_SIZE);
365 return get_char_lb_class(ch, lb_prop_index[i].lbp);
369 * Gets the line breaking class of a character for a specific
370 * language. This function will check the language-specific data first,
371 * and then the default data if there is no language-specific property
372 * available for the character.
374 * @param ch character to check
375 * @param lbpLang pointer to the language-specific line breaking
377 * @return the line breaking class if found; \c LBP_XX
380 static enum LineBreakClass get_char_lb_class_lang(
382 struct LineBreakProperties *lbpLang)
384 enum LineBreakClass lbcResult;
386 /* Find the language-specific line breaking class for a character */
389 lbcResult = get_char_lb_class(ch, lbpLang);
390 if (lbcResult != LBP_XX)
394 /* Fall back to the default line breaking class if no language
395 * context is provided, or language-specific data are not available
396 * for the specific character in the specified language */
397 return get_char_lb_class_default(ch);
401 * Resolves the line breaking class for certain ambiguous or complicated
402 * characters. They are treated in a simplistic way in this
405 * @param lbc line breaking class to resolve
406 * @param lang language of the text
407 * @return the resolved line breaking class
409 static enum LineBreakClass resolve_lb_class(
410 enum LineBreakClass lbc,
417 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
418 strncmp(lang, "ja", 2) == 0 || /* Japanese */
419 strncmp(lang, "ko", 2) == 0)) /* Korean */
428 /* Simplified for `normal' line breaking. See
429 * <url:http://www.unicode.org/reports/tr14/tr14-30.html#CJ>
442 * Applies special treatment to the first character in a line.
444 * @param[in,out] lbpCtx pointer to the line breaking context
445 * @pre \a lbpCtx->lbcCur has a valid line break class
446 * @post \a lbpCtx->lbcCur has the updated line break class
448 static void treat_first_char(
449 struct LineBreakContext *lbpCtx)
451 switch (lbpCtx->lbcCur)
455 lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */
458 lbpCtx->lbcCur = LBP_BA; /* Rule LB20 */
461 lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
464 lbpCtx->fLb21aHebrew = 1; /* Rule LB21a */
471 * Tries to determine the line break opportunity using simple rules.
473 * @param[in,out] lbpCtx pointer to the line breaking context
474 * @pre \a lbpCtx->lbcCur has the current line break
475 * class; and \a lbpCtx->lbcNew has the line
476 * break class for the next character
477 * @post \a lbpCtx->lbcCur has the updated line break
479 * @return break result, one of #LINEBREAK_MUSTBREAK,
480 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
481 * if identified; or #LINEBREAK_UNDEFINED if
482 * table lookup is needed
484 static int get_lb_result_simple(
485 struct LineBreakContext *lbpCtx)
487 if (lbpCtx->lbcCur == LBP_BK
488 || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
490 return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */
493 switch (lbpCtx->lbcNew)
496 return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */
500 lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */
501 return LINEBREAK_NOBREAK; /* Rule LB6 */
503 lbpCtx->lbcCur = LBP_CR;
504 return LINEBREAK_NOBREAK; /* Rule LB6 */
506 lbpCtx->lbcCur = LBP_BA;
507 return LINEBREAK_ALLOWBREAK; /* Rule LB20 */
509 return LINEBREAK_UNDEFINED; /* Table lookup is needed */
514 * Determines the line break opportunity by table lookup.
516 * @param[in,out] lbpCtx pointer to the line breaking context
517 * @pre \a lbpCtx->lbcCur has the current line break
518 * class; \a lbpCtx->lbcLast has the line break
519 * class for the last character; and \a
520 * lbpCtx->lbcNew has the line break class for
522 * @post \a lbpCtx->lbcCur has the updated line break
524 * @return break result, one of #LINEBREAK_MUSTBREAK,
525 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
527 static int get_lb_result_lookup(
528 struct LineBreakContext *lbpCtx)
530 int brk = LINEBREAK_UNDEFINED;
532 assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI));
533 assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI));
535 /* Fix for Hangul word wrap */
536 enum LineBreakClass lbcCur, lbcNew;
538 switch (lbpCtx->lbcCur)
540 case LBP_H2: /**< Hangul LV */
541 case LBP_H3: /**< Hangul LVT */
542 case LBP_JL: /**< Hangul L Jamo */
543 case LBP_JV: /**< Hangul V Jamo */
544 case LBP_JT: /**< Hangul T Jamo */
548 lbcCur = lbpCtx->lbcCur;
552 switch (lbpCtx->lbcNew)
554 case LBP_H2: /**< Hangul LV */
555 case LBP_H3: /**< Hangul LVT */
556 case LBP_JL: /**< Hangul L Jamo */
557 case LBP_JV: /**< Hangul V Jamo */
558 case LBP_JT: /**< Hangul T Jamo */
562 lbcNew = lbpCtx->lbcNew;
566 switch (baTable[lbcCur - 1][lbcNew - 1])
570 brk = LINEBREAK_ALLOWBREAK;
574 brk = (lbpCtx->lbcLast == LBP_SP)
575 ? LINEBREAK_ALLOWBREAK
579 brk = LINEBREAK_NOBREAK;
580 if (lbpCtx->lbcLast != LBP_SP)
581 return brk; /* Do not update lbcCur */
584 brk = LINEBREAK_NOBREAK;
588 /* Special processing due to rule LB21a */
589 if (lbpCtx->fLb21aHebrew &&
590 (lbpCtx->lbcCur == LBP_HY || lbpCtx->lbcCur == LBP_BA))
592 brk = LINEBREAK_NOBREAK;
593 lbpCtx->fLb21aHebrew = 0;
595 else if (!(lbpCtx->lbcNew == LBP_HY || lbpCtx->lbcNew == LBP_BA))
597 lbpCtx->fLb21aHebrew = (lbpCtx->lbcNew == LBP_HL);
600 lbpCtx->lbcCur = lbpCtx->lbcNew;
605 * Initializes line breaking context for a given language.
607 * @param[in,out] lbpCtx pointer to the line breaking context
608 * @param[in] ch the first character to process
609 * @param[in] lang language of the input
610 * @post the line breaking context is initialized
612 void lb_init_break_context(
613 struct LineBreakContext *lbpCtx,
618 lbpCtx->lbpLang = get_lb_prop_lang(lang);
619 lbpCtx->lbcLast = LBP_Undefined;
620 lbpCtx->lbcNew = LBP_Undefined;
621 lbpCtx->lbcCur = resolve_lb_class(
622 get_char_lb_class_lang(ch, lbpCtx->lbpLang),
624 lbpCtx->fLb21aHebrew = 0;
625 treat_first_char(lbpCtx);
629 * Updates the LineBreakContext for the next code point and returns
630 * the detected break.
632 * @param[in,out] lbpCtx pointer to the line breaking context
633 * @param[in] ch Unicode code point
634 * @return break result, one of #LINEBREAK_MUSTBREAK,
635 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
636 * @post the line breaking context is updated
638 int lb_process_next_char(
639 struct LineBreakContext *lbpCtx,
644 lbpCtx->lbcLast = lbpCtx->lbcNew;
645 lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
646 brk = get_lb_result_simple(lbpCtx);
649 case LINEBREAK_MUSTBREAK:
650 lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
651 treat_first_char(lbpCtx);
653 case LINEBREAK_UNDEFINED:
654 lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
655 brk = get_lb_result_lookup(lbpCtx);
664 * Sets the line breaking information for a generic input string.
666 * @param[in] s input string
667 * @param[in] len length of the input
668 * @param[in] lang language of the input
669 * @param[out] brks pointer to the output breaking data,
670 * containing #LINEBREAK_MUSTBREAK,
671 * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
672 * or #LINEBREAK_INSIDEACHAR
673 * @param[in] get_next_char function to get the next UTF-32 character
680 get_next_char_t get_next_char)
683 struct LineBreakContext lbCtx;
687 --posLast; /* To be ++'d later */
688 ch = get_next_char(s, len, &posCur);
691 lb_init_break_context(&lbCtx, ch, lang);
693 /* Process a line till an explicit break or end of string */
696 for (++posLast; posLast < posCur - 1; ++posLast)
698 brks[posLast] = LINEBREAK_INSIDEACHAR;
700 assert(posLast == posCur - 1);
701 ch = get_next_char(s, len, &posCur);
704 brks[posLast] = lb_process_next_char(&lbCtx, ch);
707 assert(posLast == posCur - 1 && posCur <= len);
708 /* Break after the last character */
709 brks[posLast] = LINEBREAK_MUSTBREAK;
710 /* When the input contains incomplete sequences */
713 brks[posCur++] = LINEBREAK_INSIDEACHAR;
718 * Sets the line breaking information for a UTF-8 input string.
720 * @param[in] s input UTF-8 string
721 * @param[in] len length of the input
722 * @param[in] lang language of the input
723 * @param[out] brks pointer to the output breaking data, containing
724 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
725 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
727 void set_linebreaks_utf8(
733 set_linebreaks(s, len, lang, brks,
734 (get_next_char_t)ub_get_next_char_utf8);
738 * Sets the line breaking information for a UTF-16 input string.
740 * @param[in] s input UTF-16 string
741 * @param[in] len length of the input
742 * @param[in] lang language of the input
743 * @param[out] brks pointer to the output breaking data, containing
744 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
745 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
747 void set_linebreaks_utf16(
753 set_linebreaks(s, len, lang, brks,
754 (get_next_char_t)ub_get_next_char_utf16);
758 * Sets the line breaking information for a UTF-32 input string.
760 * @param[in] s input UTF-32 string
761 * @param[in] len length of the input
762 * @param[in] lang language of the input
763 * @param[out] brks pointer to the output breaking data, containing
764 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
765 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
767 void set_linebreaks_utf32(
773 set_linebreaks(s, len, lang, brks,
774 (get_next_char_t)ub_get_next_char_utf32);
778 * Tells whether a line break can occur between two Unicode characters.
779 * This is a wrapper function to expose a simple interface. Generally
780 * speaking, it is better to use #set_linebreaks_utf32 instead, since
781 * this wrapper cannot correctly process complicated cases involving
782 * combining marks, spaces, etc.
784 * @param char1 the first Unicode character
785 * @param char2 the second Unicode character
786 * @param lang language of the input
787 * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
788 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
790 int is_line_breakable(
799 set_linebreaks_utf32(s, 2, lang, brks);