1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
7 * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
8 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the author be held liable for any damages
12 * arising from the use of this software.
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute
16 * it freely, subject to the following restrictions:
18 * 1. The origin of this software must not be misrepresented; you must
19 * not claim that you wrote the original software. If you use this
20 * software in a product, an acknowledgement in the product
21 * documentation would be appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must
23 * not be misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source
27 * The main reference is Unicode Standard Annex 14 (UAX #14):
28 * <URL:http://www.unicode.org/reports/tr14/>
30 * When this library was designed, this annex was at Revision 19, for
32 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
34 * This library has been updated according to Revision 30, for
36 * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
38 * The Unicode Terms of Use are available at
39 * <URL:http://www.unicode.org/copyright.html>
45 * Implementation of the line breaking algorithm as described in Unicode Standard Annex 14.
48 * @version 2.5, 2013/11/14
50 * @author Petr Filipsky
56 #include "linebreak.h"
57 #include "linebreakdef.h"
/**
 * Special value used internally to indicate an undefined break result.
 *
 * The replacement list is parenthesized so that the negative literal
 * always expands as a single operand, whatever tokens surround the
 * macro at the point of use.
 */
#define LINEBREAK_UNDEFINED (-1)
65 * Size of the second-level index to the line breaking properties.
/* Number of entries in lb_prop_index; init_linebreak() divides the length of
 * the default property table by this value to obtain the slice size. */
67 #define LINEBREAK_INDEX_SIZE 40
70 * Version number of the library.
/* Defined with external linkage.  LINEBREAK_VERSION is not defined in this
 * listing -- presumably it comes from one of the included headers (confirm). */
72 const int linebreak_version = LINEBREAK_VERSION;
75 * Enumeration of break actions. They are used in the break action pair table below.
/* Values stored in baTable; get_lb_result_lookup() switches on the table
 * entry to turn one of these actions into a break result.
 * NOTE(review): the `enum BreakAction` opener and the closing brace are not
 * present in this listing -- confirm against the upstream file. */
80 DIR_BRK, /**< Direct break opportunity */
81 IND_BRK, /**< Indirect break opportunity */
82 CMI_BRK, /**< Indirect break opportunity for combining marks */
83 CMP_BRK, /**< Prohibited break for combining marks */
84 PRH_BRK /**< Prohibited break */
88 * Break action pair table. This is a direct mapping of Table 2 of
89 * Unicode Standard Annex 14, Revision 30.
/* Pair table declared as LBP_RI x LBP_RI.  get_lb_result_lookup() reads it as
 * baTable[lbcCur - 1][lbcNew - 1]: the first index is the class before the
 * candidate break position and the second index is the class after it.
 * NOTE(review): this listing shows only lines of seven initializers each and
 * no row braces or row labels; shorter lines appear to have been dropped, so
 * rows may be incomplete here.  Verify every row against Table 2 of UAX #14
 * before relying on or editing these values. */
91 static enum BreakAction baTable[LBP_RI][LBP_RI] = {
93 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
94 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
95 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
96 CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
99 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
100 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
101 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
102 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
105 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
106 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
107 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
108 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
111 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
112 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
113 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
114 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
117 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
118 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
119 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
120 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
123 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
124 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
125 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
126 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
129 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
130 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
131 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
132 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
135 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
136 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
137 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
138 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
141 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
142 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
143 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
144 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
147 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
148 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
149 IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
150 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
153 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
154 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
155 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
156 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
159 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
160 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
161 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
162 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
165 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
166 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
167 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
168 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
171 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
172 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
173 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
174 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
177 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
178 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
179 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
180 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
183 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
184 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
185 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
186 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
189 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
190 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
191 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
192 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
195 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
196 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
197 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
198 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
201 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
202 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
203 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
204 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
207 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
208 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
209 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
210 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
213 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
214 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
215 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
216 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
219 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
220 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
221 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
222 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
225 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
226 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
227 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
228 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
231 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
232 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
233 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
234 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
237 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
238 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
239 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
240 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
243 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
244 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
245 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
246 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
249 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
250 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
251 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
252 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
255 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
256 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
257 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
258 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
261 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
262 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
263 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
264 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
269 * Struct for the second-level index to the line breaking properties.
/* One entry of the second-level index.  `lbp` points at a position inside a
 * property array, and `end` is the highest code point handled by that entry:
 * get_char_lb_class_default() skips entries while ch > end. */
271 struct LineBreakPropertiesIndex
273 utf32_t end; /**< End coding point */
274 struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
278 * Second-level index to the line breaking properties.
/* The visible initializer seeds one entry that starts at lb_prop_default and
 * ends at 0xFFFFFFFF, i.e. a single slice covering every code point.
 * init_linebreak() later overwrites all LINEBREAK_INDEX_SIZE entries. */
280 static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
282 { 0xFFFFFFFF, lb_prop_default }
286 * Initializes the second-level index to the line breaking properties.
287 * If it is not called, the performance of #get_char_lb_class_lang (and
288 * thus the main functionality) can be pretty bad, especially for big
289 * code points like those of Chinese.
/* Fills lb_prop_index so that a lookup can start from a slice of
 * lb_prop_default instead of from its first record.
 * NOTE(review): braces, local declarations and other short statements are
 * absent from this listing; the comments below describe visible lines only. */
291 void init_linebreak(void)
/* Runs until the LBP_Undefined terminator of lb_prop_default.  The loop body
 * is not shown -- presumably it increments len (confirm). */
299 while (lb_prop_default[len].prop != LBP_Undefined)
/* Integer division: number of property records assigned to each index entry. */
301 step = len / LINEBREAK_INDEX_SIZE;
303 for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
/* Entry i begins at the current record ... */
305 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
306 iPropDefault += step;
/* ... and ends one code point before the first record of the next slice. */
307 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
/* i equals LINEBREAK_INDEX_SIZE when the loop exits, so --i selects the final
 * entry, which is widened to cover every remaining code point. */
309 lb_prop_index[--i].end = 0xFFFFFFFF;
313 * Gets the language-specific line breaking properties.
315 * @param lang language of the text
316 * @return pointer to the language-specific line breaking
317 * properties array if found; \c NULL otherwise
/* Searches lb_prop_lang_map for an entry matching `lang` and returns that
 * entry's property array.
 * NOTE(review): no check of `lang` against NULL is visible in this listing;
 * passing a null pointer to strncmp is undefined, so confirm that a guard
 * exists in the complete file. */
319 static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
321 struct LineBreakPropertiesLang *lbplIter;
/* The map is terminated by an entry whose lang member is NULL. */
324 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
/* Only the first namelen characters are compared, so this is a prefix match:
 * a longer tag that begins with the entry's name is accepted as well. */
326 if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
328 return lbplIter->lbp;
336 * Gets the line breaking class of a character from a line breaking properties array.
339 * @param ch character to check
340 * @param lbp pointer to the line breaking properties array
341 * @return the line breaking class if found; \c LBP_XX otherwise
/* Looks up a character in the property array `lbp`.
 * NOTE(review): the `ch` parameter line, the loop body and the return
 * statements are not present in this listing. */
343 static enum LineBreakClass get_char_lb_class(
345 struct LineBreakProperties *lbp)
/* Keep scanning only while the LBP_Undefined terminator has not been reached
 * and ch is not below the start of the current record.  Stopping as soon as
 * ch < start presumably relies on the records being ordered by start
 * (confirm against the property data). */
347 while (lbp->prop != LBP_Undefined && ch >= lbp->start)
357 * Gets the line breaking class of a character from the default line
358 * breaking properties array.
360 * @param ch character to check
361 * @return the line breaking class if found; \c LBP_XX otherwise
/* Default-table lookup that goes through the second-level index first.
 * NOTE(review): the parameter line and the declaration/stepping of `i` are
 * not present in this listing. */
363 static enum LineBreakClass get_char_lb_class_default(
/* Skip index entries whose range ends below ch. */
367 while (ch > lb_prop_index[i].end)
/* Debug-build check that the scan stayed inside the index.  Both the static
 * initializer and init_linebreak() give an entry an end of 0xFFFFFFFF. */
369 assert(i < LINEBREAK_INDEX_SIZE);
/* Finish with a scan that starts at the selected entry's first record. */
370 return get_char_lb_class(ch, lb_prop_index[i].lbp);
374 * Gets the line breaking class of a character for a specific
375 * language. This function will check the language-specific data first,
376 * and then the default data if there is no language-specific property
377 * available for the character.
379 * @param ch character to check
380 * @param lbpLang pointer to the language-specific line breaking properties array
382 * @return the line breaking class if found; \c LBP_XX otherwise
/* Two-stage lookup: language-specific table first, default table second.
 * NOTE(review): the `ch` parameter line, the guard around the first lookup
 * and the early return are not present in this listing. */
385 static enum LineBreakClass get_char_lb_class_lang(
387 struct LineBreakProperties *lbpLang)
389 enum LineBreakClass lbcResult;
391 /* Find the language-specific line breaking class for a character */
394 lbcResult = get_char_lb_class(ch, lbpLang);
/* LBP_XX is the "not found" result of get_char_lb_class, so any other value
 * is a language-specific hit. */
395 if (lbcResult != LBP_XX)
399 /* Fall back to the generic (default) line breaking class, if no
400 * language context is provided, or language-specific data are not
401 * available for the specific character in the specified language */
402 return get_char_lb_class_default(ch);
406 * Resolves the line breaking class for certain ambiguous or complicated
407 * characters. They are treated in a simplistic way in this implementation.
410 * @param lbc line breaking class to resolve
411 * @param lang language of the text
412 * @return the resolved line breaking class
/* Maps a raw line breaking class to the class used by the pair table.
 * NOTE(review): the `lang` parameter line, the switch over `lbc`, its case
 * labels and every return statement are missing from this listing; only the
 * language test and one explanatory comment survive. */
414 static enum LineBreakClass resolve_lb_class(
415 enum LineBreakClass lbc,
/* Two-character prefix comparison, so language tags that merely begin with
 * "zh", "ja" or "ko" also satisfy the test. */
422 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
423 strncmp(lang, "ja", 2) == 0 || /* Japanese */
424 strncmp(lang, "ko", 2) == 0)) /* Korean */
433 /* Simplified for `normal' line breaking. See
434 * <url:http://www.unicode.org/reports/tr14/tr14-30.html#CJ>
447 * Applies special treatment to the first character in a line.
449 * @param[in,out] lbpCtx pointer to the line breaking context
450 * @pre \a lbpCtx->lbcCur has a valid line break class
451 * @post \a lbpCtx->lbcCur has the updated line break class
/* Rewrites lbpCtx->lbcCur in place for a character that begins a line.
 * NOTE(review): the case labels of the switch are missing from this listing,
 * so which input class leads to which assignment cannot be read from here;
 * the trailing comments are the only hint. */
453 static void treat_first_char(
454 struct LineBreakContext* lbpCtx)
456 switch (lbpCtx->lbcCur)
460 lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */
463 lbpCtx->lbcCur = LBP_BA; /* Rule LB20 */
466 lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
474 * Tries telling the line break opportunity by simple rules.
476 * @param[in,out] lbpCtx pointer to the line breaking context
477 * @pre \a lbpCtx->lbcCur has the current line break
478 * class; and \a lbpCtx->lbcNew has the line
479 * break class for the next character
480 * @post \a lbpCtx->lbcCur has the updated line break class
482 * @return break result, one of #LINEBREAK_MUSTBREAK,
483 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
484 * if identified; or #LINEBREAK_UNDEFINED if
485 * table lookup is needed
/* Handles the rules that can be decided without the pair table and reports
 * LINEBREAK_UNDEFINED when the table is required.
 * NOTE(review): the case labels of the switch on lbcNew are missing from this
 * listing; the rule numbers in the trailing comments are the only indication
 * of which class each branch handles. */
487 static int get_lb_result_simple(
488 struct LineBreakContext* lbpCtx)
/* A current class of BK, or CR when the new class is not LF, forces a break
 * before the new character. */
490 if (lbpCtx->lbcCur == LBP_BK
491 || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
493 return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */
496 switch (lbpCtx->lbcNew)
499 return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */
/* The next three branches store a class in lbcCur before returning, so the
 * following call sees it as the current class. */
503 lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */
504 return LINEBREAK_NOBREAK; /* Rule LB6 */
506 lbpCtx->lbcCur = LBP_CR;
507 return LINEBREAK_NOBREAK; /* Rule LB6 */
509 lbpCtx->lbcCur = LBP_BA;
510 return LINEBREAK_ALLOWBREAK; /* Rule LB20 */
512 return LINEBREAK_UNDEFINED; /* Table lookup is needed */
517 * Tells the line break opportunity by table lookup.
519 * @param[in,out] lbpCtx pointer to the line breaking context
520 * @pre \a lbpCtx->lbcCur has the current line break
521 * class; \a lbpCtx->lbcLast has the line break
522 * class for the last character; and \a
523 * lbpCtx->lbcNew has the line break class for the next character
525 * @post \a lbpCtx->lbcCur has the updated line break class
527 * @return break result, one of #LINEBREAK_MUSTBREAK,
528 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
/* Decides the break result from baTable for the pair (lbcCur, lbcNew).
 * NOTE(review): the case labels, break statements, the second arm of the
 * conditional expression and the final return are missing from this listing. */
530 static int get_lb_result_lookup(
531 struct LineBreakContext* lbpCtx)
533 /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
534 * yet implemented below. */
535 int brk = LINEBREAK_UNDEFINED;
/* The table is indexed with class value minus one, so both classes must lie
 * in 1..LBP_RI.  These checks vanish when NDEBUG is defined. */
536 assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI));
537 assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI));
538 switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])
541 brk = LINEBREAK_ALLOWBREAK;
/* This branch allows the break only when the previous character's class was
 * SP (the alternative value is on a line not shown here). */
545 brk = (lbpCtx->lbcLast == LBP_SP)
546 ? LINEBREAK_ALLOWBREAK
550 brk = LINEBREAK_NOBREAK;
/* Early return: without a preceding SP, lbcCur keeps its old value. */
551 if (lbpCtx->lbcLast != LBP_SP)
552 return brk; /* Do not update lbcCur */
555 brk = LINEBREAK_NOBREAK;
/* On every path that reaches this point the new class becomes the current
 * class for the next pair. */
558 lbpCtx->lbcCur = lbpCtx->lbcNew;
563 * Initializes line breaking context for a given language.
565 * @param[in,out] lbpCtx pointer to the line breaking context
566 * @param[in] ch the first character to process
567 * @param[in] lang language of the input
568 * @post the line breaking context is initialized
/* Prepares a LineBreakContext from the first character of the input.
 * NOTE(review): the `ch` and `lang` parameter lines, any assignment of
 * lbpCtx->lang, and the last argument of resolve_lb_class are missing from
 * this listing. */
570 void lb_init_break_context(
571 struct LineBreakContext* lbpCtx,
/* Language-specific property table for `lang`, as returned by
 * get_lb_prop_lang(). */
576 lbpCtx->lbpLang = get_lb_prop_lang(lang);
/* No previous or next character exists yet. */
577 lbpCtx->lbcLast = LBP_Undefined;
578 lbpCtx->lbcNew = LBP_Undefined;
/* Class of the first character: looked up, then resolved. */
579 lbpCtx->lbcCur = resolve_lb_class(
580 get_char_lb_class_lang(ch, lbpCtx->lbpLang),
/* The first character also begins a line. */
582 treat_first_char(lbpCtx);
586 * Updates LineBreakingContext for the next code point and returns
587 * the detected break.
589 * @param[in,out] lbpCtx pointer to the line breaking context
590 * @param[in] ch Unicode code point
591 * @return break result, one of #LINEBREAK_MUSTBREAK,
592 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
593 * @post the line breaking context is updated
/* Feeds one more code point into the context and returns the break result
 * for the position before it.
 * NOTE(review): the `ch` parameter line, the declaration of brk, the switch
 * header, break statements, any other cases and the final return are missing
 * from this listing. */
595 int lb_process_next_char(
596 struct LineBreakContext* lbpCtx,
/* Shift the window: the class that was "new" on the previous call becomes
 * "last", and the raw class of ch becomes "new". */
601 lbpCtx->lbcLast = lbpCtx->lbcNew;
602 lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
603 brk = get_lb_result_simple(lbpCtx);
/* After a mandatory break ch begins a new line: resolve its class into
 * lbcCur and apply the first-character treatment. */
606 case LINEBREAK_MUSTBREAK:
607 lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
608 treat_first_char(lbpCtx);
/* The simple rules were inconclusive: resolve the new class and consult the
 * pair table. */
610 case LINEBREAK_UNDEFINED:
611 lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
612 brk = get_lb_result_lookup(lbpCtx);
621 * Gets the next Unicode character in a UTF-8 sequence. The index will
622 * be advanced to the next complete character, unless the end of string
623 * is reached in the middle of a UTF-8 sequence.
625 * @param[in] s input UTF-8 string
626 * @param[in] len length of the string in bytes
627 * @param[in,out] ip pointer to the index
628 * @return the Unicode character beginning at the index; or
629 * #EOS if end of input is encountered
/* Decodes one code point from the UTF-8 buffer `s` starting at index *ip.
 * NOTE(review): the parameter list, local declarations, length checks, index
 * updates and return statements are missing from this listing; the comments
 * below describe visible lines only. */
631 utf32_t lb_get_next_char_utf8(
/* Lead bytes below 0xC2 or above 0xF4 are not treated as the start of a
 * multi-byte sequence. */
644 if (ch < 0xC2 || ch > 0xF4)
645 { /* One-byte sequence, tail (should not occur), or invalid */
650 { /* Two-byte sequence */
/* 5 payload bits from the lead byte, 6 from the trailing byte. */
653 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
658 { /* Three-byte sequence */
/* 4 + 6 + 6 payload bits. */
661 res = ((ch & 0x0F) << 12) +
662 ((s[*ip + 1] & 0x3F) << 6) +
663 ((s[*ip + 2] & 0x3F));
668 { /* Four-byte sequence */
/* 3 + 6 + 6 + 6 payload bits.  The visible code masks each trailing byte with
 * 0x3F but shows no test that it has the 10xxxxxx continuation form. */
671 res = ((ch & 0x07) << 18) +
672 ((s[*ip + 1] & 0x3F) << 12) +
673 ((s[*ip + 2] & 0x3F) << 6) +
674 ((s[*ip + 3] & 0x3F));
681 * Gets the next Unicode character in a UTF-16 sequence. The index will
682 * be advanced to the next complete character, unless the end of string
683 * is reached in the middle of a UTF-16 surrogate pair.
685 * @param[in] s input UTF-16 string
686 * @param[in] len length of the string in words
687 * @param[in,out] ip pointer to the index
688 * @return the Unicode character beginning at the index; or
689 * #EOS if end of input is encountered
/* Decodes one code point from the UTF-16 buffer `s` starting at index *ip.
 * NOTE(review): the parameter list, the initial read of ch, the end-of-input
 * test and the bodies of the error branches are missing from this listing. */
691 utf32_t lb_get_next_char_utf16(
/* 0xD800..0xDBFF is the high-surrogate range; any other unit stands alone. */
703 if (ch < 0xD800 || ch > 0xDBFF)
704 { /* If the character is not a high surrogate */
708 { /* If the input ends here (an error) */
/* 0xDC00..0xDFFF is the low-surrogate range. */
712 if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
713 { /* If the next character is not the low surrogate (an error) */
716 /* Return the constructed character and advance the index again */
/* Low 10 bits of the high surrogate shifted left by 10, plus the low 10 bits
 * of the low surrogate, plus 0x10000.  The post-increment consumes the low
 * surrogate. */
717 return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
721 * Gets the next Unicode character in a UTF-32 sequence. The index will
722 * be advanced to the next character.
724 * @param[in] s input UTF-32 string
725 * @param[in] len length of the string in dwords
726 * @param[in,out] ip pointer to the index
727 * @return the Unicode character beginning at the index; or
728 * #EOS if end of input is encountered
/* NOTE(review): only the first line of this definition survives in the
 * listing; its parameters and body are not visible here and must be read from
 * the complete file. */
730 utf32_t lb_get_next_char_utf32(
742 * Sets the line breaking information for a generic input string.
744 * @param[in] s input string
745 * @param[in] len length of the input
746 * @param[in] lang language of the input
747 * @param[out] brks pointer to the output breaking data,
748 * containing #LINEBREAK_MUSTBREAK,
749 * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
750 * or #LINEBREAK_INSIDEACHAR
751 * @param[in] get_next_char function to get the next UTF-32 character
/* Tail of the generic driver that the UTF-8/16/32 entry points call with
 * their own decoder as `get_next_char`.
 * NOTE(review): the line carrying the function name, the other parameters,
 * the declarations of ch/posCur/posLast, the EOS tests and the loop headers
 * are missing from this listing. */
758 get_next_char_t get_next_char)
761 struct LineBreakContext lbCtx;
/* posLast is stepped back once so that the ++posLast in the loop below brings
 * it to its starting value.  Its declaration is not shown; if it is an
 * unsigned type initialised to 0 this relies on wraparound -- confirm. */
765 --posLast; /* To be ++'d later */
766 ch = get_next_char(s, len, &posCur);
769 lb_init_break_context(&lbCtx, ch, lang);
771 /* Process a line till an explicit break or end of string */
/* Positions before the last unit of the current character are marked as
 * lying inside a character. */
774 for (++posLast; posLast < posCur - 1; ++posLast)
776 brks[posLast] = LINEBREAK_INSIDEACHAR;
778 assert(posLast == posCur - 1);
/* Decode the following character; the break result is stored at the last
 * unit of the character that precedes it. */
779 ch = get_next_char(s, len, &posCur);
782 brks[posLast] = lb_process_next_char(&lbCtx, ch);
785 assert(posLast == posCur - 1 && posCur <= len);
786 /* Break after the last character */
787 brks[posLast] = LINEBREAK_MUSTBREAK;
788 /* When the input contains incomplete sequences */
/* Remaining positions are marked as lying inside a character. */
791 brks[posCur++] = LINEBREAK_INSIDEACHAR;
796 * Sets the line breaking information for a UTF-8 input string.
798 * @param[in] s input UTF-8 string
799 * @param[in] len length of the input
800 * @param[in] lang language of the input
801 * @param[out] brks pointer to the output breaking data, containing
802 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
803 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
/* Thin wrapper: runs the generic set_linebreaks() with the UTF-8 decoder.
 * NOTE(review): the parameter lines are missing from this listing.  The
 * decoder is cast to get_next_char_t; calling a function through a pointer of
 * an incompatible type is undefined in ISO C, so confirm that the two
 * signatures are compatible. */
805 void set_linebreaks_utf8(
811 set_linebreaks(s, len, lang, brks,
812 (get_next_char_t)lb_get_next_char_utf8);
816 * Sets the line breaking information for a UTF-16 input string.
818 * @param[in] s input UTF-16 string
819 * @param[in] len length of the input
820 * @param[in] lang language of the input
821 * @param[out] brks pointer to the output breaking data, containing
822 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
823 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
/* Thin wrapper: runs the generic set_linebreaks() with the UTF-16 decoder.
 * NOTE(review): the parameter lines are missing from this listing.  The
 * decoder is cast to get_next_char_t; calling a function through a pointer of
 * an incompatible type is undefined in ISO C, so confirm that the two
 * signatures are compatible. */
825 void set_linebreaks_utf16(
831 set_linebreaks(s, len, lang, brks,
832 (get_next_char_t)lb_get_next_char_utf16);
836 * Sets the line breaking information for a UTF-32 input string.
838 * @param[in] s input UTF-32 string
839 * @param[in] len length of the input
840 * @param[in] lang language of the input
841 * @param[out] brks pointer to the output breaking data, containing
842 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
843 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
/* Thin wrapper: runs the generic set_linebreaks() with the UTF-32 decoder.
 * NOTE(review): the parameter lines are missing from this listing.  The
 * decoder is cast to get_next_char_t; calling a function through a pointer of
 * an incompatible type is undefined in ISO C, so confirm that the two
 * signatures are compatible. */
845 void set_linebreaks_utf32(
851 set_linebreaks(s, len, lang, brks,
852 (get_next_char_t)lb_get_next_char_utf32);
856 * Tells whether a line break can occur between two Unicode characters.
857 * This is a wrapper function to expose a simple interface. Generally
858 * speaking, it is better to use #set_linebreaks_utf32 instead, since
859 * complicated cases involving combining marks, spaces, etc. cannot be
860 * correctly processed by this two-character interface.
862 * @param char1 the first Unicode character
863 * @param char2 the second Unicode character
864 * @param lang language of the input
865 * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
866 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
/* Convenience wrapper around the UTF-32 API for a pair of characters.
 * NOTE(review): the parameter lines, the set-up of the local buffers `s` and
 * `brks`, and the return statement are missing from this listing. */
868 int is_line_breakable(
/* Analyse a buffer of length 2 with the regular UTF-32 entry point. */
877 set_linebreaks_utf32(s, 2, lang, brks);