1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
7 * Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
26 * The main reference is Unicode Standard Annex 29 (UAX #29):
27 * <URL:http://unicode.org/reports/tr29>
29 * When this library was designed, this annex was at Revision 17, for
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
33 * This library has been updated according to Revision 21, for
35 * <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
37 * The Unicode Terms of Use are available at
38 * <URL:http://www.unicode.org/copyright.html>
44 * Implementation of the word breaking algorithm as described in Unicode
47 * @version 2.4, 2013/09/28
54 #include "linebreak.h"
55 #include "linebreakdef.h"
57 #include "wordbreak.h"
58 #include "wordbreakdata.c"
/**
 * Number of elements in the array \a x.
 *
 * \a x must be an actual array, not a pointer: the count is computed at
 * compile time from sizeof.  The argument is parenthesized before it is
 * subscripted so that any array-typed expression can be passed safely.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
63 * Initializes the wordbreak internals. It currently does nothing, but
64 * it may in the future.
66 void init_wordbreak(void)
71 * Gets the word breaking class of a character.
73 * @param ch character to check
74 * @param wbp pointer to the word breaking properties array (searched by bisection on each entry's start/end range)
75 * @param len size of the wbp array in number of items
76 * @return the word breaking class if found; \c WBP_Any otherwise
78 static enum WordBreakClass get_char_wb_class(
80 struct WordBreakProperties *wbp,
89 mid = (min + max) / 2;
91 if (ch < wbp[mid].start)
93 else if (ch > wbp[mid].end)
104 * Sets the word break types to a specific value in a range.
106 * Marks every code unit that is not the last unit of its character as
107 * #WORDBREAK_INSIDEACHAR and the last unit of each character as \a brkType.
108 * Assumes \a brks is initialized: cells already holding #WORDBREAK_NOBREAK are left unchanged, because we really don't want to break after them.
110 * @param[in] s input string
111 * @param[out] brks breaks array to fill
112 * @param[in] posStart start position
113 * @param[in] posEnd end position (exclusive)
114 * @param[in] len length of the string
115 * @param[in] brkType breaks type to use
116 * @param[in] get_next_char function to get the next UTF-32 character
118 static void set_brks_to(
125 get_next_char_t get_next_char)
127 size_t posNext = posStart;
128 while (posNext < posEnd)
131 ch = get_next_char(s, len, &posNext);
133 for (; posStart < posNext - 1; ++posStart)
134 brks[posStart] = WORDBREAK_INSIDEACHAR;
135 assert(posStart == posNext - 1);
137 /* Only set it if it was not already marked as a no-break position. */
138 if (brks[posStart] != WORDBREAK_NOBREAK)
139 brks[posStart] = brkType;
144 /* Checks to see if the class is Newline, CR, or LF (rules WB3a and WB3b). */
145 #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
149 * Sets the word breaking information for a generic input string.
151 * @param[in] s input string
152 * @param[in] len length of the input
153 * @param[in] lang language of the input
154 * @param[out] brks pointer to the output breaking data, containing
155 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
156 * #WORDBREAK_INSIDEACHAR
157 * @param[in] get_next_char function to get the next UTF-32 character
159 static void set_wordbreaks(
164 get_next_char_t get_next_char)
166 enum WordBreakClass wbcLast = WBP_Undefined;
167 /* wbcSeqStart is the class that started the current sequence.
168 * WBP_Undefined is a special case that means "sot" (start of text).
169 * This value is the class that is at the start of the current rule
170 * matching sequence. For example, in case of Numeric+MidNum+Numeric
171 * it'll be Numeric all the way.
173 enum WordBreakClass wbcSeqStart = WBP_Undefined;
179 /* TODO: Language-specific specialization. */
183 memset(brks, WORDBREAK_BREAK, len);
185 ch = get_next_char(s, len, &posNext);
189 enum WordBreakClass wbcCur;
190 wbcCur = get_char_wb_class(ch, wb_prop_default,
191 ARRAY_LEN(wb_prop_default));
197 set_brks_to(s, brks, posLast, posCur, len,
198 WORDBREAK_BREAK, get_next_char);
199 wbcSeqStart = wbcCur;
204 if (wbcSeqStart == WBP_CR) /* WB3 */
206 set_brks_to(s, brks, posLast, posCur, len,
207 WORDBREAK_NOBREAK, get_next_char);
208 wbcSeqStart = wbcCur;
216 set_brks_to(s, brks, posLast, posCur, len,
217 WORDBREAK_BREAK, get_next_char);
218 wbcSeqStart = wbcCur;
224 /* WB4 - If not the first char/after a newline (WB3a,3b), skip
225 * this class, set it to be the same as the prev, and mark
226 * brks not to break before them. */
227 if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
229 set_brks_to(s, brks, posLast, posCur, len,
230 WORDBREAK_BREAK, get_next_char);
231 wbcSeqStart = wbcCur;
235 /* It's surely not the first */
236 brks[posCur - 1] = WORDBREAK_NOBREAK;
237 /* "inherit" the previous class. */
243 if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
244 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
246 set_brks_to(s, brks, posLast, posCur, len,
247 WORDBREAK_NOBREAK, get_next_char);
249 /* No rule found, reset */
252 set_brks_to(s, brks, posLast, posCur, len,
253 WORDBREAK_BREAK, get_next_char);
255 wbcSeqStart = wbcCur;
260 if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
261 (wbcLast == WBP_Numeric) || /* WB10 */
262 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
264 set_brks_to(s, brks, posLast, posCur, len,
265 WORDBREAK_NOBREAK, get_next_char);
267 /* No rule found, reset */
270 set_brks_to(s, brks, posLast, posCur, len,
271 WORDBREAK_BREAK, get_next_char);
273 wbcSeqStart = wbcCur;
278 if ((wbcLast == WBP_ALetter) || /* WB6,7 */
279 (wbcLast == WBP_Numeric)) /* WB11,12 */
285 set_brks_to(s, brks, posLast, posCur, len,
286 WORDBREAK_BREAK, get_next_char);
287 wbcSeqStart = wbcCur;
293 if (wbcLast == WBP_ALetter) /* WB6,7 */
299 set_brks_to(s, brks, posLast, posCur, len,
300 WORDBREAK_BREAK, get_next_char);
301 wbcSeqStart = wbcCur;
307 if (wbcLast == WBP_Numeric) /* WB11,12 */
313 set_brks_to(s, brks, posLast, posCur, len,
314 WORDBREAK_BREAK, get_next_char);
315 wbcSeqStart = wbcCur;
321 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
322 (wbcLast == WBP_ALetter) || /* WB9 */
323 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
325 set_brks_to(s, brks, posLast, posCur, len,
326 WORDBREAK_NOBREAK, get_next_char);
328 /* No rule found, reset */
331 set_brks_to(s, brks, posLast, posCur, len,
332 WORDBREAK_BREAK, get_next_char);
334 wbcSeqStart = wbcCur;
338 case WBP_ExtendNumLet:
340 if ((wbcSeqStart == wbcLast) &&
341 ((wbcLast == WBP_ALetter) ||
342 (wbcLast == WBP_Numeric) ||
343 (wbcLast == WBP_Katakana) ||
344 (wbcLast == WBP_ExtendNumLet)))
346 set_brks_to(s, brks, posLast, posCur, len,
347 WORDBREAK_NOBREAK, get_next_char);
349 /* No rule found, reset */
352 set_brks_to(s, brks, posLast, posCur, len,
353 WORDBREAK_BREAK, get_next_char);
355 wbcSeqStart = wbcCur;
361 if (wbcSeqStart == WBP_Regional)
363 set_brks_to(s, brks, posLast, posCur, len,
364 WORDBREAK_NOBREAK, get_next_char);
366 wbcSeqStart = wbcCur;
371 /* Allow breaks and reset */
372 set_brks_to(s, brks, posLast, posCur, len,
373 WORDBREAK_BREAK, get_next_char);
374 wbcSeqStart = wbcCur;
379 /* Error, should never get here! */
386 ch = get_next_char(s, len, &posNext);
390 set_brks_to(s, brks, posLast, posNext, len,
391 WORDBREAK_BREAK, get_next_char);
395 * Sets the word breaking information for a UTF-8 input string.
397 * @param[in] s input UTF-8 string
398 * @param[in] len length of the input
399 * @param[in] lang language of the input
400 * @param[out] brks pointer to the output breaking data, containing
401 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
402 * #WORDBREAK_INSIDEACHAR
404 void set_wordbreaks_utf8(
410 set_wordbreaks(s, len, lang, brks,
411 (get_next_char_t)lb_get_next_char_utf8);
415 * Sets the word breaking information for a UTF-16 input string.
417 * @param[in] s input UTF-16 string
418 * @param[in] len length of the input
419 * @param[in] lang language of the input
420 * @param[out] brks pointer to the output breaking data, containing
421 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
422 * #WORDBREAK_INSIDEACHAR
424 void set_wordbreaks_utf16(
430 set_wordbreaks(s, len, lang, brks,
431 (get_next_char_t)lb_get_next_char_utf16);
435 * Sets the word breaking information for a UTF-32 input string.
437 * @param[in] s input UTF-32 string
438 * @param[in] len length of the input
439 * @param[in] lang language of the input
440 * @param[out] brks pointer to the output breaking data, containing
441 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
442 * #WORDBREAK_INSIDEACHAR
444 void set_wordbreaks_utf32(
450 set_wordbreaks(s, len, lang, brks,
451 (get_next_char_t)lb_get_next_char_utf32);