1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
7 * Copyright (C) 2013-2015 Tom Hacohen <tom at stosb dot com>
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source distribution.
26 * The main reference is Unicode Standard Annex 29 (UAX #29):
27 * <URL:http://unicode.org/reports/tr29>
29 * When this library was designed, this annex was at Revision 17, for Unicode 6.0.0:
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
33 * This library has been updated according to Revision 25, for Unicode 7.0.0:
35 * <URL:http://www.unicode.org/reports/tr29/tr29-25.html>
37 * The Unicode Terms of Use are available at
38 * <URL:http://www.unicode.org/copyright.html>
44 * Implementation of the word breaking algorithm as described in Unicode Standard Annex 29.
47 * @version 2.6, 2015/04/18
54 #include "unibreakdef.h"
55 #include "wordbreak.h"
56 #include "wordbreakdata.c"
/**
 * Number of elements in the array \a x.
 *
 * \a x must be a true array object (not a pointer or an array parameter,
 * which has already decayed to a pointer).  The argument is parenthesized
 * in the element-size term so that an argument expression such as a cast
 * or a dereference is subscripted as a whole, instead of the subscript
 * binding to its last operand only.  \a x is not evaluated.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
61 * Initializes the wordbreak internals. It currently does nothing, but
62 * it may in the future.
/* Public, argument-less initialisation hook.
 * NOTE(review): only the declarator line is visible in this view; going by
 * the description above the body is expected to be empty -- confirm. */
64 void init_wordbreak(void)
69 * Gets the word breaking class of a character.
71 * @param ch character to check
72 * @param wbp pointer to the wbp breaking properties array
73 * @param len size of the wbp array in number of items
74 * @return the word breaking class if found; \c WBP_Any otherwise
/* Looks ch up in a table whose entries each cover an inclusive code point
 * range [start, end].
 * NOTE(review): the midpoint probe below only works if wbp is ordered by
 * range and the ranges do not overlap -- presumably guaranteed by the
 * generated data table; confirm.  The loop header, the window updates and
 * the return statements are not visible in this view. */
76 static enum WordBreakClass get_char_wb_class(
78 struct WordBreakProperties *wbp,
/* Probe the middle entry of the current [min, max] index window. */
87 mid = (min + max) / 2;
/* ch below the probed range or above it: the two branches that narrow the
 * window; a character that is neither lies inside entry mid. */
89 if (ch < wbp[mid].start)
91 else if (ch > wbp[mid].end)
/* Walk-through of the visible statements:
 *   1. posNext starts at posStart and the loop runs while posNext < posEnd.
 *   2. get_next_char() is handed &posNext; the loop condition relies on it
 *      advancing posNext past the character it decodes.
 *   3. Every cell from posStart up to, but excluding, posNext - 1 is set to
 *      WORDBREAK_INSIDEACHAR, i.e. all code units of the character except
 *      its last one.
 *   4. The last cell (posNext - 1) receives brkType, unless it already
 *      holds WORDBREAK_NOBREAK, which is never overwritten.
 * NOTE(review): posNext is a size_t, so posNext - 1 wraps around if the
 * decoder ever leaves posNext at 0; this presumably cannot happen because
 * the decoder always consumes at least one unit -- confirm. */
102 * Sets the word break types to a specific value in a range.
104 * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
105 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
106 * cells that we really don't want to break after.
108 * @param[in] s input string
109 * @param[out] brks breaks array to fill
110 * @param[in] posStart start position
111 * @param[in] posEnd end position (exclusive)
112 * @param[in] len length of the string
113 * @param[in] brkType breaks type to use
114 * @param[in] get_next_char function to get the next UTF-32 character
116 static void set_brks_to(
123 get_next_char_t get_next_char)
125 size_t posNext = posStart;
126 while (posNext < posEnd)
129 ch = get_next_char(s, len, &posNext);
131 for (; posStart < posNext - 1; ++posStart)
132 brks[posStart] = WORDBREAK_INSIDEACHAR;
133 assert(posStart == posNext - 1);
135 /* Only set it if we haven't set it not to break before. */
136 if (brks[posStart] != WORDBREAK_NOBREAK)
137 brks[posStart] = brkType;
/* NOTE(review): no statement moving posStart on to posNext for the next
 * iteration is visible in this view; the assert above needs one whenever
 * the range holds more than one character -- confirm it is present. */
142 /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
/* cls is expanded once per comparison and without parentheses, so pass a
 * plain variable with no side effects.  The definition continues on a
 * further line that is not visible in this view. */
143 #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
147 * Sets the word breaking information for a generic input string.
149 * @param[in] s input string
150 * @param[in] len length of the input
151 * @param[in] lang language of the input
152 * @param[out] brks pointer to the output breaking data, containing
153 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
154 * #WORDBREAK_INSIDEACHAR
155 * @param[in] get_next_char function to get the next UTF-32 character
/* Shared engine behind set_wordbreaks_utf8/utf16/utf32: it decodes the
 * input through get_next_char, classifies each character with
 * get_char_wb_class() against wb_prop_default, and records the break
 * decisions in brks by calling set_brks_to() on the span posLast..posCur.
 * No visible statement reads lang (see the TODO below).
 * NOTE(review): many short lines of this function are missing from this
 * view (remaining parameters, loop header, case labels, braces,
 * break/continue), so the notes below cover visible statements only. */
157 static void set_wordbreaks(
162 get_next_char_t get_next_char)
/* Starts out as WBP_Undefined; presumably tracks the class of the
 * previously handled character (its update is not visible) -- confirm. */
164 enum WordBreakClass wbcLast = WBP_Undefined;
165 /* wbcSeqStart is the class that started the current sequence.
166 * WBP_Undefined is a special case that means "sot".
167 * This value is the class that is at the start of the current rule
168 * matching sequence. For example, in case of Numeric+MidNum+Numeric
169 * it'll be Numeric all the way.
171 enum WordBreakClass wbcSeqStart = WBP_Undefined;
177 /* TODO: Language-specific specialization. */
/* Default every cell to WORDBREAK_BREAK; the rules below overwrite cells
 * with WORDBREAK_NOBREAK or WORDBREAK_INSIDEACHAR.  This writes len bytes,
 * so brks must provide at least len entries. */
181 memset(brks, WORDBREAK_BREAK, len);
/* Fetch the first character; the fetch for every following character is
 * the get_next_char() call near the end of the function. */
183 ch = get_next_char(s, len, &posNext);
187 enum WordBreakClass wbcCur;
/* Classify the current character using the default property table. */
188 wbcCur = get_char_wb_class(ch, wb_prop_default,
189 ARRAY_LEN(wb_prop_default));
195 set_brks_to(s, brks, posLast, posCur, len,
196 WORDBREAK_BREAK, get_next_char);
197 wbcSeqStart = wbcCur;
202 if (wbcSeqStart == WBP_CR) /* WB3 */
204 set_brks_to(s, brks, posLast, posCur, len,
205 WORDBREAK_NOBREAK, get_next_char);
206 wbcSeqStart = wbcCur;
214 set_brks_to(s, brks, posLast, posCur, len,
215 WORDBREAK_BREAK, get_next_char);
216 wbcSeqStart = wbcCur;
222 /* WB4 - If not the first char/after a newline (WB3a,3b), skip
223 * this class, set it to be the same as the prev, and mark
224 * brks not to break before them. */
225 if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
227 set_brks_to(s, brks, posLast, posCur, len,
228 WORDBREAK_BREAK, get_next_char);
229 wbcSeqStart = wbcCur;
233 /* It's surely not the first */
/* Forbid a break at the cell immediately before posCur. */
234 brks[posCur - 1] = WORDBREAK_NOBREAK;
235 /* "inherit" the previous class. */
241 if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
242 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
244 set_brks_to(s, brks, posLast, posCur, len,
245 WORDBREAK_NOBREAK, get_next_char);
247 /* No rule found, reset */
250 set_brks_to(s, brks, posLast, posCur, len,
251 WORDBREAK_BREAK, get_next_char);
253 wbcSeqStart = wbcCur;
257 case WBP_Hebrew_Letter:
/* After a Hebrew letter followed by a double quote, the span is joined
 * only when the current class is itself Hebrew_Letter; otherwise a break
 * is recorded.
 * NOTE(review): the inner wbcCur test suggests this case label is shared
 * with another class whose label is not visible here -- confirm. */
259 if ((wbcSeqStart == WBP_Hebrew_Letter) &&
260 (wbcLast == WBP_Double_Quote)) /* WB7b,c */
262 if (wbcCur == WBP_Hebrew_Letter)
264 set_brks_to(s, brks, posLast, posCur, len,
265 WORDBREAK_NOBREAK, get_next_char);
269 set_brks_to(s, brks, posLast, posCur, len,
270 WORDBREAK_BREAK, get_next_char);
273 else if (((wbcSeqStart == WBP_ALetter) ||
274 (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
275 (wbcLast == WBP_Numeric) || /* WB10 */
276 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
278 set_brks_to(s, brks, posLast, posCur, len,
279 WORDBREAK_NOBREAK, get_next_char);
281 /* No rule found, reset */
284 set_brks_to(s, brks, posLast, posCur, len,
285 WORDBREAK_BREAK, get_next_char);
287 wbcSeqStart = wbcCur;
291 case WBP_Single_Quote:
292 if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
294 set_brks_to(s, brks, posLast, posCur, len,
295 WORDBREAK_NOBREAK, get_next_char);
296 wbcSeqStart = wbcCur;
299 /* No break on purpose */
301 if (((wbcLast == WBP_ALetter) ||
302 (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
303 (wbcLast == WBP_Numeric)) /* WB11,12 */
309 set_brks_to(s, brks, posLast, posCur, len,
310 WORDBREAK_BREAK, get_next_char);
311 wbcSeqStart = wbcCur;
317 if ((wbcLast == WBP_ALetter) ||
318 (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
324 set_brks_to(s, brks, posLast, posCur, len,
325 WORDBREAK_BREAK, get_next_char);
326 wbcSeqStart = wbcCur;
332 if (wbcLast == WBP_Numeric) /* WB11,12 */
338 set_brks_to(s, brks, posLast, posCur, len,
339 WORDBREAK_BREAK, get_next_char);
340 wbcSeqStart = wbcCur;
346 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
347 ((wbcLast == WBP_ALetter) ||
348 (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
349 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
351 set_brks_to(s, brks, posLast, posCur, len,
352 WORDBREAK_NOBREAK, get_next_char);
354 /* No rule found, reset */
357 set_brks_to(s, brks, posLast, posCur, len,
358 WORDBREAK_BREAK, get_next_char);
360 wbcSeqStart = wbcCur;
364 case WBP_ExtendNumLet:
/* Join only when the sequence so far is a single class (wbcSeqStart ==
 * wbcLast) and that class is one of the five listed; this class list
 * matches rule WB13a of UAX #29. */
366 if ((wbcSeqStart == wbcLast) &&
367 ((wbcLast == WBP_ALetter) ||
368 (wbcLast == WBP_Hebrew_Letter) ||
369 (wbcLast == WBP_Numeric) ||
370 (wbcLast == WBP_Katakana) ||
371 (wbcLast == WBP_ExtendNumLet)))
373 set_brks_to(s, brks, posLast, posCur, len,
374 WORDBREAK_NOBREAK, get_next_char);
376 /* No rule found, reset */
379 set_brks_to(s, brks, posLast, posCur, len,
380 WORDBREAK_BREAK, get_next_char);
382 wbcSeqStart = wbcCur;
386 case WBP_Regional_Indicator:
/* A regional indicator that follows a sequence started by a regional
 * indicator is kept attached to it. */
388 if (wbcSeqStart == WBP_Regional_Indicator)
390 set_brks_to(s, brks, posLast, posCur, len,
391 WORDBREAK_NOBREAK, get_next_char);
393 wbcSeqStart = wbcCur;
397 case WBP_Double_Quote:
398 if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
404 set_brks_to(s, brks, posLast, posCur, len,
405 WORDBREAK_BREAK, get_next_char);
406 wbcSeqStart = wbcCur;
412 /* Allow breaks and reset */
413 set_brks_to(s, brks, posLast, posCur, len,
414 WORDBREAK_BREAK, get_next_char);
415 wbcSeqStart = wbcCur;
420 /* Error, should never get here! */
/* Advance to the next character of the input. */
427 ch = get_next_char(s, len, &posNext);
/* Flush the tail: the cells from posLast up to posNext get
 * WORDBREAK_BREAK, except those set_brks_to() leaves alone because they
 * already hold WORDBREAK_NOBREAK. */
431 set_brks_to(s, brks, posLast, posNext, len,
432 WORDBREAK_BREAK, get_next_char);
436 * Sets the word breaking information for a UTF-8 input string.
438 * @param[in] s input UTF-8 string
439 * @param[in] len length of the input
440 * @param[in] lang language of the input
441 * @param[out] brks pointer to the output breaking data, containing
442 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
443 * #WORDBREAK_INSIDEACHAR
/* Public entry point for UTF-8 text.  The parameter list is not visible
 * in this view. */
445 void set_wordbreaks_utf8(
/* Forward everything unchanged to the generic engine, selecting the UTF-8
 * decoder.
 * NOTE(review): the decoder is called through a cast function pointer;
 * that is only well-defined if its real type is compatible with
 * get_next_char_t -- confirm against the declarations in unibreakdef.h. */
451 set_wordbreaks(s, len, lang, brks,
452 (get_next_char_t)ub_get_next_char_utf8);
456 * Sets the word breaking information for a UTF-16 input string.
458 * @param[in] s input UTF-16 string
459 * @param[in] len length of the input
460 * @param[in] lang language of the input
461 * @param[out] brks pointer to the output breaking data, containing
462 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
463 * #WORDBREAK_INSIDEACHAR
/* Public entry point for UTF-16 text.  The parameter list is not visible
 * in this view. */
465 void set_wordbreaks_utf16(
/* Forward everything unchanged to the generic engine, selecting the UTF-16
 * decoder.
 * NOTE(review): the decoder is called through a cast function pointer;
 * that is only well-defined if its real type is compatible with
 * get_next_char_t -- confirm against the declarations in unibreakdef.h. */
471 set_wordbreaks(s, len, lang, brks,
472 (get_next_char_t)ub_get_next_char_utf16);
476 * Sets the word breaking information for a UTF-32 input string.
478 * @param[in] s input UTF-32 string
479 * @param[in] len length of the input
480 * @param[in] lang language of the input
481 * @param[out] brks pointer to the output breaking data, containing
482 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
483 * #WORDBREAK_INSIDEACHAR
/* Public entry point for UTF-32 text.  The parameter list is not visible
 * in this view. */
485 void set_wordbreaks_utf32(
/* Forward everything unchanged to the generic engine, selecting the UTF-32
 * decoder.
 * NOTE(review): the decoder is called through a cast function pointer;
 * that is only well-defined if its real type is compatible with
 * get_next_char_t -- confirm against the declarations in unibreakdef.h. */
491 set_wordbreaks(s, len, lang, brks,
492 (get_next_char_t)ub_get_next_char_utf32);