Merge remote-tracking branch 'origin/tizen' into new_text
[platform/core/uifw/dali-adaptor.git] / text / dali / internal / libunibreak / linebreak.c
1 /* vim: set tabstop=4 shiftwidth=4: */
2
3 /*
4  * Line breaking in a Unicode sequence.  Designed to be used in a
5  * generic text renderer.
6  *
7  * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
8  *
9  * This software is provided 'as-is', without any express or implied
10  * warranty.  In no event will the author be held liable for any damages
11  * arising from the use of this software.
12  *
13  * Permission is granted to anyone to use this software for any purpose,
14  * including commercial applications, and to alter it and redistribute
15  * it freely, subject to the following restrictions:
16  *
17  * 1. The origin of this software must not be misrepresented; you must
18  *    not claim that you wrote the original software.  If you use this
19  *    software in a product, an acknowledgement in the product
20  *    documentation would be appreciated but is not required.
21  * 2. Altered source versions must be plainly marked as such, and must
22  *    not be misrepresented as being the original software.
23  * 3. This notice may not be removed or altered from any source
24  *    distribution.
25  *
26  * The main reference is Unicode Standard Annex 14 (UAX #14):
27  *              <URL:http://www.unicode.org/reports/tr14/>
28  *
29  * When this library was designed, this annex was at Revision 19, for
30  * Unicode 5.0.0:
31  *              <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
32  *
33  * This library has been updated according to Revision 24, for
34  * Unicode 5.2.0:
35  *              <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
36  *
37  * The Unicode Terms of Use are available at
38  *              <URL:http://www.unicode.org/copyright.html>
39  */
40
41 /**
42  * @file        linebreak.c
43  *
44  * Implementation of the line breaking algorithm as described in Unicode
45  * Standard Annex 14.
46  *
47  * @version     2.0, 2010/01/03
48  * @author      Wu Yongwei
49  */
50
51 #include <assert.h>
52 #include <stddef.h>
53 #include <string.h>
54 #include "linebreak.h"
55 #include "linebreakdef.h"
56
/**
 * Size of the second-level index to the line breaking properties, i.e.
 * the number of entries in the #lb_prop_index array.
 */
#define LINEBREAK_INDEX_SIZE 40

/**
 * Version number of the library.  The value is taken from the
 * LINEBREAK_VERSION macro, which is not defined in this file.
 */
const int linebreak_version = LINEBREAK_VERSION;
66
/**
 * Actions that can be stored in the break action pair table.  Each
 * value describes what is allowed between two adjacent line breaking
 * classes.
 */
enum BreakAction
{
        /** Direct break opportunity */
        DIR_BRK = 0,
        /** Indirect break opportunity */
        IND_BRK = 1,
        /** Indirect break opportunity for combining marks */
        CMI_BRK = 2,
        /** Prohibited break for combining marks */
        CMP_BRK = 3,
        /** Prohibited break */
        PRH_BRK = 4
};
79
80 /**
81  * Break action pair table.  This is a direct mapping of Table 2 of
82  * Unicode Standard Annex 14, Revision 24.
83  */
84 static enum BreakAction baTable[LBP_JT][LBP_JT] = {
85         {       /* OP */
86                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
87                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
88                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
89                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
90         {       /* CL */
91                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
92                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
93                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
94                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
95         {       /* CP */
96                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
97                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
98                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
99                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
100         {       /* QU */
101                 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
102                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
103                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
104                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
105         {       /* GL */
106                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
107                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
108                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
109                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
110         {       /* NS */
111                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
112                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
113                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
114                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
115         {       /* EX */
116                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
117                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
118                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
119                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
120         {       /* SY */
121                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
122                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
123                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
124                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
125         {       /* IS */
126                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
127                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
128                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
129                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
130         {       /* PR */
131                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
132                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
133                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
134                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
135         {       /* PO */
136                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
137                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
138                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
139                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
140         {       /* NU */
141                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
142                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
143                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
144                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
145         {       /* AL */
146                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
147                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
148                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
149                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
150         {       /* ID */
151                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
152                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
153                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
154                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
155         {       /* IN */
156                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
157                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
158                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
159                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
160         {       /* HY */
161                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
162                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
163                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
164                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
165         {       /* BA */
166                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
167                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
168                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
169                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
170         {       /* BB */
171                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
172                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
173                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
174                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
175         {       /* B2 */
176                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
177                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
178                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
179                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
180         {       /* ZW */
181                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
182                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
183                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
184                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
185         {       /* CM */
186                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
187                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
188                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
189                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
190         {       /* WJ */
191                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
192                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
193                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
194                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
195         {       /* H2 */
196                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
197                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
198                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
199                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
200         {       /* H3 */
201                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
202                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
203                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
204                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
205         {       /* JL */
206                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
207                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
208                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
209                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
210         {       /* JV */
211                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
212                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
213                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
214                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
215         {       /* JT */
216                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
217                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
218                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
219                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
220 };
221
222 /**
223  * Struct for the second-level index to the line breaking properties.
224  */
225 struct LineBreakPropertiesIndex
226 {
227         utf32_t end;                                    /**< End coding point */
228         struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
229 };
230
231 /**
232  * Second-level index to the line breaking properties.
233  */
234 static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
235 {
236         { 0xFFFFFFFF, lb_prop_default }
237 };
238
239 /**
240  * Initializes the second-level index to the line breaking properties.
241  * If it is not called, the performance of #get_char_lb_class_lang (and
242  * thus the main functionality) can be pretty bad, especially for big
243  * code points like those of Chinese.
244  */
245 void init_linebreak(void)
246 {
247         size_t i;
248         size_t iPropDefault;
249         size_t len;
250         size_t step;
251
252         len = 0;
253         while (lb_prop_default[len].prop != LBP_Undefined)
254                 ++len;
255         step = len / LINEBREAK_INDEX_SIZE;
256         iPropDefault = 0;
257         for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
258         {
259                 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
260                 iPropDefault += step;
261                 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
262         }
263         lb_prop_index[--i].end = 0xFFFFFFFF;
264 }
265
266 /**
267  * Gets the language-specific line breaking properties.
268  *
269  * @param lang  language of the text
270  * @return              pointer to the language-specific line breaking
271  *                              properties array if found; \c NULL otherwise
272  */
273 static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
274 {
275         struct LineBreakPropertiesLang *lbplIter;
276         if (lang != NULL)
277         {
278                 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
279                 {
280                         if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
281                         {
282                                 return lbplIter->lbp;
283                         }
284                 }
285         }
286         return NULL;
287 }
288
289 /**
290  * Gets the line breaking class of a character from a line breaking
291  * properties array.
292  *
293  * @param ch    character to check
294  * @param lbp   pointer to the line breaking properties array
295  * @return              the line breaking class if found; \c LBP_XX otherwise
296  */
297 static enum LineBreakClass get_char_lb_class(
298                 utf32_t ch,
299                 struct LineBreakProperties *lbp)
300 {
301         while (lbp->prop != LBP_Undefined && ch >= lbp->start)
302         {
303                 if (ch <= lbp->end)
304                         return lbp->prop;
305                 ++lbp;
306         }
307         return LBP_XX;
308 }
309
310 /**
311  * Gets the line breaking class of a character from the default line
312  * breaking properties array.
313  *
314  * @param ch    character to check
315  * @return              the line breaking class if found; \c LBP_XX otherwise
316  */
317 static enum LineBreakClass get_char_lb_class_default(
318                 utf32_t ch)
319 {
320         size_t i = 0;
321         while (ch > lb_prop_index[i].end)
322                 ++i;
323         assert(i < LINEBREAK_INDEX_SIZE);
324         return get_char_lb_class(ch, lb_prop_index[i].lbp);
325 }
326
327 /**
328  * Gets the line breaking class of a character for a specific
329  * language.  This function will check the language-specific data first,
330  * and then the default data if there is no language-specific property
331  * available for the character.
332  *
333  * @param ch            character to check
334  * @param lbpLang       pointer to the language-specific line breaking
335  *                                      properties array
336  * @return                      the line breaking class if found; \c LBP_XX
337  *                                      otherwise
338  */
339 static enum LineBreakClass get_char_lb_class_lang(
340                 utf32_t ch,
341                 struct LineBreakProperties *lbpLang)
342 {
343         enum LineBreakClass lbcResult;
344
345         /* Find the language-specific line breaking class for a character */
346         if (lbpLang)
347         {
348                 lbcResult = get_char_lb_class(ch, lbpLang);
349                 if (lbcResult != LBP_XX)
350                         return lbcResult;
351         }
352
353         /* Find the generic language-specific line breaking class, if no
354          * language context is provided, or language-specific data are not
355          * available for the specific character in the specified language */
356         return get_char_lb_class_default(ch);
357 }
358
359 /**
360  * Resolves the line breaking class for certain ambiguous or complicated
361  * characters.  They are treated in a simplistic way in this
362  * implementation.
363  *
364  * @param lbc   line breaking class to resolve
365  * @param lang  language of the text
366  * @return              the resolved line breaking class
367  */
368 static enum LineBreakClass resolve_lb_class(
369                 enum LineBreakClass lbc,
370                 const char *lang)
371 {
372         switch (lbc)
373         {
374         case LBP_AI:
375                 if (lang != NULL &&
376                                 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
377                                  strncmp(lang, "ja", 2) == 0 || /* Japanese */
378                                  strncmp(lang, "ko", 2) == 0))  /* Korean */
379                 {
380                         return LBP_ID;
381                 }
382                 /* Fall through */
383         case LBP_SA:
384         case LBP_SG:
385         case LBP_XX:
386                 return LBP_AL;
387         default:
388                 return lbc;
389         }
390 }
391
392 /**
393  * Gets the next Unicode character in a UTF-8 sequence.  The index will
394  * be advanced to the next complete character, unless the end of string
395  * is reached in the middle of a UTF-8 sequence.
396  *
397  * @param[in]     s             input UTF-8 string
398  * @param[in]     len   length of the string in bytes
399  * @param[in,out] ip    pointer to the index
400  * @return                              the Unicode character beginning at the index; or
401  *                                              #EOS if end of input is encountered
402  */
403 utf32_t lb_get_next_char_utf8(
404                 const utf8_t *s,
405                 size_t len,
406                 size_t *ip)
407 {
408         utf8_t ch;
409         utf32_t res;
410
411         assert(*ip <= len);
412         if (*ip == len)
413                 return EOS;
414         ch = s[*ip];
415
416         if (ch < 0xC2 || ch > 0xF4)
417         {       /* One-byte sequence, tail (should not occur), or invalid */
418                 *ip += 1;
419                 return ch;
420         }
421         else if (ch < 0xE0)
422         {       /* Two-byte sequence */
423                 if (*ip + 2 > len)
424                         return EOS;
425                 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
426                 *ip += 2;
427                 return res;
428         }
429         else if (ch < 0xF0)
430         {       /* Three-byte sequence */
431                 if (*ip + 3 > len)
432                         return EOS;
433                 res = ((ch & 0x0F) << 12) +
434                           ((s[*ip + 1] & 0x3F) << 6) +
435                           ((s[*ip + 2] & 0x3F));
436                 *ip += 3;
437                 return res;
438         }
439         else
440         {       /* Four-byte sequence */
441                 if (*ip + 4 > len)
442                         return EOS;
443                 res = ((ch & 0x07) << 18) +
444                           ((s[*ip + 1] & 0x3F) << 12) +
445                           ((s[*ip + 2] & 0x3F) << 6) +
446                           ((s[*ip + 3] & 0x3F));
447                 *ip += 4;
448                 return res;
449         }
450 }
451
452 /**
453  * Gets the next Unicode character in a UTF-16 sequence.  The index will
454  * be advanced to the next complete character, unless the end of string
455  * is reached in the middle of a UTF-16 surrogate pair.
456  *
457  * @param[in]     s             input UTF-16 string
458  * @param[in]     len   length of the string in words
459  * @param[in,out] ip    pointer to the index
460  * @return                              the Unicode character beginning at the index; or
461  *                                              #EOS if end of input is encountered
462  */
463 utf32_t lb_get_next_char_utf16(
464                 const utf16_t *s,
465                 size_t len,
466                 size_t *ip)
467 {
468         utf16_t ch;
469
470         assert(*ip <= len);
471         if (*ip == len)
472                 return EOS;
473         ch = s[(*ip)++];
474
475         if (ch < 0xD800 || ch > 0xDBFF)
476         {       /* If the character is not a high surrogate */
477                 return ch;
478         }
479         if (*ip == len)
480         {       /* If the input ends here (an error) */
481                 --(*ip);
482                 return EOS;
483         }
484         if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
485         {       /* If the next character is not the low surrogate (an error) */
486                 return ch;
487         }
488         /* Return the constructed character and advance the index again */
489         return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
490 }
491
492 /**
493  * Gets the next Unicode character in a UTF-32 sequence.  The index will
494  * be advanced to the next character.
495  *
496  * @param[in]     s             input UTF-32 string
497  * @param[in]     len   length of the string in dwords
498  * @param[in,out] ip    pointer to the index
499  * @return                              the Unicode character beginning at the index; or
500  *                                              #EOS if end of input is encountered
501  */
502 utf32_t lb_get_next_char_utf32(
503                 const utf32_t *s,
504                 size_t len,
505                 size_t *ip)
506 {
507         assert(*ip <= len);
508         if (*ip == len)
509                 return EOS;
510         return s[(*ip)++];
511 }
512
513 /**
514  * Sets the line breaking information for a generic input string.
515  *
516  * @param[in]  s                        input string
517  * @param[in]  len                      length of the input
518  * @param[in]  lang                     language of the input
519  * @param[out] brks                     pointer to the output breaking data,
520  *                                                      containing #LINEBREAK_MUSTBREAK,
521  *                                                      #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
522  *                                                      or #LINEBREAK_INSIDEACHAR
523  * @param[in] get_next_char     function to get the next UTF-32 character
524  */
525 void set_linebreaks(
526                 const void *s,
527                 size_t len,
528                 const char *lang,
529                 char *brks,
530                 get_next_char_t get_next_char)
531 {
532         utf32_t ch;
533         enum LineBreakClass lbcCur;
534         enum LineBreakClass lbcNew;
535         enum LineBreakClass lbcLast;
536         struct LineBreakProperties *lbpLang;
537         size_t posCur = 0;
538         size_t posLast = 0;
539         // TIZEN ONLY : (2013.08.19) for special processing at Zero-width space character
540         int zw_flag = 0;
541         //
542
543         --posLast;      /* To be ++'d later */
544         ch = get_next_char(s, len, &posCur);
545         if (ch == EOS)
546                 return;
547         lbpLang = get_lb_prop_lang(lang);
548         lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
549         lbcNew = LBP_Undefined;
550
551 nextline:
552
553         /* Special treatment for the first character */
554         switch (lbcCur)
555         {
556         case LBP_LF:
557         case LBP_NL:
558                 lbcCur = LBP_BK;
559                 break;
560         case LBP_CB:
561                 lbcCur = LBP_BA;
562                 break;
563         case LBP_SP:
564                 lbcCur = LBP_WJ;
565                 break;
566         default:
567                 break;
568         }
569
570         /* Process a line till an explicit break or end of string */
571         for (;;)
572         {
573                 for (++posLast; posLast < posCur - 1; ++posLast)
574                 {
575                         brks[posLast] = LINEBREAK_INSIDEACHAR;
576                 }
577                 assert(posLast == posCur - 1);
578                 lbcLast = lbcNew;
579                 ch = get_next_char(s, len, &posCur);
580                 if (ch == EOS)
581                         break;
582                 lbcNew = get_char_lb_class_lang(ch, lbpLang);
583                 if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
584                 {
585                         brks[posLast] = LINEBREAK_MUSTBREAK;
586                         lbcCur = resolve_lb_class(lbcNew, lang);
587                         goto nextline;
588                 }
589
590                 // TIZEN ONLY : (2013.08.19) for special processing at Zero-width space character
591                 /*
592                 switch (lbcNew)
593                 {
594                 case LBP_SP:
595                         brks[posLast] = LINEBREAK_NOBREAK;
596                         continue;
597                 case LBP_BK:
598                 case LBP_LF:
599                 case LBP_NL:
600                         brks[posLast] = LINEBREAK_NOBREAK;
601                         lbcCur = LBP_BK;
602                         continue;
603                 case LBP_CR:
604                         brks[posLast] = LINEBREAK_NOBREAK;
605                         lbcCur = LBP_CR;
606                         continue;
607                 case LBP_CB:
608                         brks[posLast] = LINEBREAK_ALLOWBREAK;
609                         lbcCur = LBP_BA;
610                         continue;
611                 default:
612                         break;
613                 }
614
615                 lbcNew = resolve_lb_class(lbcNew, lang);
616
617                 assert(lbcCur <= LBP_JT);
618                 assert(lbcNew <= LBP_JT);
619                 switch (baTable[lbcCur - 1][lbcNew - 1])
620                 {
621                 case DIR_BRK:
622                         brks[posLast] = LINEBREAK_ALLOWBREAK;
623                         break;
624                 case CMI_BRK:
625                 case IND_BRK:
626                         if (lbcLast == LBP_SP)
627                         {
628                                 brks[posLast] = LINEBREAK_ALLOWBREAK;
629                         }
630                         else
631                         {
632                                 brks[posLast] = LINEBREAK_NOBREAK;
633                         }
634                         break;
635                 case CMP_BRK:
636                         brks[posLast] = LINEBREAK_NOBREAK;
637                         if (lbcLast != LBP_SP)
638                                 continue;
639                         break;
640                 case PRH_BRK:
641                         brks[posLast] = LINEBREAK_NOBREAK;
642                         break;
643                 }
644
645                 lbcCur = lbcNew;
646                 */
647
648                 // TIZEN ONLY - START
649                 if (lbcCur == LBP_ZW && !zw_flag)
650                 {
651                         zw_flag = 1;
652                         posLast = -1;
653                         posCur = 0;
654                         ch = get_next_char(s, len, &posCur);
655                         lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
656                         lbcNew = LBP_Undefined;
657                         goto nextline;
658                 }
659                 else if (zw_flag)
660                 {
661                         if (lbcCur == LBP_ZW)
662                                 brks[posLast] = LINEBREAK_ALLOWBREAK;
663                         else
664                                 brks[posLast] = LINEBREAK_NOBREAK;
665                         lbcCur = lbcNew;
666                 }
667                 else
668                 {
669                         // TIZEN ONLY(20131106): For Hangul word wrap
670                         switch (lbcCur)
671                         {
672                                 case LBP_H2:                    /**< Hangul LV */
673                                 case LBP_H3:                    /**< Hangul LVT */
674                                 case LBP_JL:                    /**< Hangul L Jamo */
675                                 case LBP_JV:                    /**< Hangul V Jamo */
676                                 case LBP_JT:                    /**< Hangul T Jamo */
677                                         lbcCur = LBP_AL;
678                                         break;
679                                 default:
680                                         break;
681                         }
682
683                         switch (lbcNew)
684                         {
685                                 case LBP_H2:                    /**< Hangul LV */
686                                 case LBP_H3:                    /**< Hangul LVT */
687                                 case LBP_JL:                    /**< Hangul L Jamo */
688                                 case LBP_JV:                    /**< Hangul V Jamo */
689                                 case LBP_JT:                    /**< Hangul T Jamo */
690                                         lbcNew = LBP_AL;
691                                         break;
692                                 default:
693                                         break;
694                         }
695                         //
696
697                         switch (lbcNew)
698                         {
699                                 case LBP_SP:
700                                         brks[posLast] = LINEBREAK_NOBREAK;
701                                         continue;
702                                 case LBP_BK:
703                                 case LBP_LF:
704                                 case LBP_NL:
705                                         brks[posLast] = LINEBREAK_NOBREAK;
706                                         lbcCur = LBP_BK;
707                                         continue;
708                                 case LBP_CR:
709                                         brks[posLast] = LINEBREAK_NOBREAK;
710                                         lbcCur = LBP_CR;
711                                         continue;
712                                 case LBP_CB:
713                                         brks[posLast] = LINEBREAK_ALLOWBREAK;
714                                         lbcCur = LBP_BA;
715                                         continue;
716                                 default:
717                                         break;
718                         }
719
720                         lbcNew = resolve_lb_class(lbcNew, lang);
721
722                         assert(lbcCur <= LBP_JT);
723                         assert(lbcNew <= LBP_JT);
724                         switch (baTable[lbcCur - 1][lbcNew - 1])
725                         {
726                                 case DIR_BRK:
727                                         brks[posLast] = LINEBREAK_ALLOWBREAK;
728                                         break;
729                                 case CMI_BRK:
730                                 case IND_BRK:
731                                         if (lbcLast == LBP_SP)
732                                         {
733                                                 brks[posLast] = LINEBREAK_ALLOWBREAK;
734                                         }
735                                         else
736                                         {
737                                                 brks[posLast] = LINEBREAK_NOBREAK;
738                                         }
739                                         break;
740                                 case CMP_BRK:
741                                         brks[posLast] = LINEBREAK_NOBREAK;
742                                         if (lbcLast != LBP_SP)
743                                                 continue;
744                                         break;
745                                 case PRH_BRK:
746                                         brks[posLast] = LINEBREAK_NOBREAK;
747                                         break;
748                         }
749                         lbcCur = lbcNew;
750                 }
751                 // TIZEN ONLY - END
752         }
753
754         assert(posLast == posCur - 1 && posCur <= len);
755         /* Break after the last character */
756         brks[posLast] = LINEBREAK_MUSTBREAK;
757         /* When the input contains incomplete sequences */
758         while (posCur < len)
759         {
760                 brks[posCur++] = LINEBREAK_INSIDEACHAR;
761         }
762 }
763
764 /**
765  * Sets the line breaking information for a UTF-8 input string.
766  *
767  * @param[in]  s        input UTF-8 string
768  * @param[in]  len      length of the input
769  * @param[in]  lang     language of the input
770  * @param[out] brks     pointer to the output breaking data, containing
771  *                                      #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
772  *                                      #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
773  */
774 void set_linebreaks_utf8(
775                 const utf8_t *s,
776                 size_t len,
777                 const char *lang,
778                 char *brks)
779 {
780         set_linebreaks(s, len, lang, brks,
781                                    (get_next_char_t)lb_get_next_char_utf8);
782 }
783
784 /**
785  * Sets the line breaking information for a UTF-16 input string.
786  *
787  * @param[in]  s        input UTF-16 string
788  * @param[in]  len      length of the input
789  * @param[in]  lang     language of the input
790  * @param[out] brks     pointer to the output breaking data, containing
791  *                                      #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
792  *                                      #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
793  */
794 void set_linebreaks_utf16(
795                 const utf16_t *s,
796                 size_t len,
797                 const char *lang,
798                 char *brks)
799 {
800         set_linebreaks(s, len, lang, brks,
801                                    (get_next_char_t)lb_get_next_char_utf16);
802 }
803
804 /**
805  * Sets the line breaking information for a UTF-32 input string.
806  *
807  * @param[in]  s        input UTF-32 string
808  * @param[in]  len      length of the input
809  * @param[in]  lang     language of the input
810  * @param[out] brks     pointer to the output breaking data, containing
811  *                                      #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
812  *                                      #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
813  */
814 void set_linebreaks_utf32(
815                 const utf32_t *s,
816                 size_t len,
817                 const char *lang,
818                 char *brks)
819 {
820         set_linebreaks(s, len, lang, brks,
821                                    (get_next_char_t)lb_get_next_char_utf32);
822 }
823
824 /**
825  * Tells whether a line break can occur between two Unicode characters.
826  * This is a wrapper function to expose a simple interface.  Generally
827  * speaking, it is better to use #set_linebreaks_utf32 instead, since
828  * complicated cases involving combining marks, spaces, etc. cannot be
829  * correctly processed.
830  *
831  * @param char1 the first Unicode character
832  * @param char2 the second Unicode character
833  * @param lang  language of the input
834  * @return      one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
835  *                              #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
836  */
837 int is_line_breakable(
838                 utf32_t char1,
839                 utf32_t char2,
840                 const char* lang)
841 {
842         utf32_t s[2];
843         char brks[2];
844         s[0] = char1;
845         s[1] = char2;
846         set_linebreaks_utf32(s, 2, lang, brks);
847         return brks[0];
848 }