text/dali/internal/libunibreak/linebreak.c

   1 /* vim: set tabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Line breaking in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
   8  *
   9  * This software is provided 'as-is', without any express or implied
  10  * warranty.  In no event will the author be held liable for any damages
  11  * arising from the use of this software.
  12  *
  13  * Permission is granted to anyone to use this software for any purpose,
  14  * including commercial applications, and to alter it and redistribute
  15  * it freely, subject to the following restrictions:
  16  *
  17  * 1. The origin of this software must not be misrepresented; you must
  18  *    not claim that you wrote the original software.  If you use this
  19  *    software in a product, an acknowledgement in the product
  20  *    documentation would be appreciated but is not required.
  21  * 2. Altered source versions must be plainly marked as such, and must
  22  *    not be misrepresented as being the original software.
  23  * 3. This notice may not be removed or altered from any source
  24  *    distribution.
  25  *
  26  * The main reference is Unicode Standard Annex 14 (UAX #14):
  27  *              <URL:http://www.unicode.org/reports/tr14/>
  28  *
  29  * When this library was designed, this annex was at Revision 19, for
  30  * Unicode 5.0.0:
  31  *              <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
  32  *
  33  * This library has been updated according to Revision 24, for
  34  * Unicode 5.2.0:
  35  *              <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
  36  *
  37  * The Unicode Terms of Use are available at
  38  *              <URL:http://www.unicode.org/copyright.html>
  39  */
  40
  41 /**
  42  * @file        linebreak.c
  43  *
  44  * Implementation of the line breaking algorithm as described in Unicode
  45  * Standard Annex 14.
  46  *
  47  * @version     2.0, 2010/01/03
  48  * @author      Wu Yongwei
  49  */
  50
  51 #include <assert.h>
  52 #include <stddef.h>
  53 #include <string.h>
  54 #include "linebreak.h"
  55 #include "linebreakdef.h"
  56
  57 /**
  58  * Size of the second-level index to the line breaking properties.
  59  */
  60 #define LINEBREAK_INDEX_SIZE 40
  61
  62 /**
  63  * Version number of the library.
  64  */
  65 const int linebreak_version = LINEBREAK_VERSION;
  66
  67 /**
  68  * Enumeration of break actions.  They are used in the break action
  69  * pair table below.
  70  */
  71 enum BreakAction
  72 {
  73         DIR_BRK,                /**< Direct break opportunity */
  74         IND_BRK,                /**< Indirect break opportunity */
  75         CMI_BRK,                /**< Indirect break opportunity for combining marks */
  76         CMP_BRK,                /**< Prohibited break for combining marks */
  77         PRH_BRK                 /**< Prohibited break */
  78 };
  79
  80 /**
  81  * Break action pair table.  This is a direct mapping of Table 2 of
  82  * Unicode Standard Annex 14, Revision 24.
  83  */
  84 static enum BreakAction baTable[LBP_JT][LBP_JT] = {
  85         {       /* OP */
  86                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  87                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  88                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
  89                 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
  90         {       /* CL */
  91                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
  92                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
  93                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  94                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
  95         {       /* CP */
  96                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
  97                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
  98                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
  99                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 100         {       /* QU */
 101                 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 102                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 103                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
 104                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
 105         {       /* GL */
 106                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 107                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 108                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
 109                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
 110         {       /* NS */
 111                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 112                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 113                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 114                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 115         {       /* EX */
 116                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 117                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 118                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 119                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 120         {       /* SY */
 121                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 122                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
 123                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 124                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 125         {       /* IS */
 126                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 127                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
 128                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 129                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 130         {       /* PR */
 131                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 132                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 133                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 134                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
 135         {       /* PO */
 136                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 137                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
 138                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 139                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 140         {       /* NU */
 141                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 142                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
 143                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 144                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 145         {       /* AL */
 146                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 147                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
 148                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 149                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 150         {       /* ID */
 151                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 152                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 153                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 154                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 155         {       /* IN */
 156                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 157                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 158                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 159                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 160         {       /* HY */
 161                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
 162                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
 163                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 164                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 165         {       /* BA */
 166                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
 167                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 168                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 169                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 170         {       /* BB */
 171                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 172                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 173                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
 174                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
 175         {       /* B2 */
 176                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 177                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 178                 DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
 179                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 180         {       /* ZW */
 181                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 182                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 183                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
 184                 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 185         {       /* CM */
 186                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 187                 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
 188                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 189                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 190         {       /* WJ */
 191                 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 192                 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 193                 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
 194                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
 195         {       /* H2 */
 196                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 197                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 198                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 199                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
 200         {       /* H3 */
 201                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 202                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 203                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 204                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
 205         {       /* JL */
 206                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 207                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 208                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 209                 PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
 210         {       /* JV */
 211                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 212                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 213                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 214                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
 215         {       /* JT */
 216                 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 217                 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 218                 IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 219                 PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
 220 };
 221
 222 /**
 223  * Struct for the second-level index to the line breaking properties.
 224  */
 225 struct LineBreakPropertiesIndex
 226 {
 227         utf32_t end;                                    /**< End coding point */
 228         struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
 229 };
 230
 231 /**
 232  * Second-level index to the line breaking properties.
 233  */
 234 static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
 235 {
 236         { 0xFFFFFFFF, lb_prop_default }
 237 };
 238
 239 /**
 240  * Initializes the second-level index to the line breaking properties.
 241  * If it is not called, the performance of #get_char_lb_class_lang (and
 242  * thus the main functionality) can be pretty bad, especially for big
 243  * code points like those of Chinese.
 244  */
 245 void init_linebreak(void)
 246 {
 247         size_t i;
 248         size_t iPropDefault;
 249         size_t len;
 250         size_t step;
 251
 252         len = 0;
 253         while (lb_prop_default[len].prop != LBP_Undefined)
 254                 ++len;
 255         step = len / LINEBREAK_INDEX_SIZE;
 256         iPropDefault = 0;
 257         for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
 258         {
 259                 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
 260                 iPropDefault += step;
 261                 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
 262         }
 263         lb_prop_index[--i].end = 0xFFFFFFFF;
 264 }
 265
 266 /**
 267  * Gets the language-specific line breaking properties.
 268  *
 269  * @param lang  language of the text
 270  * @return              pointer to the language-specific line breaking
 271  *                              properties array if found; \c NULL otherwise
 272  */
 273 static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
 274 {
 275         struct LineBreakPropertiesLang *lbplIter;
 276         if (lang != NULL)
 277         {
 278                 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
 279                 {
 280                         if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
 281                         {
 282                                 return lbplIter->lbp;
 283                         }
 284                 }
 285         }
 286         return NULL;
 287 }
 288
 289 /**
 290  * Gets the line breaking class of a character from a line breaking
 291  * properties array.
 292  *
 293  * @param ch    character to check
 294  * @param lbp   pointer to the line breaking properties array
 295  * @return              the line breaking class if found; \c LBP_XX otherwise
 296  */
 297 static enum LineBreakClass get_char_lb_class(
 298                 utf32_t ch,
 299                 struct LineBreakProperties *lbp)
 300 {
 301         while (lbp->prop != LBP_Undefined && ch >= lbp->start)
 302         {
 303                 if (ch <= lbp->end)
 304                         return lbp->prop;
 305                 ++lbp;
 306         }
 307         return LBP_XX;
 308 }
 309
 310 /**
 311  * Gets the line breaking class of a character from the default line
 312  * breaking properties array.
 313  *
 314  * @param ch    character to check
 315  * @return              the line breaking class if found; \c LBP_XX otherwise
 316  */
 317 static enum LineBreakClass get_char_lb_class_default(
 318                 utf32_t ch)
 319 {
 320         size_t i = 0;
 321         while (ch > lb_prop_index[i].end)
 322                 ++i;
 323         assert(i < LINEBREAK_INDEX_SIZE);
 324         return get_char_lb_class(ch, lb_prop_index[i].lbp);
 325 }
 326
 327 /**
 328  * Gets the line breaking class of a character for a specific
 329  * language.  This function will check the language-specific data first,
 330  * and then the default data if there is no language-specific property
 331  * available for the character.
 332  *
 333  * @param ch            character to check
 334  * @param lbpLang       pointer to the language-specific line breaking
 335  *                                      properties array
 336  * @return                      the line breaking class if found; \c LBP_XX
 337  *                                      otherwise
 338  */
 339 static enum LineBreakClass get_char_lb_class_lang(
 340                 utf32_t ch,
 341                 struct LineBreakProperties *lbpLang)
 342 {
 343         enum LineBreakClass lbcResult;
 344
 345         /* Find the language-specific line breaking class for a character */
 346         if (lbpLang)
 347         {
 348                 lbcResult = get_char_lb_class(ch, lbpLang);
 349                 if (lbcResult != LBP_XX)
 350                         return lbcResult;
 351         }
 352
 353         /* Find the generic language-specific line breaking class, if no
 354          * language context is provided, or language-specific data are not
 355          * available for the specific character in the specified language */
 356         return get_char_lb_class_default(ch);
 357 }
 358
 359 /**
 360  * Resolves the line breaking class for certain ambiguous or complicated
 361  * characters.  They are treated in a simplistic way in this
 362  * implementation.
 363  *
 364  * @param lbc   line breaking class to resolve
 365  * @param lang  language of the text
 366  * @return              the resolved line breaking class
 367  */
 368 static enum LineBreakClass resolve_lb_class(
 369                 enum LineBreakClass lbc,
 370                 const char *lang)
 371 {
 372         switch (lbc)
 373         {
 374         case LBP_AI:
 375                 if (lang != NULL &&
 376                                 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
 377                                  strncmp(lang, "ja", 2) == 0 || /* Japanese */
 378                                  strncmp(lang, "ko", 2) == 0))  /* Korean */
 379                 {
 380                         return LBP_ID;
 381                 }
 382                 /* Fall through */
 383         case LBP_SA:
 384         case LBP_SG:
 385         case LBP_XX:
 386                 return LBP_AL;
 387         default:
 388                 return lbc;
 389         }
 390 }
 391
 392 /**
 393  * Gets the next Unicode character in a UTF-8 sequence.  The index will
 394  * be advanced to the next complete character, unless the end of string
 395  * is reached in the middle of a UTF-8 sequence.
 396  *
 397  * @param[in]     s             input UTF-8 string
 398  * @param[in]     len   length of the string in bytes
 399  * @param[in,out] ip    pointer to the index
 400  * @return                              the Unicode character beginning at the index; or
 401  *                                              #EOS if end of input is encountered
 402  */
 403 utf32_t lb_get_next_char_utf8(
 404                 const utf8_t *s,
 405                 size_t len,
 406                 size_t *ip)
 407 {
 408         utf8_t ch;
 409         utf32_t res;
 410
 411         assert(*ip <= len);
 412         if (*ip == len)
 413                 return EOS;
 414         ch = s[*ip];
 415
 416         if (ch < 0xC2 || ch > 0xF4)
 417         {       /* One-byte sequence, tail (should not occur), or invalid */
 418                 *ip += 1;
 419                 return ch;
 420         }
 421         else if (ch < 0xE0)
 422         {       /* Two-byte sequence */
 423                 if (*ip + 2 > len)
 424                         return EOS;
 425                 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
 426                 *ip += 2;
 427                 return res;
 428         }
 429         else if (ch < 0xF0)
 430         {       /* Three-byte sequence */
 431                 if (*ip + 3 > len)
 432                         return EOS;
 433                 res = ((ch & 0x0F) << 12) +
 434                           ((s[*ip + 1] & 0x3F) << 6) +
 435                           ((s[*ip + 2] & 0x3F));
 436                 *ip += 3;
 437                 return res;
 438         }
 439         else
 440         {       /* Four-byte sequence */
 441                 if (*ip + 4 > len)
 442                         return EOS;
 443                 res = ((ch & 0x07) << 18) +
 444                           ((s[*ip + 1] & 0x3F) << 12) +
 445                           ((s[*ip + 2] & 0x3F) << 6) +
 446                           ((s[*ip + 3] & 0x3F));
 447                 *ip += 4;
 448                 return res;
 449         }
 450 }
 451
 452 /**
 453  * Gets the next Unicode character in a UTF-16 sequence.  The index will
 454  * be advanced to the next complete character, unless the end of string
 455  * is reached in the middle of a UTF-16 surrogate pair.
 456  *
 457  * @param[in]     s             input UTF-16 string
 458  * @param[in]     len   length of the string in words
 459  * @param[in,out] ip    pointer to the index
 460  * @return                              the Unicode character beginning at the index; or
 461  *                                              #EOS if end of input is encountered
 462  */
 463 utf32_t lb_get_next_char_utf16(
 464                 const utf16_t *s,
 465                 size_t len,
 466                 size_t *ip)
 467 {
 468         utf16_t ch;
 469
 470         assert(*ip <= len);
 471         if (*ip == len)
 472                 return EOS;
 473         ch = s[(*ip)++];
 474
 475         if (ch < 0xD800 || ch > 0xDBFF)
 476         {       /* If the character is not a high surrogate */
 477                 return ch;
 478         }
 479         if (*ip == len)
 480         {       /* If the input ends here (an error) */
 481                 --(*ip);
 482                 return EOS;
 483         }
 484         if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
 485         {       /* If the next character is not the low surrogate (an error) */
 486                 return ch;
 487         }
 488         /* Return the constructed character and advance the index again */
 489         return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
 490 }
 491
 492 /**
 493  * Gets the next Unicode character in a UTF-32 sequence.  The index will
 494  * be advanced to the next character.
 495  *
 496  * @param[in]     s             input UTF-32 string
 497  * @param[in]     len   length of the string in dwords
 498  * @param[in,out] ip    pointer to the index
 499  * @return                              the Unicode character beginning at the index; or
 500  *                                              #EOS if end of input is encountered
 501  */
 502 utf32_t lb_get_next_char_utf32(
 503                 const utf32_t *s,
 504                 size_t len,
 505                 size_t *ip)
 506 {
 507         assert(*ip <= len);
 508         if (*ip == len)
 509                 return EOS;
 510         return s[(*ip)++];
 511 }
 512
 513 /**
 514  * Sets the line breaking information for a generic input string.
 515  *
 516  * @param[in]  s                        input string
 517  * @param[in]  len                      length of the input
 518  * @param[in]  lang                     language of the input
 519  * @param[out] brks                     pointer to the output breaking data,
 520  *                                                      containing #LINEBREAK_MUSTBREAK,
 521  *                                                      #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
 522  *                                                      or #LINEBREAK_INSIDEACHAR
 523  * @param[in] get_next_char     function to get the next UTF-32 character
 524  */
 525 void set_linebreaks(
 526                 const void *s,
 527                 size_t len,
 528                 const char *lang,
 529                 char *brks,
 530                 get_next_char_t get_next_char)
 531 {
 532         utf32_t ch;
 533         enum LineBreakClass lbcCur;
 534         enum LineBreakClass lbcNew;
 535         enum LineBreakClass lbcLast;
 536         struct LineBreakProperties *lbpLang;
 537         size_t posCur = 0;
 538         size_t posLast = 0;
 539         // TIZEN ONLY : (2013.08.19) for special processing at Zero-width space character
 540         int zw_flag = 0;
 541         //
 542
 543         --posLast;      /* To be ++'d later */
 544         ch = get_next_char(s, len, &posCur);
 545         if (ch == EOS)
 546                 return;
 547         lbpLang = get_lb_prop_lang(lang);
 548         lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
 549         lbcNew = LBP_Undefined;
 550
 551 nextline:
 552
 553         /* Special treatment for the first character */
 554         switch (lbcCur)
 555         {
 556         case LBP_LF:
 557         case LBP_NL:
 558                 lbcCur = LBP_BK;
 559                 break;
 560         case LBP_CB:
 561                 lbcCur = LBP_BA;
 562                 break;
 563         case LBP_SP:
 564                 lbcCur = LBP_WJ;
 565                 break;
 566         default:
 567                 break;
 568         }
 569
 570         /* Process a line till an explicit break or end of string */
 571         for (;;)
 572         {
 573                 for (++posLast; posLast < posCur - 1; ++posLast)
 574                 {
 575                         brks[posLast] = LINEBREAK_INSIDEACHAR;
 576                 }
 577                 assert(posLast == posCur - 1);
 578                 lbcLast = lbcNew;
 579                 ch = get_next_char(s, len, &posCur);
 580                 if (ch == EOS)
 581                         break;
 582                 lbcNew = get_char_lb_class_lang(ch, lbpLang);
 583                 if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
 584                 {
 585                         brks[posLast] = LINEBREAK_MUSTBREAK;
 586                         lbcCur = resolve_lb_class(lbcNew, lang);
 587                         goto nextline;
 588                 }
 589
 590                 // TIZEN ONLY : (2013.08.19) for special processing at Zero-width space character
 591                 /*
 592                 switch (lbcNew)
 593                 {
 594                 case LBP_SP:
 595                         brks[posLast] = LINEBREAK_NOBREAK;
 596                         continue;
 597                 case LBP_BK:
 598                 case LBP_LF:
 599                 case LBP_NL:
 600                         brks[posLast] = LINEBREAK_NOBREAK;
 601                         lbcCur = LBP_BK;
 602                         continue;
 603                 case LBP_CR:
 604                         brks[posLast] = LINEBREAK_NOBREAK;
 605                         lbcCur = LBP_CR;
 606                         continue;
 607                 case LBP_CB:
 608                         brks[posLast] = LINEBREAK_ALLOWBREAK;
 609                         lbcCur = LBP_BA;
 610                         continue;
 611                 default:
 612                         break;
 613                 }
 614
 615                 lbcNew = resolve_lb_class(lbcNew, lang);
 616
 617                 assert(lbcCur <= LBP_JT);
 618                 assert(lbcNew <= LBP_JT);
 619                 switch (baTable[lbcCur - 1][lbcNew - 1])
 620                 {
 621                 case DIR_BRK:
 622                         brks[posLast] = LINEBREAK_ALLOWBREAK;
 623                         break;
 624                 case CMI_BRK:
 625                 case IND_BRK:
 626                         if (lbcLast == LBP_SP)
 627                         {
 628                                 brks[posLast] = LINEBREAK_ALLOWBREAK;
 629                         }
 630                         else
 631                         {
 632                                 brks[posLast] = LINEBREAK_NOBREAK;
 633                         }
 634                         break;
 635                 case CMP_BRK:
 636                         brks[posLast] = LINEBREAK_NOBREAK;
 637                         if (lbcLast != LBP_SP)
 638                                 continue;
 639                         break;
 640                 case PRH_BRK:
 641                         brks[posLast] = LINEBREAK_NOBREAK;
 642                         break;
 643                 }
 644
 645                 lbcCur = lbcNew;
 646                 */
 647
 648                 // TIZEN ONLY - START
 649                 if (lbcCur == LBP_ZW && !zw_flag)
 650                 {
 651                         zw_flag = 1;
 652                         posLast = -1;
 653                         posCur = 0;
 654                         ch = get_next_char(s, len, &posCur);
 655                         lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
 656                         lbcNew = LBP_Undefined;
 657                         goto nextline;
 658                 }
 659                 else if (zw_flag)
 660                 {
 661                         if (lbcCur == LBP_ZW)
 662                                 brks[posLast] = LINEBREAK_ALLOWBREAK;
 663                         else
 664                                 brks[posLast] = LINEBREAK_NOBREAK;
 665                         lbcCur = lbcNew;
 666                 }
 667                 else
 668                 {
 669                         // TIZEN ONLY(20131106): For Hangul word wrap
 670                         switch (lbcCur)
 671                         {
 672                                 case LBP_H2:                    /**< Hangul LV */
 673                                 case LBP_H3:                    /**< Hangul LVT */
 674                                 case LBP_JL:                    /**< Hangul L Jamo */
 675                                 case LBP_JV:                    /**< Hangul V Jamo */
 676                                 case LBP_JT:                    /**< Hangul T Jamo */
 677                                         lbcCur = LBP_AL;
 678                                         break;
 679                                 default:
 680                                         break;
 681                         }
 682
 683                         switch (lbcNew)
 684                         {
 685                                 case LBP_H2:                    /**< Hangul LV */
 686                                 case LBP_H3:                    /**< Hangul LVT */
 687                                 case LBP_JL:                    /**< Hangul L Jamo */
 688                                 case LBP_JV:                    /**< Hangul V Jamo */
 689                                 case LBP_JT:                    /**< Hangul T Jamo */
 690                                         lbcNew = LBP_AL;
 691                                         break;
 692                                 default:
 693                                         break;
 694                         }
 695                         //
 696
 697                         switch (lbcNew)
 698                         {
 699                                 case LBP_SP:
 700                                         brks[posLast] = LINEBREAK_NOBREAK;
 701                                         continue;
 702                                 case LBP_BK:
 703                                 case LBP_LF:
 704                                 case LBP_NL:
 705                                         brks[posLast] = LINEBREAK_NOBREAK;
 706                                         lbcCur = LBP_BK;
 707                                         continue;
 708                                 case LBP_CR:
 709                                         brks[posLast] = LINEBREAK_NOBREAK;
 710                                         lbcCur = LBP_CR;
 711                                         continue;
 712                                 case LBP_CB:
 713                                         brks[posLast] = LINEBREAK_ALLOWBREAK;
 714                                         lbcCur = LBP_BA;
 715                                         continue;
 716                                 default:
 717                                         break;
 718                         }
 719
 720                         lbcNew = resolve_lb_class(lbcNew, lang);
 721
 722                         assert(lbcCur <= LBP_JT);
 723                         assert(lbcNew <= LBP_JT);
 724                         switch (baTable[lbcCur - 1][lbcNew - 1])
 725                         {
 726                                 case DIR_BRK:
 727                                         brks[posLast] = LINEBREAK_ALLOWBREAK;
 728                                         break;
 729                                 case CMI_BRK:
 730                                 case IND_BRK:
 731                                         if (lbcLast == LBP_SP)
 732                                         {
 733                                                 brks[posLast] = LINEBREAK_ALLOWBREAK;
 734                                         }
 735                                         else
 736                                         {
 737                                                 brks[posLast] = LINEBREAK_NOBREAK;
 738                                         }
 739                                         break;
 740                                 case CMP_BRK:
 741                                         brks[posLast] = LINEBREAK_NOBREAK;
 742                                         if (lbcLast != LBP_SP)
 743                                                 continue;
 744                                         break;
 745                                 case PRH_BRK:
 746                                         brks[posLast] = LINEBREAK_NOBREAK;
 747                                         break;
 748                         }
 749                         lbcCur = lbcNew;
 750                 }
 751                 // TIZEN ONLY - END
 752         }
 753
 754         assert(posLast == posCur - 1 && posCur <= len);
 755         /* Break after the last character */
 756         brks[posLast] = LINEBREAK_MUSTBREAK;
 757         /* When the input contains incomplete sequences */
 758         while (posCur < len)
 759         {
 760                 brks[posCur++] = LINEBREAK_INSIDEACHAR;
 761         }
 762 }
 763
 764 /**
 765  * Sets the line breaking information for a UTF-8 input string.
 766  *
 767  * @param[in]  s        input UTF-8 string
 768  * @param[in]  len      length of the input
 769  * @param[in]  lang     language of the input
 770  * @param[out] brks     pointer to the output breaking data, containing
 771  *                                      #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 772  *                                      #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 773  */
 774 void set_linebreaks_utf8(
 775                 const utf8_t *s,
 776                 size_t len,
 777                 const char *lang,
 778                 char *brks)
 779 {
 780         set_linebreaks(s, len, lang, brks,
 781                                    (get_next_char_t)lb_get_next_char_utf8);
 782 }
 783
 784 /**
 785  * Sets the line breaking information for a UTF-16 input string.
 786  *
 787  * @param[in]  s        input UTF-16 string
 788  * @param[in]  len      length of the input
 789  * @param[in]  lang     language of the input
 790  * @param[out] brks     pointer to the output breaking data, containing
 791  *                                      #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 792  *                                      #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 793  */
 794 void set_linebreaks_utf16(
 795                 const utf16_t *s,
 796                 size_t len,
 797                 const char *lang,
 798                 char *brks)
 799 {
 800         set_linebreaks(s, len, lang, brks,
 801                                    (get_next_char_t)lb_get_next_char_utf16);
 802 }
 803
 804 /**
 805  * Sets the line breaking information for a UTF-32 input string.
 806  *
 807  * @param[in]  s        input UTF-32 string
 808  * @param[in]  len      length of the input
 809  * @param[in]  lang     language of the input
 810  * @param[out] brks     pointer to the output breaking data, containing
 811  *                                      #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 812  *                                      #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 813  */
 814 void set_linebreaks_utf32(
 815                 const utf32_t *s,
 816                 size_t len,
 817                 const char *lang,
 818                 char *brks)
 819 {
 820         set_linebreaks(s, len, lang, brks,
 821                                    (get_next_char_t)lb_get_next_char_utf32);
 822 }
 823
 824 /**
 825  * Tells whether a line break can occur between two Unicode characters.
 826  * This is a wrapper function to expose a simple interface.  Generally
 827  * speaking, it is better to use #set_linebreaks_utf32 instead, since
 828  * complicated cases involving combining marks, spaces, etc. cannot be
 829  * correctly processed.
 830  *
 831  * @param char1 the first Unicode character
 832  * @param char2 the second Unicode character
 833  * @param lang  language of the input
 834  * @return      one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 835  *                              #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 836  */
 837 int is_line_breakable(
 838                 utf32_t char1,
 839                 utf32_t char2,
 840                 const char* lang)
 841 {
 842         utf32_t s[2];
 843         char brks[2];
 844         s[0] = char1;
 845         s[1] = char2;
 846         set_linebreaks_utf32(s, 2, lang, brks);
 847         return brks[0];
 848 }