text/dali/internal/libunibreak/linebreak.c

   1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Line breaking in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
   8  * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
   9  *
  10  * This software is provided 'as-is', without any express or implied
  11  * warranty.  In no event will the author be held liable for any damages
  12  * arising from the use of this software.
  13  *
  14  * Permission is granted to anyone to use this software for any purpose,
  15  * including commercial applications, and to alter it and redistribute
  16  * it freely, subject to the following restrictions:
  17  *
  18  * 1. The origin of this software must not be misrepresented; you must
  19  *    not claim that you wrote the original software.  If you use this
  20  *    software in a product, an acknowledgement in the product
  21  *    documentation would be appreciated but is not required.
  22  * 2. Altered source versions must be plainly marked as such, and must
  23  *    not be misrepresented as being the original software.
  24  * 3. This notice may not be removed or altered from any source
  25  *    distribution.
  26  *
  27  * The main reference is Unicode Standard Annex 14 (UAX #14):
  28  *      <URL:http://www.unicode.org/reports/tr14/>
  29  *
  30  * When this library was designed, this annex was at Revision 19, for
  31  * Unicode 5.0.0:
  32  *      <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
  33  *
  34  * This library has been updated according to Revision 30, for
  35  * Unicode 6.2.0:
  36  *      <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
  37  *
  38  * The Unicode Terms of Use are available at
  39  *      <URL:http://www.unicode.org/copyright.html>
  40  */
  41
  42 /**
  43  * @file    linebreak.c
  44  *
  45  * Implementation of the line breaking algorithm as described in Unicode
  46  * Standard Annex 14.
  47  *
  48  * @version 2.5, 2013/11/14
  49  * @author  Wu Yongwei
  50  * @author  Petr Filipsky
  51  */
  52
  53 #include <assert.h>
  54 #include <stddef.h>
  55 #include <string.h>
  56 #include "linebreak.h"
  57 #include "linebreakdef.h"
  58
  59 /**
  60  * Special value used internally to indicate an undefined break result.
  61  */
  62 #define LINEBREAK_UNDEFINED -1
  63
  64 /**
  65  * Size of the second-level index to the line breaking properties.
  66  */
  67 #define LINEBREAK_INDEX_SIZE 40
  68
/**
 * Version number of the library, exported as a variable and initialized
 * from the LINEBREAK_VERSION macro.
 */
const int linebreak_version = LINEBREAK_VERSION;
  73
/**
 * Enumeration of break actions.  They are used in the break action
 * pair table below, and are translated into break results by
 * get_lb_result_lookup().
 */
enum BreakAction
{
    DIR_BRK,        /**< Direct break opportunity: break is allowed */
    IND_BRK,        /**< Indirect break opportunity: break is allowed
                         only if the last character was a space */
    CMI_BRK,        /**< Indirect break opportunity for combining marks;
                         handled like IND_BRK by the lookup */
    CMP_BRK,        /**< Prohibited break for combining marks; the
                         current class is kept unless the last
                         character was a space */
    PRH_BRK         /**< Prohibited break */
};
  86
  87 /**
  88  * Break action pair table.  This is a direct mapping of Table 2 of
  89  * Unicode Standard Annex 14, Revision 30.
  90  */
  91 static enum BreakAction baTable[LBP_RI][LBP_RI] = {
  92     {   /* OP */
  93         PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  94         PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  95         PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  96         CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  97         PRH_BRK },
  98     {   /* CL */
  99         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
 100         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 101         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 102         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 103         DIR_BRK },
 104     {   /* CP */
 105         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
 106         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 107         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 108         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 109         DIR_BRK },
 110     {   /* QU */
 111         PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 112         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 113         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 114         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 115         IND_BRK },
 116     {   /* GL */
 117         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 118         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 119         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 120         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 121         IND_BRK },
 122     {   /* NS */
 123         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 124         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 125         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 126         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 127         DIR_BRK },
 128     {   /* EX */
 129         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 130         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 131         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 132         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 133         DIR_BRK },
 134     {   /* SY */
 135         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 136         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
 137         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 138         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 139         DIR_BRK },
 140     {   /* IS */
 141         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 142         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 143         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 144         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 145         DIR_BRK },
 146     {   /* PR */
 147         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 148         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 149         IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 150         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 151         DIR_BRK },
 152     {   /* PO */
 153         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 154         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 155         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 156         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 157         DIR_BRK },
 158     {   /* NU */
 159         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 160         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 161         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 162         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 163         DIR_BRK },
 164     {   /* AL */
 165         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 166         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 167         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 168         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 169         DIR_BRK },
 170     {   /* HL */
 171         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 172         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 173         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 174         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 175         DIR_BRK },
 176     {   /* ID */
 177         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 178         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 179         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 180         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 181         DIR_BRK },
 182     {   /* IN */
 183         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 184         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 185         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 186         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 187         DIR_BRK },
 188     {   /* HY */
 189         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
 190         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
 191         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 192         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 193         DIR_BRK },
 194     {   /* BA */
 195         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
 196         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 197         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 198         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 199         DIR_BRK },
 200     {   /* BB */
 201         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 202         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 203         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 204         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 205         IND_BRK },
 206     {   /* B2 */
 207         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 208         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 209         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
 210         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 211         DIR_BRK },
 212     {   /* ZW */
 213         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 214         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 215         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 216         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 217         DIR_BRK },
 218     {   /* CM */
 219         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 220         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 221         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 222         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 223         DIR_BRK },
 224     {   /* WJ */
 225         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 226         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 227         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 228         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 229         IND_BRK },
 230     {   /* H2 */
 231         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 232         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 233         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 234         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
 235         DIR_BRK },
 236     {   /* H3 */
 237         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 238         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 239         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 240         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
 241         DIR_BRK },
 242     {   /* JL */
 243         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 244         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 245         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 246         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
 247         DIR_BRK },
 248     {   /* JV */
 249         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 250         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 251         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 252         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
 253         DIR_BRK },
 254     {   /* JT */
 255         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 256         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 257         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 258         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
 259         DIR_BRK },
 260     {   /* RI */
 261         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 262         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 263         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 264         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 265         IND_BRK },
 266 };
 267
/**
 * Struct for the second-level index to the line breaking properties.
 * Each entry names a position in a line breaking properties array and
 * the last code point that should be looked up starting from there.
 */
struct LineBreakPropertiesIndex
{
    utf32_t end;                    /**< Last code point served by this entry */
    struct LineBreakProperties *lbp;/**< Where to start scanning the properties */
};
 276
/**
 * Second-level index to the line breaking properties.
 *
 * Only the first entry is initialized explicitly: it covers every code
 * point (end is 0xFFFFFFFF) and points at the start of the default
 * properties array, so lookups still work, by scanning from the
 * beginning, if init_linebreak() has not been called.
 */
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
{
    { 0xFFFFFFFF, lb_prop_default }
};
 284
 285 /**
 286  * Initializes the second-level index to the line breaking properties.
 287  * If it is not called, the performance of #get_char_lb_class_lang (and
 288  * thus the main functionality) can be pretty bad, especially for big
 289  * code points like those of Chinese.
 290  */
 291 void init_linebreak(void)
 292 {
 293     size_t i;
 294     size_t iPropDefault;
 295     size_t len;
 296     size_t step;
 297
 298     len = 0;
 299     while (lb_prop_default[len].prop != LBP_Undefined)
 300         ++len;
 301     step = len / LINEBREAK_INDEX_SIZE;
 302     iPropDefault = 0;
 303     for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
 304     {
 305         lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
 306         iPropDefault += step;
 307         lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
 308     }
 309     lb_prop_index[--i].end = 0xFFFFFFFF;
 310 }
 311
 312 /**
 313  * Gets the language-specific line breaking properties.
 314  *
 315  * @param lang  language of the text
 316  * @return      pointer to the language-specific line breaking
 317  *              properties array if found; \c NULL otherwise
 318  */
 319 static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
 320 {
 321     struct LineBreakPropertiesLang *lbplIter;
 322     if (lang != NULL)
 323     {
 324         for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
 325         {
 326             if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
 327             {
 328                 return lbplIter->lbp;
 329             }
 330         }
 331     }
 332     return NULL;
 333 }
 334
 335 /**
 336  * Gets the line breaking class of a character from a line breaking
 337  * properties array.
 338  *
 339  * @param ch   character to check
 340  * @param lbp  pointer to the line breaking properties array
 341  * @return     the line breaking class if found; \c LBP_XX otherwise
 342  */
 343 static enum LineBreakClass get_char_lb_class(
 344         utf32_t ch,
 345         struct LineBreakProperties *lbp)
 346 {
 347     while (lbp->prop != LBP_Undefined && ch >= lbp->start)
 348     {
 349         if (ch <= lbp->end)
 350             return lbp->prop;
 351         ++lbp;
 352     }
 353     return LBP_XX;
 354 }
 355
 356 /**
 357  * Gets the line breaking class of a character from the default line
 358  * breaking properties array.
 359  *
 360  * @param ch  character to check
 361  * @return    the line breaking class if found; \c LBP_XX otherwise
 362  */
 363 static enum LineBreakClass get_char_lb_class_default(
 364         utf32_t ch)
 365 {
 366     size_t i = 0;
 367     while (ch > lb_prop_index[i].end)
 368         ++i;
 369     assert(i < LINEBREAK_INDEX_SIZE);
 370     return get_char_lb_class(ch, lb_prop_index[i].lbp);
 371 }
 372
 373 /**
 374  * Gets the line breaking class of a character for a specific
 375  * language.  This function will check the language-specific data first,
 376  * and then the default data if there is no language-specific property
 377  * available for the character.
 378  *
 379  * @param ch       character to check
 380  * @param lbpLang  pointer to the language-specific line breaking
 381  *                 properties array
 382  * @return         the line breaking class if found; \c LBP_XX
 383  *                 otherwise
 384  */
 385 static enum LineBreakClass get_char_lb_class_lang(
 386         utf32_t ch,
 387         struct LineBreakProperties *lbpLang)
 388 {
 389     enum LineBreakClass lbcResult;
 390
 391     /* Find the language-specific line breaking class for a character */
 392     if (lbpLang)
 393     {
 394         lbcResult = get_char_lb_class(ch, lbpLang);
 395         if (lbcResult != LBP_XX)
 396             return lbcResult;
 397     }
 398
 399     /* Find the generic language-specific line breaking class, if no
 400      * language context is provided, or language-specific data are not
 401      * available for the specific character in the specified language */
 402     return get_char_lb_class_default(ch);
 403 }
 404
 405 /**
 406  * Resolves the line breaking class for certain ambiguous or complicated
 407  * characters.  They are treated in a simplistic way in this
 408  * implementation.
 409  *
 410  * @param lbc   line breaking class to resolve
 411  * @param lang  language of the text
 412  * @return      the resolved line breaking class
 413  */
 414 static enum LineBreakClass resolve_lb_class(
 415         enum LineBreakClass lbc,
 416         const char *lang)
 417 {
 418     switch (lbc)
 419     {
 420     case LBP_AI:
 421         if (lang != NULL &&
 422                 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
 423                  strncmp(lang, "ja", 2) == 0 || /* Japanese */
 424                  strncmp(lang, "ko", 2) == 0))  /* Korean */
 425         {
 426             return LBP_ID;
 427         }
 428         else
 429         {
 430             return LBP_AL;
 431         }
 432     case LBP_CJ:
 433         /* Simplified for `normal' line breaking.  See
 434          * <url:http://www.unicode.org/reports/tr14/tr14-30.html#CJ>
 435          * for details. */
 436         return LBP_ID;
 437     case LBP_SA:
 438     case LBP_SG:
 439     case LBP_XX:
 440         return LBP_AL;
 441     default:
 442         return lbc;
 443     }
 444 }
 445
 446 /**
 447  * Treats specially for the first character in a line.
 448  *
 449  * @param[in,out] lbpCtx  pointer to the line breaking context
 450  * @pre                   \a lbpCtx->lbcCur has a valid line break class
 451  * @post                  \a lbpCtx->lbcCur has the updated line break class
 452  */
 453 static void treat_first_char(
 454         struct LineBreakContext* lbpCtx)
 455 {
 456     switch (lbpCtx->lbcCur)
 457     {
 458     case LBP_LF:
 459     case LBP_NL:
 460         lbpCtx->lbcCur = LBP_BK;        /* Rule LB5 */
 461         break;
 462     case LBP_CB:
 463         lbpCtx->lbcCur = LBP_BA;        /* Rule LB20 */
 464         break;
 465     case LBP_SP:
 466         lbpCtx->lbcCur = LBP_WJ;        /* Leading space treated as WJ */
 467         break;
 468     default:
 469         break;
 470     }
 471 }
 472
 473 /**
 474  * Tries telling the line break opportunity by simple rules.
 475  *
 476  * @param[in,out] lbpCtx  pointer to the line breaking context
 477  * @pre                   \a lbpCtx->lbcCur has the current line break
 478  *                        class; and \a lbpCtx->lbcNew has the line
 479  *                        break class for the next character
 480  * @post                  \a lbpCtx->lbcCur has the updated line break
 481  *                        class
 482  * @return                break result, one of #LINEBREAK_MUSTBREAK,
 483  *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
 484  *                        if identified; or #LINEBREAK_UNDEFINED if
 485  *                        table lookup is needed
 486  */
 487 static int get_lb_result_simple(
 488         struct LineBreakContext* lbpCtx)
 489 {
 490     if (lbpCtx->lbcCur == LBP_BK
 491         || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
 492     {
 493         return LINEBREAK_MUSTBREAK;     /* Rules LB4 and LB5 */
 494     }
 495
 496     switch (lbpCtx->lbcNew)
 497     {
 498     case LBP_SP:
 499         return LINEBREAK_NOBREAK;       /* Rule LB7; no change to lbcCur */
 500     case LBP_BK:
 501     case LBP_LF:
 502     case LBP_NL:
 503         lbpCtx->lbcCur = LBP_BK;        /* Mandatory break after */
 504         return LINEBREAK_NOBREAK;       /* Rule LB6 */
 505     case LBP_CR:
 506         lbpCtx->lbcCur = LBP_CR;
 507         return LINEBREAK_NOBREAK;       /* Rule LB6 */
 508     case LBP_CB:
 509         lbpCtx->lbcCur = LBP_BA;
 510         return LINEBREAK_ALLOWBREAK;    /* Rule LB20 */
 511     default:
 512         return LINEBREAK_UNDEFINED;     /* Table lookup is needed */
 513     }
 514 }
 515
 516 /**
 517  * Tells the line break opportunity by table lookup.
 518  *
 519  * @param[in,out] lbpCtx  pointer to the line breaking context
 520  * @pre                   \a lbpCtx->lbcCur has the current line break
 521  *                        class; \a lbpCtx->lbcLast has the line break
 522  *                        class for the last character; and \a
 523  *                        lbcCur->lbcNew has the line break class for
 524  *                        the next character
 525  * @post                  \a lbpCtx->lbcCur has the updated line break
 526  *                        class
 527  * @return                break result, one of #LINEBREAK_MUSTBREAK,
 528  *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
 529  */
 530 static int get_lb_result_lookup(
 531         struct LineBreakContext* lbpCtx)
 532 {
 533     /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
 534      * yet implemented below. */
 535     int brk = LINEBREAK_UNDEFINED;
 536     assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI));
 537     assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI));
 538
 539     enum LineBreakClass lbcCur, lbcNew;
 540
 541     switch (lbpCtx->lbcCur)
 542     {
 543     case LBP_H2:        /**< Hangul LV */
 544     case LBP_H3:        /**< Hangul LVT */
 545     case LBP_JL:        /**< Hangul L Jamo */
 546     case LBP_JV:        /**< Hangul V Jamo */
 547     case LBP_JT:        /**< Hangul T Jamo */
 548         lbcCur = LBP_AL;
 549         break;
 550     default:
 551         lbcCur = lbpCtx->lbcCur;
 552         break;
 553     }
 554
 555     switch (lbpCtx->lbcNew)
 556     {
 557     case LBP_H2:        /**< Hangul LV */
 558     case LBP_H3:        /**< Hangul LVT */
 559     case LBP_JL:        /**< Hangul L Jamo */
 560     case LBP_JV:        /**< Hangul V Jamo */
 561     case LBP_JT:        /**< Hangul T Jamo */
 562         lbcNew = LBP_AL;
 563         break;
 564     default:
 565         lbcNew = lbpCtx->lbcNew;
 566         break;
 567     }
 568
 569     switch (baTable[lbcCur - 1][lbcNew - 1])
 570     {
 571     case DIR_BRK:
 572         brk = LINEBREAK_ALLOWBREAK;
 573         break;
 574     case CMI_BRK:
 575     case IND_BRK:
 576         brk = (lbpCtx->lbcLast == LBP_SP)
 577             ? LINEBREAK_ALLOWBREAK
 578             : LINEBREAK_NOBREAK;
 579         break;
 580     case CMP_BRK:
 581         brk = LINEBREAK_NOBREAK;
 582         if (lbpCtx->lbcLast != LBP_SP)
 583             return brk;                 /* Do not update lbcCur */
 584         break;
 585     case PRH_BRK:
 586         brk = LINEBREAK_NOBREAK;
 587         break;
 588     }
 589
 590     lbpCtx->lbcCur = lbpCtx->lbcNew;
 591     return brk;
 592 }
 593
 594 /**
 595  * Initializes line breaking context for a given language.
 596  *
 597  * @param[in,out] lbpCtx  pointer to the line breaking context
 598  * @param[in]     ch      the first character to process
 599  * @param[in]     lang    language of the input
 600  * @post                  the line breaking context is initialized
 601  */
 602 void lb_init_break_context(
 603         struct LineBreakContext* lbpCtx,
 604         utf32_t ch,
 605         const char* lang)
 606 {
 607     lbpCtx->lang = lang;
 608     lbpCtx->lbpLang = get_lb_prop_lang(lang);
 609     lbpCtx->lbcLast = LBP_Undefined;
 610     lbpCtx->lbcNew = LBP_Undefined;
 611     lbpCtx->lbcCur = resolve_lb_class(
 612                         get_char_lb_class_lang(ch, lbpCtx->lbpLang),
 613                         lbpCtx->lang);
 614     treat_first_char(lbpCtx);
 615 }
 616
 617 /**
 618  * Updates LineBreakingContext for the next code point and returns
 619  * the detected break.
 620  *
 621  * @param[in,out] lbpCtx  pointer to the line breaking context
 622  * @param[in]     ch      Unicode code point
 623  * @return                break result, one of #LINEBREAK_MUSTBREAK,
 624  *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
 625  * @post                  the line breaking context is updated
 626  */
 627 int lb_process_next_char(
 628         struct LineBreakContext* lbpCtx,
 629         utf32_t ch )
 630 {
 631     int brk;
 632
 633     lbpCtx->lbcLast = lbpCtx->lbcNew;
 634     lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
 635     brk = get_lb_result_simple(lbpCtx);
 636     switch (brk)
 637     {
 638     case LINEBREAK_MUSTBREAK:
 639         lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
 640         treat_first_char(lbpCtx);
 641         break;
 642     case LINEBREAK_UNDEFINED:
 643         lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
 644         brk = get_lb_result_lookup(lbpCtx);
 645         break;
 646     default:
 647         break;
 648     }
 649     return brk;
 650 }
 651
 652 /**
 653  * Gets the next Unicode character in a UTF-8 sequence.  The index will
 654  * be advanced to the next complete character, unless the end of string
 655  * is reached in the middle of a UTF-8 sequence.
 656  *
 657  * @param[in]     s    input UTF-8 string
 658  * @param[in]     len  length of the string in bytes
 659  * @param[in,out] ip   pointer to the index
 660  * @return             the Unicode character beginning at the index; or
 661  *                     #EOS if end of input is encountered
 662  */
 663 utf32_t lb_get_next_char_utf8(
 664         const utf8_t *s,
 665         size_t len,
 666         size_t *ip)
 667 {
 668     utf8_t ch;
 669     utf32_t res;
 670
 671     assert(*ip <= len);
 672     if (*ip == len)
 673         return EOS;
 674     ch = s[*ip];
 675
 676     if (ch < 0xC2 || ch > 0xF4)
 677     {   /* One-byte sequence, tail (should not occur), or invalid */
 678         *ip += 1;
 679         return ch;
 680     }
 681     else if (ch < 0xE0)
 682     {   /* Two-byte sequence */
 683         if (*ip + 2 > len)
 684             return EOS;
 685         res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
 686         *ip += 2;
 687         return res;
 688     }
 689     else if (ch < 0xF0)
 690     {   /* Three-byte sequence */
 691         if (*ip + 3 > len)
 692             return EOS;
 693         res = ((ch & 0x0F) << 12) +
 694               ((s[*ip + 1] & 0x3F) << 6) +
 695               ((s[*ip + 2] & 0x3F));
 696         *ip += 3;
 697         return res;
 698     }
 699     else
 700     {   /* Four-byte sequence */
 701         if (*ip + 4 > len)
 702             return EOS;
 703         res = ((ch & 0x07) << 18) +
 704               ((s[*ip + 1] & 0x3F) << 12) +
 705               ((s[*ip + 2] & 0x3F) << 6) +
 706               ((s[*ip + 3] & 0x3F));
 707         *ip += 4;
 708         return res;
 709     }
 710 }
 711
 712 /**
 713  * Gets the next Unicode character in a UTF-16 sequence.  The index will
 714  * be advanced to the next complete character, unless the end of string
 715  * is reached in the middle of a UTF-16 surrogate pair.
 716  *
 717  * @param[in]     s    input UTF-16 string
 718  * @param[in]     len  length of the string in words
 719  * @param[in,out] ip   pointer to the index
 720  * @return             the Unicode character beginning at the index; or
 721  *                     #EOS if end of input is encountered
 722  */
 723 utf32_t lb_get_next_char_utf16(
 724         const utf16_t *s,
 725         size_t len,
 726         size_t *ip)
 727 {
 728     utf16_t ch;
 729
 730     assert(*ip <= len);
 731     if (*ip == len)
 732         return EOS;
 733     ch = s[(*ip)++];
 734
 735     if (ch < 0xD800 || ch > 0xDBFF)
 736     {   /* If the character is not a high surrogate */
 737         return ch;
 738     }
 739     if (*ip == len)
 740     {   /* If the input ends here (an error) */
 741         --(*ip);
 742         return EOS;
 743     }
 744     if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
 745     {   /* If the next character is not the low surrogate (an error) */
 746         return ch;
 747     }
 748     /* Return the constructed character and advance the index again */
 749     return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
 750 }
 751
 752 /**
 753  * Gets the next Unicode character in a UTF-32 sequence.  The index will
 754  * be advanced to the next character.
 755  *
 756  * @param[in]     s    input UTF-32 string
 757  * @param[in]     len  length of the string in dwords
 758  * @param[in,out] ip   pointer to the index
 759  * @return             the Unicode character beginning at the index; or
 760  *                     #EOS if end of input is encountered
 761  */
 762 utf32_t lb_get_next_char_utf32(
 763         const utf32_t *s,
 764         size_t len,
 765         size_t *ip)
 766 {
 767     assert(*ip <= len);
 768     if (*ip == len)
 769         return EOS;
 770     return s[(*ip)++];
 771 }
 772
/**
 * Sets the line breaking information for a generic input string.
 *
 * The first character seeds the line breaking context.  Every further
 * character is passed to #lb_process_next_char, and the result is
 * stored at the index of the last code unit of the character before
 * it.  The other code units of a multi-unit character are marked
 * #LINEBREAK_INSIDEACHAR, and the last complete character is always
 * given #LINEBREAK_MUSTBREAK.  Code units of an incomplete trailing
 * sequence are marked #LINEBREAK_INSIDEACHAR.
 *
 * If the very first call to \a get_next_char returns #EOS (for
 * example when \a len is zero), the function returns without writing
 * anything to \a brks.
 *
 * @param[in]  s             input string
 * @param[in]  len           length of the input, in code units
 * @param[in]  lang          language of the input
 * @param[out] brks          pointer to the output breaking data,
 *                           containing #LINEBREAK_MUSTBREAK,
 *                           #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
 *                           or #LINEBREAK_INSIDEACHAR; entries at
 *                           indices below \a len are written
 * @param[in] get_next_char  function to get the next UTF-32 character
 */
void set_linebreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    utf32_t ch;
    struct LineBreakContext lbCtx;
    size_t posCur = 0;      /* Index just past the last character read */
    size_t posLast = 0;     /* Index of the next brks entry to fill */

    /* Deliberate unsigned wrap-around: the ++posLast at the start of
     * the first loop iteration brings it back to 0 */
    --posLast;  /* To be ++'d later */
    ch = get_next_char(s, len, &posCur);
    if (ch == EOS)
        return;
    lb_init_break_context(&lbCtx, ch, lang);

    /* Process a line till an explicit break or end of string */
    for (;;)
    {
        /* Mark all but the last code unit of the current character */
        for (++posLast; posLast < posCur - 1; ++posLast)
        {
            brks[posLast] = LINEBREAK_INSIDEACHAR;
        }
        assert(posLast == posCur - 1);
        ch = get_next_char(s, len, &posCur);
        if (ch == EOS)
            break;
        /* The break result belongs to the character just passed */
        brks[posLast] = lb_process_next_char(&lbCtx, ch);
    }

    assert(posLast == posCur - 1 && posCur <= len);
    /* Break after the last character */
    brks[posLast] = LINEBREAK_MUSTBREAK;
    /* When the input contains incomplete sequences */
    while (posCur < len)
    {
        brks[posCur++] = LINEBREAK_INSIDEACHAR;
    }
}
 826
/**
 * Sets the line breaking information for a UTF-8 input string.  This
 * forwards to #set_linebreaks with #lb_get_next_char_utf8 as the
 * character reader.
 *
 * @param[in]  s     input UTF-8 string
 * @param[in]  len   length of the input in bytes
 * @param[in]  lang  language of the input
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 */
void set_linebreaks_utf8(
        const utf8_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    /* NOTE(review): the cast implies get_next_char_t differs from the
     * reader's own type (presumably in the string parameter) -- confirm
     * against the typedef in the header */
    set_linebreaks(s, len, lang, brks,
                   (get_next_char_t)lb_get_next_char_utf8);
}
 846
/**
 * Sets the line breaking information for a UTF-16 input string.  This
 * forwards to #set_linebreaks with #lb_get_next_char_utf16 as the
 * character reader.
 *
 * @param[in]  s     input UTF-16 string
 * @param[in]  len   length of the input in 16-bit words
 * @param[in]  lang  language of the input
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 */
void set_linebreaks_utf16(
        const utf16_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    /* NOTE(review): the cast implies get_next_char_t differs from the
     * reader's own type (presumably in the string parameter) -- confirm
     * against the typedef in the header */
    set_linebreaks(s, len, lang, brks,
                   (get_next_char_t)lb_get_next_char_utf16);
}
 866
/**
 * Sets the line breaking information for a UTF-32 input string.  This
 * forwards to #set_linebreaks with #lb_get_next_char_utf32 as the
 * character reader.
 *
 * @param[in]  s     input UTF-32 string
 * @param[in]  len   length of the input in 32-bit dwords
 * @param[in]  lang  language of the input
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 */
void set_linebreaks_utf32(
        const utf32_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    /* NOTE(review): the cast implies get_next_char_t differs from the
     * reader's own type (presumably in the string parameter) -- confirm
     * against the typedef in the header */
    set_linebreaks(s, len, lang, brks,
                   (get_next_char_t)lb_get_next_char_utf32);
}
 886
 887 /**
 888  * Tells whether a line break can occur between two Unicode characters.
 889  * This is a wrapper function to expose a simple interface.  Generally
 890  * speaking, it is better to use #set_linebreaks_utf32 instead, since
 891  * complicated cases involving combining marks, spaces, etc. cannot be
 892  * correctly processed.
 893  *
 894  * @param char1  the first Unicode character
 895  * @param char2  the second Unicode character
 896  * @param lang   language of the input
 897  * @return       one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 898  *               #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 899  */
 900 int is_line_breakable(
 901         utf32_t char1,
 902         utf32_t char2,
 903         const char* lang)
 904 {
 905     utf32_t s[2];
 906     char brks[2];
 907     s[0] = char1;
 908     s[1] = char2;
 909     set_linebreaks_utf32(s, 2, lang, brks);
 910     return brks[0];
 911 }