text/dali/internal/libunibreak/linebreak.c

   1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Line breaking in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
   8  * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
   9  *
  10  * This software is provided 'as-is', without any express or implied
  11  * warranty.  In no event will the author be held liable for any damages
  12  * arising from the use of this software.
  13  *
  14  * Permission is granted to anyone to use this software for any purpose,
  15  * including commercial applications, and to alter it and redistribute
  16  * it freely, subject to the following restrictions:
  17  *
  18  * 1. The origin of this software must not be misrepresented; you must
  19  *    not claim that you wrote the original software.  If you use this
  20  *    software in a product, an acknowledgement in the product
  21  *    documentation would be appreciated but is not required.
  22  * 2. Altered source versions must be plainly marked as such, and must
  23  *    not be misrepresented as being the original software.
  24  * 3. This notice may not be removed or altered from any source
  25  *    distribution.
  26  *
  27  * The main reference is Unicode Standard Annex 14 (UAX #14):
  28  *      <URL:http://www.unicode.org/reports/tr14/>
  29  *
  30  * When this library was designed, this annex was at Revision 19, for
  31  * Unicode 5.0.0:
  32  *      <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
  33  *
  34  * This library has been updated according to Revision 30, for
  35  * Unicode 6.2.0:
  36  *      <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
  37  *
  38  * The Unicode Terms of Use are available at
  39  *      <URL:http://www.unicode.org/copyright.html>
  40  */
  41
  42 /**
  43  * @file    linebreak.c
  44  *
  45  * Implementation of the line breaking algorithm as described in Unicode
  46  * Standard Annex 14.
  47  *
  48  * @version 2.5, 2013/11/14
  49  * @author  Wu Yongwei
  50  * @author  Petr Filipsky
  51  */
  52
  53 #include <assert.h>
  54 #include <stddef.h>
  55 #include <string.h>
  56 #include "linebreak.h"
  57 #include "linebreakdef.h"
  58
  59 /**
  60  * Special value used internally to indicate an undefined break result.
  61  */
  62 #define LINEBREAK_UNDEFINED -1
  63
  64 /**
  65  * Size of the second-level index to the line breaking properties.
  66  */
  67 #define LINEBREAK_INDEX_SIZE 40
  68
/**
 * Version number of the library, exported as a run-time constant.  It is
 * initialized from the compile-time macro LINEBREAK_VERSION.
 */
const int linebreak_version = LINEBREAK_VERSION;
  73
  74 /**
  75  * Enumeration of break actions.  They are used in the break action
  76  * pair table below.
  77  */
  78 enum BreakAction
  79 {
  80     DIR_BRK,        /**< Direct break opportunity */
  81     IND_BRK,        /**< Indirect break opportunity */
  82     CMI_BRK,        /**< Indirect break opportunity for combining marks */
  83     CMP_BRK,        /**< Prohibited break for combining marks */
  84     PRH_BRK         /**< Prohibited break */
  85 };
  86
  87 /**
  88  * Break action pair table.  This is a direct mapping of Table 2 of
  89  * Unicode Standard Annex 14, Revision 30.
  90  */
  91 static enum BreakAction baTable[LBP_RI][LBP_RI] = {
  92     {   /* OP */
  93         PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  94         PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  95         PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  96         CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
  97         PRH_BRK },
  98     {   /* CL */
  99         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
 100         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 101         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 102         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 103         DIR_BRK },
 104     {   /* CP */
 105         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
 106         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 107         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 108         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 109         DIR_BRK },
 110     {   /* QU */
 111         PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 112         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 113         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 114         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 115         IND_BRK },
 116     {   /* GL */
 117         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 118         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 119         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 120         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 121         IND_BRK },
 122     {   /* NS */
 123         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 124         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 125         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 126         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 127         DIR_BRK },
 128     {   /* EX */
 129         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 130         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 131         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 132         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 133         DIR_BRK },
 134     {   /* SY */
 135         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 136         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
 137         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 138         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 139         DIR_BRK },
 140     {   /* IS */
 141         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 142         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 143         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 144         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 145         DIR_BRK },
 146     {   /* PR */
 147         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 148         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 149         IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 150         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 151         DIR_BRK },
 152     {   /* PO */
 153         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 154         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 155         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 156         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 157         DIR_BRK },
 158     {   /* NU */
 159         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 160         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 161         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 162         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 163         DIR_BRK },
 164     {   /* AL */
 165         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 166         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 167         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 168         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 169         DIR_BRK },
 170     {   /* HL */
 171         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 172         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 173         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 174         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 175         DIR_BRK },
 176     {   /* ID */
 177         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 178         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 179         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 180         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 181         DIR_BRK },
 182     {   /* IN */
 183         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 184         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 185         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 186         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 187         DIR_BRK },
 188     {   /* HY */
 189         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
 190         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
 191         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 192         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 193         DIR_BRK },
 194     {   /* BA */
 195         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
 196         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 197         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 198         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 199         DIR_BRK },
 200     {   /* BB */
 201         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 202         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 203         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 204         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 205         IND_BRK },
 206     {   /* B2 */
 207         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 208         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 209         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
 210         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 211         DIR_BRK },
 212     {   /* ZW */
 213         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 214         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 215         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 216         DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 217         DIR_BRK },
 218     {   /* CM */
 219         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 220         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
 221         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 222         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 223         DIR_BRK },
 224     {   /* WJ */
 225         IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 226         PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 227         IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 228         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
 229         IND_BRK },
 230     {   /* H2 */
 231         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 232         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 233         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 234         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
 235         DIR_BRK },
 236     {   /* H3 */
 237         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 238         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 239         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 240         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
 241         DIR_BRK },
 242     {   /* JL */
 243         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 244         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 245         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 246         CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
 247         DIR_BRK },
 248     {   /* JV */
 249         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 250         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 251         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 252         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
 253         DIR_BRK },
 254     {   /* JT */
 255         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 256         PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 257         DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 258         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
 259         DIR_BRK },
 260     {   /* RI */
 261         DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
 262         PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 263         DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
 264         CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 265         IND_BRK },
 266 };
 267
/**
 * Struct for the second-level index to the line breaking properties.
 * Each entry describes one slice of the default properties array: the
 * last code point the slice is responsible for, and where in the array
 * a lookup for such a code point should start scanning.
 */
struct LineBreakPropertiesIndex
{
    utf32_t end;                    /**< Last code point served by this entry (inclusive) */
    struct LineBreakProperties *lbp;/**< First properties entry to scan for this slice */
};
 276
/**
 * Second-level index to the line breaking properties.
 *
 * Only the first entry is initialized statically: it covers every code
 * point up to 0xFFFFFFFF and points at the start of the default
 * properties array, so lookups also work (as a scan from the beginning
 * of that array) when init_linebreak() has not been called.  The
 * remaining entries are filled in by init_linebreak().
 */
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
{
    { 0xFFFFFFFF, lb_prop_default }
};
 284
 285 /**
 286  * Initializes the second-level index to the line breaking properties.
 287  * If it is not called, the performance of #get_char_lb_class_lang (and
 288  * thus the main functionality) can be pretty bad, especially for big
 289  * code points like those of Chinese.
 290  */
 291 void init_linebreak(void)
 292 {
 293     size_t i;
 294     size_t iPropDefault;
 295     size_t len;
 296     size_t step;
 297
 298     len = 0;
 299     while (lb_prop_default[len].prop != LBP_Undefined)
 300         ++len;
 301     step = len / LINEBREAK_INDEX_SIZE;
 302     iPropDefault = 0;
 303     for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
 304     {
 305         lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
 306         iPropDefault += step;
 307         lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
 308     }
 309     lb_prop_index[--i].end = 0xFFFFFFFF;
 310 }
 311
 312 /**
 313  * Gets the language-specific line breaking properties.
 314  *
 315  * @param lang  language of the text
 316  * @return      pointer to the language-specific line breaking
 317  *              properties array if found; \c NULL otherwise
 318  */
 319 static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
 320 {
 321     struct LineBreakPropertiesLang *lbplIter;
 322     if (lang != NULL)
 323     {
 324         for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
 325         {
 326             if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
 327             {
 328                 return lbplIter->lbp;
 329             }
 330         }
 331     }
 332     return NULL;
 333 }
 334
 335 /**
 336  * Gets the line breaking class of a character from a line breaking
 337  * properties array.
 338  *
 339  * @param ch   character to check
 340  * @param lbp  pointer to the line breaking properties array
 341  * @return     the line breaking class if found; \c LBP_XX otherwise
 342  */
 343 static enum LineBreakClass get_char_lb_class(
 344         utf32_t ch,
 345         struct LineBreakProperties *lbp)
 346 {
 347     while (lbp->prop != LBP_Undefined && ch >= lbp->start)
 348     {
 349         if (ch <= lbp->end)
 350             return lbp->prop;
 351         ++lbp;
 352     }
 353     return LBP_XX;
 354 }
 355
 356 /**
 357  * Gets the line breaking class of a character from the default line
 358  * breaking properties array.
 359  *
 360  * @param ch  character to check
 361  * @return    the line breaking class if found; \c LBP_XX otherwise
 362  */
 363 static enum LineBreakClass get_char_lb_class_default(
 364         utf32_t ch)
 365 {
 366     size_t i = 0;
 367     while (ch > lb_prop_index[i].end)
 368         ++i;
 369     assert(i < LINEBREAK_INDEX_SIZE);
 370     return get_char_lb_class(ch, lb_prop_index[i].lbp);
 371 }
 372
 373 /**
 374  * Gets the line breaking class of a character for a specific
 375  * language.  This function will check the language-specific data first,
 376  * and then the default data if there is no language-specific property
 377  * available for the character.
 378  *
 379  * @param ch       character to check
 380  * @param lbpLang  pointer to the language-specific line breaking
 381  *                 properties array
 382  * @return         the line breaking class if found; \c LBP_XX
 383  *                 otherwise
 384  */
 385 static enum LineBreakClass get_char_lb_class_lang(
 386         utf32_t ch,
 387         struct LineBreakProperties *lbpLang)
 388 {
 389     enum LineBreakClass lbcResult;
 390
 391     /* Find the language-specific line breaking class for a character */
 392     if (lbpLang)
 393     {
 394         lbcResult = get_char_lb_class(ch, lbpLang);
 395         if (lbcResult != LBP_XX)
 396             return lbcResult;
 397     }
 398
 399     /* Find the generic language-specific line breaking class, if no
 400      * language context is provided, or language-specific data are not
 401      * available for the specific character in the specified language */
 402     return get_char_lb_class_default(ch);
 403 }
 404
 405 /**
 406  * Resolves the line breaking class for certain ambiguous or complicated
 407  * characters.  They are treated in a simplistic way in this
 408  * implementation.
 409  *
 410  * @param lbc   line breaking class to resolve
 411  * @param lang  language of the text
 412  * @return      the resolved line breaking class
 413  */
 414 static enum LineBreakClass resolve_lb_class(
 415         enum LineBreakClass lbc,
 416         const char *lang)
 417 {
 418     switch (lbc)
 419     {
 420     case LBP_AI:
 421         if (lang != NULL &&
 422                 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
 423                  strncmp(lang, "ja", 2) == 0 || /* Japanese */
 424                  strncmp(lang, "ko", 2) == 0))  /* Korean */
 425         {
 426             return LBP_ID;
 427         }
 428         else
 429         {
 430             return LBP_AL;
 431         }
 432     case LBP_CJ:
 433         /* Simplified for `normal' line breaking.  See
 434          * <url:http://www.unicode.org/reports/tr14/tr14-30.html#CJ>
 435          * for details. */
 436         return LBP_ID;
 437     case LBP_SA:
 438     case LBP_SG:
 439     case LBP_XX:
 440         return LBP_AL;
 441     default:
 442         return lbc;
 443     }
 444 }
 445
 446 /**
 447  * Treats specially for the first character in a line.
 448  *
 449  * @param[in,out] lbpCtx  pointer to the line breaking context
 450  * @pre                   \a lbpCtx->lbcCur has a valid line break class
 451  * @post                  \a lbpCtx->lbcCur has the updated line break class
 452  */
 453 static void treat_first_char(
 454         struct LineBreakContext* lbpCtx)
 455 {
 456     switch (lbpCtx->lbcCur)
 457     {
 458     case LBP_LF:
 459     case LBP_NL:
 460         lbpCtx->lbcCur = LBP_BK;        /* Rule LB5 */
 461         break;
 462     case LBP_CB:
 463         lbpCtx->lbcCur = LBP_BA;        /* Rule LB20 */
 464         break;
 465     case LBP_SP:
 466         lbpCtx->lbcCur = LBP_WJ;        /* Leading space treated as WJ */
 467         break;
 468     default:
 469         break;
 470     }
 471 }
 472
 473 /**
 474  * Tries telling the line break opportunity by simple rules.
 475  *
 476  * @param[in,out] lbpCtx  pointer to the line breaking context
 477  * @pre                   \a lbpCtx->lbcCur has the current line break
 478  *                        class; and \a lbpCtx->lbcNew has the line
 479  *                        break class for the next character
 480  * @post                  \a lbpCtx->lbcCur has the updated line break
 481  *                        class
 482  * @return                break result, one of #LINEBREAK_MUSTBREAK,
 483  *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
 484  *                        if identified; or #LINEBREAK_UNDEFINED if
 485  *                        table lookup is needed
 486  */
 487 static int get_lb_result_simple(
 488         struct LineBreakContext* lbpCtx)
 489 {
 490     if (lbpCtx->lbcCur == LBP_BK
 491         || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
 492     {
 493         return LINEBREAK_MUSTBREAK;     /* Rules LB4 and LB5 */
 494     }
 495
 496     switch (lbpCtx->lbcNew)
 497     {
 498     case LBP_SP:
 499         return LINEBREAK_NOBREAK;       /* Rule LB7; no change to lbcCur */
 500     case LBP_BK:
 501     case LBP_LF:
 502     case LBP_NL:
 503         lbpCtx->lbcCur = LBP_BK;        /* Mandatory break after */
 504         return LINEBREAK_NOBREAK;       /* Rule LB6 */
 505     case LBP_CR:
 506         lbpCtx->lbcCur = LBP_CR;
 507         return LINEBREAK_NOBREAK;       /* Rule LB6 */
 508     case LBP_CB:
 509         lbpCtx->lbcCur = LBP_BA;
 510         return LINEBREAK_ALLOWBREAK;    /* Rule LB20 */
 511     default:
 512         return LINEBREAK_UNDEFINED;     /* Table lookup is needed */
 513     }
 514 }
 515
 516 /**
 517  * Tells the line break opportunity by table lookup.
 518  *
 519  * @param[in,out] lbpCtx  pointer to the line breaking context
 520  * @pre                   \a lbpCtx->lbcCur has the current line break
 521  *                        class; \a lbpCtx->lbcLast has the line break
 522  *                        class for the last character; and \a
 523  *                        lbcCur->lbcNew has the line break class for
 524  *                        the next character
 525  * @post                  \a lbpCtx->lbcCur has the updated line break
 526  *                        class
 527  * @return                break result, one of #LINEBREAK_MUSTBREAK,
 528  *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
 529  */
 530 static int get_lb_result_lookup(
 531         struct LineBreakContext* lbpCtx)
 532 {
 533     /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
 534      * yet implemented below. */
 535     int brk = LINEBREAK_UNDEFINED;
 536     assert((lbpCtx->lbcCur > 0) && (lbpCtx->lbcCur <= LBP_RI));
 537     assert((lbpCtx->lbcNew > 0) && (lbpCtx->lbcNew <= LBP_RI));
 538     switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])
 539     {
 540     case DIR_BRK:
 541         brk = LINEBREAK_ALLOWBREAK;
 542         break;
 543     case CMI_BRK:
 544     case IND_BRK:
 545         brk = (lbpCtx->lbcLast == LBP_SP)
 546             ? LINEBREAK_ALLOWBREAK
 547             : LINEBREAK_NOBREAK;
 548         break;
 549     case CMP_BRK:
 550         brk = LINEBREAK_NOBREAK;
 551         if (lbpCtx->lbcLast != LBP_SP)
 552             return brk;                 /* Do not update lbcCur */
 553         break;
 554     case PRH_BRK:
 555         brk = LINEBREAK_NOBREAK;
 556         break;
 557     }
 558     lbpCtx->lbcCur = lbpCtx->lbcNew;
 559     return brk;
 560 }
 561
 562 /**
 563  * Initializes line breaking context for a given language.
 564  *
 565  * @param[in,out] lbpCtx  pointer to the line breaking context
 566  * @param[in]     ch      the first character to process
 567  * @param[in]     lang    language of the input
 568  * @post                  the line breaking context is initialized
 569  */
 570 void lb_init_break_context(
 571         struct LineBreakContext* lbpCtx,
 572         utf32_t ch,
 573         const char* lang)
 574 {
 575     lbpCtx->lang = lang;
 576     lbpCtx->lbpLang = get_lb_prop_lang(lang);
 577     lbpCtx->lbcLast = LBP_Undefined;
 578     lbpCtx->lbcNew = LBP_Undefined;
 579     lbpCtx->lbcCur = resolve_lb_class(
 580                         get_char_lb_class_lang(ch, lbpCtx->lbpLang),
 581                         lbpCtx->lang);
 582     treat_first_char(lbpCtx);
 583 }
 584
 585 /**
 586  * Updates LineBreakingContext for the next code point and returns
 587  * the detected break.
 588  *
 589  * @param[in,out] lbpCtx  pointer to the line breaking context
 590  * @param[in]     ch      Unicode code point
 591  * @return                break result, one of #LINEBREAK_MUSTBREAK,
 592  *                        #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
 593  * @post                  the line breaking context is updated
 594  */
 595 int lb_process_next_char(
 596         struct LineBreakContext* lbpCtx,
 597         utf32_t ch )
 598 {
 599     int brk;
 600
 601     lbpCtx->lbcLast = lbpCtx->lbcNew;
 602     lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
 603     brk = get_lb_result_simple(lbpCtx);
 604     switch (brk)
 605     {
 606     case LINEBREAK_MUSTBREAK:
 607         lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
 608         treat_first_char(lbpCtx);
 609         break;
 610     case LINEBREAK_UNDEFINED:
 611         lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
 612         brk = get_lb_result_lookup(lbpCtx);
 613         break;
 614     default:
 615         break;
 616     }
 617     return brk;
 618 }
 619
 620 /**
 621  * Gets the next Unicode character in a UTF-8 sequence.  The index will
 622  * be advanced to the next complete character, unless the end of string
 623  * is reached in the middle of a UTF-8 sequence.
 624  *
 625  * @param[in]     s    input UTF-8 string
 626  * @param[in]     len  length of the string in bytes
 627  * @param[in,out] ip   pointer to the index
 628  * @return             the Unicode character beginning at the index; or
 629  *                     #EOS if end of input is encountered
 630  */
 631 utf32_t lb_get_next_char_utf8(
 632         const utf8_t *s,
 633         size_t len,
 634         size_t *ip)
 635 {
 636     utf8_t ch;
 637     utf32_t res;
 638
 639     assert(*ip <= len);
 640     if (*ip == len)
 641         return EOS;
 642     ch = s[*ip];
 643
 644     if (ch < 0xC2 || ch > 0xF4)
 645     {   /* One-byte sequence, tail (should not occur), or invalid */
 646         *ip += 1;
 647         return ch;
 648     }
 649     else if (ch < 0xE0)
 650     {   /* Two-byte sequence */
 651         if (*ip + 2 > len)
 652             return EOS;
 653         res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
 654         *ip += 2;
 655         return res;
 656     }
 657     else if (ch < 0xF0)
 658     {   /* Three-byte sequence */
 659         if (*ip + 3 > len)
 660             return EOS;
 661         res = ((ch & 0x0F) << 12) +
 662               ((s[*ip + 1] & 0x3F) << 6) +
 663               ((s[*ip + 2] & 0x3F));
 664         *ip += 3;
 665         return res;
 666     }
 667     else
 668     {   /* Four-byte sequence */
 669         if (*ip + 4 > len)
 670             return EOS;
 671         res = ((ch & 0x07) << 18) +
 672               ((s[*ip + 1] & 0x3F) << 12) +
 673               ((s[*ip + 2] & 0x3F) << 6) +
 674               ((s[*ip + 3] & 0x3F));
 675         *ip += 4;
 676         return res;
 677     }
 678 }
 679
 680 /**
 681  * Gets the next Unicode character in a UTF-16 sequence.  The index will
 682  * be advanced to the next complete character, unless the end of string
 683  * is reached in the middle of a UTF-16 surrogate pair.
 684  *
 685  * @param[in]     s    input UTF-16 string
 686  * @param[in]     len  length of the string in words
 687  * @param[in,out] ip   pointer to the index
 688  * @return             the Unicode character beginning at the index; or
 689  *                     #EOS if end of input is encountered
 690  */
 691 utf32_t lb_get_next_char_utf16(
 692         const utf16_t *s,
 693         size_t len,
 694         size_t *ip)
 695 {
 696     utf16_t ch;
 697
 698     assert(*ip <= len);
 699     if (*ip == len)
 700         return EOS;
 701     ch = s[(*ip)++];
 702
 703     if (ch < 0xD800 || ch > 0xDBFF)
 704     {   /* If the character is not a high surrogate */
 705         return ch;
 706     }
 707     if (*ip == len)
 708     {   /* If the input ends here (an error) */
 709         --(*ip);
 710         return EOS;
 711     }
 712     if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
 713     {   /* If the next character is not the low surrogate (an error) */
 714         return ch;
 715     }
 716     /* Return the constructed character and advance the index again */
 717     return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
 718 }
 719
 720 /**
 721  * Gets the next Unicode character in a UTF-32 sequence.  The index will
 722  * be advanced to the next character.
 723  *
 724  * @param[in]     s    input UTF-32 string
 725  * @param[in]     len  length of the string in dwords
 726  * @param[in,out] ip   pointer to the index
 727  * @return             the Unicode character beginning at the index; or
 728  *                     #EOS if end of input is encountered
 729  */
 730 utf32_t lb_get_next_char_utf32(
 731         const utf32_t *s,
 732         size_t len,
 733         size_t *ip)
 734 {
 735     assert(*ip <= len);
 736     if (*ip == len)
 737         return EOS;
 738     return s[(*ip)++];
 739 }
 740
 741 /**
 742  * Sets the line breaking information for a generic input string.
 743  *
 744  * @param[in]  s             input string
 745  * @param[in]  len           length of the input
 746  * @param[in]  lang          language of the input
 747  * @param[out] brks          pointer to the output breaking data,
 748  *                           containing #LINEBREAK_MUSTBREAK,
 749  *                           #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
 750  *                           or #LINEBREAK_INSIDEACHAR
 751  * @param[in] get_next_char  function to get the next UTF-32 character
 752  */
 753 void set_linebreaks(
 754         const void *s,
 755         size_t len,
 756         const char *lang,
 757         char *brks,
 758         get_next_char_t get_next_char)
 759 {
 760     utf32_t ch;
 761     struct LineBreakContext lbCtx;
 762     size_t posCur = 0;
 763     size_t posLast = 0;
 764
 765     --posLast;  /* To be ++'d later */
 766     ch = get_next_char(s, len, &posCur);
 767     if (ch == EOS)
 768         return;
 769     lb_init_break_context(&lbCtx, ch, lang);
 770
 771     /* Process a line till an explicit break or end of string */
 772     for (;;)
 773     {
 774         for (++posLast; posLast < posCur - 1; ++posLast)
 775         {
 776             brks[posLast] = LINEBREAK_INSIDEACHAR;
 777         }
 778         assert(posLast == posCur - 1);
 779         ch = get_next_char(s, len, &posCur);
 780         if (ch == EOS)
 781             break;
 782         brks[posLast] = lb_process_next_char(&lbCtx, ch);
 783     }
 784
 785     assert(posLast == posCur - 1 && posCur <= len);
 786     /* Break after the last character */
 787     brks[posLast] = LINEBREAK_MUSTBREAK;
 788     /* When the input contains incomplete sequences */
 789     while (posCur < len)
 790     {
 791         brks[posCur++] = LINEBREAK_INSIDEACHAR;
 792     }
 793 }
 794
 795 /**
 796  * Sets the line breaking information for a UTF-8 input string.
 797  *
 798  * @param[in]  s     input UTF-8 string
 799  * @param[in]  len   length of the input
 800  * @param[in]  lang  language of the input
 801  * @param[out] brks  pointer to the output breaking data, containing
 802  *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 803  *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 804  */
 805 void set_linebreaks_utf8(
 806         const utf8_t *s,
 807         size_t len,
 808         const char *lang,
 809         char *brks)
 810 {
 811     set_linebreaks(s, len, lang, brks,
 812                    (get_next_char_t)lb_get_next_char_utf8);
 813 }
 814
 815 /**
 816  * Sets the line breaking information for a UTF-16 input string.
 817  *
 818  * @param[in]  s     input UTF-16 string
 819  * @param[in]  len   length of the input
 820  * @param[in]  lang  language of the input
 821  * @param[out] brks  pointer to the output breaking data, containing
 822  *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 823  *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 824  */
 825 void set_linebreaks_utf16(
 826         const utf16_t *s,
 827         size_t len,
 828         const char *lang,
 829         char *brks)
 830 {
 831     set_linebreaks(s, len, lang, brks,
 832                    (get_next_char_t)lb_get_next_char_utf16);
 833 }
 834
 835 /**
 836  * Sets the line breaking information for a UTF-32 input string.
 837  *
 838  * @param[in]  s     input UTF-32 string
 839  * @param[in]  len   length of the input
 840  * @param[in]  lang  language of the input
 841  * @param[out] brks  pointer to the output breaking data, containing
 842  *                   #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 843  *                   #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 844  */
 845 void set_linebreaks_utf32(
 846         const utf32_t *s,
 847         size_t len,
 848         const char *lang,
 849         char *brks)
 850 {
 851     set_linebreaks(s, len, lang, brks,
 852                    (get_next_char_t)lb_get_next_char_utf32);
 853 }
 854
 855 /**
 856  * Tells whether a line break can occur between two Unicode characters.
 857  * This is a wrapper function to expose a simple interface.  Generally
 858  * speaking, it is better to use #set_linebreaks_utf32 instead, since
 859  * complicated cases involving combining marks, spaces, etc. cannot be
 860  * correctly processed.
 861  *
 862  * @param char1  the first Unicode character
 863  * @param char2  the second Unicode character
 864  * @param lang   language of the input
 865  * @return       one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 866  *               #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 867  */
 868 int is_line_breakable(
 869         utf32_t char1,
 870         utf32_t char2,
 871         const char* lang)
 872 {
 873     utf32_t s[2];
 874     char brks[2];
 875     s[0] = char1;
 876     s[1] = char2;
 877     set_linebreaks_utf32(s, 2, lang, brks);
 878     return brks[0];
 879 }