text/dali/internal/libunibreak/wordbreak.c

   1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Word breaking in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
   8  *
   9  * This software is provided 'as-is', without any express or implied
  10  * warranty.  In no event will the author be held liable for any damages
  11  * arising from the use of this software.
  12  *
  13  * Permission is granted to anyone to use this software for any purpose,
  14  * including commercial applications, and to alter it and redistribute
  15  * it freely, subject to the following restrictions:
  16  *
  17  * 1. The origin of this software must not be misrepresented; you must
  18  *    not claim that you wrote the original software.  If you use this
  19  *    software in a product, an acknowledgement in the product
  20  *    documentation would be appreciated but is not required.
  21  * 2. Altered source versions must be plainly marked as such, and must
  22  *    not be misrepresented as being the original software.
  23  * 3. This notice may not be removed or altered from any source
  24  *    distribution.
  25  *
  26  * The main reference is Unicode Standard Annex 29 (UAX #29):
  27  *      <URL:http://unicode.org/reports/tr29>
  28  *
  29  * When this library was designed, this annex was at Revision 17, for
  30  * Unicode 6.0.0:
  31  *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
  32  *
  33  * This library has been updated according to Revision 21, for
  34  * Unicode 6.2.0:
  35  *      <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
  36  *
  37  * The Unicode Terms of Use are available at
  38  *      <URL:http://www.unicode.org/copyright.html>
  39  */
  40
  41 /**
  42  * @file    wordbreak.c
  43  *
  44  * Implementation of the word breaking algorithm as described in Unicode
  45  * Standard Annex 29.
  46  *
  47  * @version 2.4, 2013/09/28
  48  * @author  Tom Hacohen
  49  */
  50
  51 #include <assert.h>
  52 #include <stddef.h>
  53 #include <string.h>
  54 #include "linebreak.h"
  55 #include "linebreakdef.h"
  56
  57 #include "wordbreak.h"
  58 #include "wordbreakdata.c"
  59
  60 #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
  61
/**
 * Initializes the wordbreak internals.
 *
 * The function body is empty: there is currently no state to set up.  It
 * is kept as an initialization hook in case a future version needs one.
 */
void init_wordbreak(void)
{
}
  69
  70 /**
  71  * Gets the word breaking class of a character.
  72  *
  73  * @param ch   character to check
  74  * @param wbp  pointer to the wbp breaking properties array
  75  * @param len  size of the wbp array in number of items
  76  * @return     the word breaking class if found; \c WBP_Any otherwise
  77  */
  78 static enum WordBreakClass get_char_wb_class(
  79         utf32_t ch,
  80         struct WordBreakProperties *wbp,
  81         size_t len)
  82 {
  83     int min = 0;
  84     int max = len - 1;
  85     int mid;
  86
  87     do
  88     {
  89         mid = (min + max) / 2;
  90
  91         if (ch < wbp[mid].start)
  92             max = mid - 1;
  93         else if (ch > wbp[mid].end)
  94             min = mid + 1;
  95         else
  96             return wbp[mid].prop;
  97     }
  98     while (min <= max);
  99
 100     return WBP_Any;
 101 }
 102
 103 /**
 104  * Sets the word break types to a specific value in a range.
 105  *
 106  * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
 107  * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
 108  * cells that we really don't want to break after.
 109  *
 110  * @param[in]  s             input string
 111  * @param[out] brks          breaks array to fill
 112  * @param[in]  posStart      start position
 113  * @param[in]  posEnd        end position (exclusive)
 114  * @param[in]  len           length of the string
 115  * @param[in]  brkType       breaks type to use
 116  * @param[in] get_next_char  function to get the next UTF-32 character
 117  */
 118 static void set_brks_to(
 119         const void *s,
 120         char *brks,
 121         size_t posStart,
 122         size_t posEnd,
 123         size_t len,
 124         char brkType,
 125         get_next_char_t get_next_char)
 126 {
 127     size_t posNext = posStart;
 128     while (posNext < posEnd)
 129     {
 130         utf32_t ch;
 131         ch = get_next_char(s, len, &posNext);
 132         assert(ch != EOS);
 133         for (; posStart < posNext - 1; ++posStart)
 134             brks[posStart] = WORDBREAK_INSIDEACHAR;
 135         assert(posStart == posNext - 1);
 136
 137         /* Only set it if we haven't set it not to break before. */
 138         if (brks[posStart] != WORDBREAK_NOBREAK)
 139             brks[posStart] = brkType;
 140         posStart = posNext;
 141     }
 142 }
 143
 144 /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
 145 #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
 146                        (cls == WBP_LF))
 147
 148 /**
 149  * Sets the word breaking information for a generic input string.
 150  *
 151  * @param[in]  s             input string
 152  * @param[in]  len           length of the input
 153  * @param[in]  lang          language of the input
 154  * @param[out] brks          pointer to the output breaking data, containing
 155  *                           #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 156  *                           #WORDBREAK_INSIDEACHAR
 157  * @param[in] get_next_char  function to get the next UTF-32 character
 158  */
 159 static void set_wordbreaks(
 160         const void *s,
 161         size_t len,
 162         const char *lang,
 163         char *brks,
 164         get_next_char_t get_next_char)
 165 {
 166     enum WordBreakClass wbcLast = WBP_Undefined;
 167     /* wbcSeqStart is the class that started the current sequence.
 168      * WBP_Undefined is a special case that means "sot".
 169      * This value is the class that is at the start of the current rule
 170      * matching sequence. For example, in case of Numeric+MidNum+Numeric
 171      * it'll be Numeric all the way.
 172      */
 173     enum WordBreakClass wbcSeqStart = WBP_Undefined;
 174     utf32_t ch;
 175     size_t posNext = 0;
 176     size_t posCur = 0;
 177     size_t posLast = 0;
 178
 179     /* TODO: Language-specific specialization. */
 180     (void) lang;
 181
 182     /* Init brks. */
 183     memset(brks, WORDBREAK_BREAK, len);
 184
 185     ch = get_next_char(s, len, &posNext);
 186
 187     while (ch != EOS)
 188     {
 189         enum WordBreakClass wbcCur;
 190         wbcCur = get_char_wb_class(ch, wb_prop_default,
 191                                    ARRAY_LEN(wb_prop_default));
 192
 193         switch (wbcCur)
 194         {
 195         case WBP_CR:
 196             /* WB3b */
 197             set_brks_to(s, brks, posLast, posCur, len,
 198                         WORDBREAK_BREAK, get_next_char);
 199             wbcSeqStart = wbcCur;
 200             posLast = posCur;
 201             break;
 202
 203         case WBP_LF:
 204             if (wbcSeqStart == WBP_CR) /* WB3 */
 205             {
 206                 set_brks_to(s, brks, posLast, posCur, len,
 207                             WORDBREAK_NOBREAK, get_next_char);
 208                 wbcSeqStart = wbcCur;
 209                 posLast = posCur;
 210                 break;
 211             }
 212             /* Fall off */
 213
 214         case WBP_Newline:
 215             /* WB3a,3b */
 216             set_brks_to(s, brks, posLast, posCur, len,
 217                         WORDBREAK_BREAK, get_next_char);
 218             wbcSeqStart = wbcCur;
 219             posLast = posCur;
 220             break;
 221
 222         case WBP_Extend:
 223         case WBP_Format:
 224             /* WB4 - If not the first char/after a newline (WB3a,3b), skip
 225              * this class, set it to be the same as the prev, and mark
 226              * brks not to break before them. */
 227             if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
 228             {
 229                 set_brks_to(s, brks, posLast, posCur, len,
 230                             WORDBREAK_BREAK, get_next_char);
 231                 wbcSeqStart = wbcCur;
 232             }
 233             else
 234             {
 235                 /* It's surely not the first */
 236                 brks[posCur - 1] = WORDBREAK_NOBREAK;
 237                 /* "inherit" the previous class. */
 238                 wbcCur = wbcLast;
 239             }
 240             break;
 241
 242         case WBP_Katakana:
 243             if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
 244                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 245             {
 246                 set_brks_to(s, brks, posLast, posCur, len,
 247                             WORDBREAK_NOBREAK, get_next_char);
 248             }
 249             /* No rule found, reset */
 250             else
 251             {
 252                 set_brks_to(s, brks, posLast, posCur, len,
 253                             WORDBREAK_BREAK, get_next_char);
 254             }
 255             wbcSeqStart = wbcCur;
 256             posLast = posCur;
 257             break;
 258
 259         case WBP_ALetter:
 260         case WBP_Hebrew:
 261             if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
 262                     (wbcLast == WBP_Numeric) || /* WB10 */
 263                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 264             {
 265                 set_brks_to(s, brks, posLast, posCur, len,
 266                             WORDBREAK_NOBREAK, get_next_char);
 267             }
 268             /* No rule found, reset */
 269             else
 270             {
 271                 set_brks_to(s, brks, posLast, posCur, len,
 272                             WORDBREAK_BREAK, get_next_char);
 273             }
 274             wbcSeqStart = wbcCur;
 275             posLast = posCur;
 276             break;
 277
 278         case WBP_MidNumLet:
 279             if ((wbcLast == WBP_ALetter) || /* WB6,7 */
 280                     (wbcLast == WBP_Numeric)) /* WB11,12 */
 281             {
 282                 /* Go on */
 283             }
 284             else
 285             {
 286                 set_brks_to(s, brks, posLast, posCur, len,
 287                             WORDBREAK_BREAK, get_next_char);
 288                 wbcSeqStart = wbcCur;
 289                 posLast = posCur;
 290             }
 291             break;
 292
 293         case WBP_MidLetter:
 294             if (wbcLast == WBP_ALetter) /* WB6,7 */
 295             {
 296                 /* Go on */
 297             }
 298             else
 299             {
 300                 set_brks_to(s, brks, posLast, posCur, len,
 301                             WORDBREAK_BREAK, get_next_char);
 302                 wbcSeqStart = wbcCur;
 303                 posLast = posCur;
 304             }
 305             break;
 306
 307         case WBP_MidNum:
 308             if (wbcLast == WBP_Numeric) /* WB11,12 */
 309             {
 310                 /* Go on */
 311             }
 312             else
 313             {
 314                 set_brks_to(s, brks, posLast, posCur, len,
 315                             WORDBREAK_BREAK, get_next_char);
 316                 wbcSeqStart = wbcCur;
 317                 posLast = posCur;
 318             }
 319             break;
 320
 321         case WBP_Numeric:
 322             if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
 323                     (wbcLast == WBP_ALetter) || /* WB9 */
 324                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 325             {
 326                 set_brks_to(s, brks, posLast, posCur, len,
 327                             WORDBREAK_NOBREAK, get_next_char);
 328             }
 329             /* No rule found, reset */
 330             else
 331             {
 332                 set_brks_to(s, brks, posLast, posCur, len,
 333                             WORDBREAK_BREAK, get_next_char);
 334             }
 335             wbcSeqStart = wbcCur;
 336             posLast = posCur;
 337             break;
 338
 339         case WBP_ExtendNumLet:
 340             /* WB13a,13b */
 341             if ((wbcSeqStart == wbcLast) &&
 342                 ((wbcLast == WBP_ALetter) ||
 343                  (wbcLast == WBP_Numeric) ||
 344                  (wbcLast == WBP_Katakana) ||
 345                  (wbcLast == WBP_ExtendNumLet)))
 346             {
 347                 set_brks_to(s, brks, posLast, posCur, len,
 348                             WORDBREAK_NOBREAK, get_next_char);
 349             }
 350             /* No rule found, reset */
 351             else
 352             {
 353                 set_brks_to(s, brks, posLast, posCur, len,
 354                             WORDBREAK_BREAK, get_next_char);
 355             }
 356             wbcSeqStart = wbcCur;
 357             posLast = posCur;
 358             break;
 359
 360         case WBP_Regional:
 361             /* WB13c */
 362             if (wbcSeqStart == WBP_Regional)
 363             {
 364                 set_brks_to(s, brks, posLast, posCur, len,
 365                             WORDBREAK_NOBREAK, get_next_char);
 366             }
 367             wbcSeqStart = wbcCur;
 368             posLast = posCur;
 369             break;
 370
 371         case WBP_Any:
 372             /* Allow breaks and reset */
 373             set_brks_to(s, brks, posLast, posCur, len,
 374                         WORDBREAK_BREAK, get_next_char);
 375             wbcSeqStart = wbcCur;
 376             posLast = posCur;
 377             break;
 378
 379         default:
 380             /* Error, should never get here! */
 381             assert(0);
 382             break;
 383         }
 384
 385         wbcLast = wbcCur;
 386         posCur = posNext;
 387         ch = get_next_char(s, len, &posNext);
 388     }
 389
 390     /* WB2 */
 391     set_brks_to(s, brks, posLast, posNext, len,
 392                 WORDBREAK_BREAK, get_next_char);
 393 }
 394
 395 /**
 396  * Sets the word breaking information for a UTF-8 input string.
 397  *
 398  * @param[in]  s     input UTF-8 string
 399  * @param[in]  len   length of the input
 400  * @param[in]  lang  language of the input
 401  * @param[out] brks  pointer to the output breaking data, containing
 402  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 403  *                   #WORDBREAK_INSIDEACHAR
 404  */
 405 void set_wordbreaks_utf8(
 406         const utf8_t *s,
 407         size_t len,
 408         const char *lang,
 409         char *brks)
 410 {
 411     set_wordbreaks(s, len, lang, brks,
 412                    (get_next_char_t)lb_get_next_char_utf8);
 413 }
 414
 415 /**
 416  * Sets the word breaking information for a UTF-16 input string.
 417  *
 418  * @param[in]  s     input UTF-16 string
 419  * @param[in]  len   length of the input
 420  * @param[in]  lang  language of the input
 421  * @param[out] brks  pointer to the output breaking data, containing
 422  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 423  *                   #WORDBREAK_INSIDEACHAR
 424  */
 425 void set_wordbreaks_utf16(
 426         const utf16_t *s,
 427         size_t len,
 428         const char *lang,
 429         char *brks)
 430 {
 431     set_wordbreaks(s, len, lang, brks,
 432                    (get_next_char_t)lb_get_next_char_utf16);
 433 }
 434
 435 /**
 436  * Sets the word breaking information for a UTF-32 input string.
 437  *
 438  * @param[in]  s     input UTF-32 string
 439  * @param[in]  len   length of the input
 440  * @param[in]  lang  language of the input
 441  * @param[out] brks  pointer to the output breaking data, containing
 442  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 443  *                   #WORDBREAK_INSIDEACHAR
 444  */
 445 void set_wordbreaks_utf32(
 446         const utf32_t *s,
 447         size_t len,
 448         const char *lang,
 449         char *brks)
 450 {
 451     set_wordbreaks(s, len, lang, brks,
 452                    (get_next_char_t)lb_get_next_char_utf32);
 453 }