third-party/libunibreak/wordbreak.c

   1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Word breaking in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
   8  *
   9  * This software is provided 'as-is', without any express or implied
  10  * warranty.  In no event will the author be held liable for any damages
  11  * arising from the use of this software.
  12  *
  13  * Permission is granted to anyone to use this software for any purpose,
  14  * including commercial applications, and to alter it and redistribute
  15  * it freely, subject to the following restrictions:
  16  *
  17  * 1. The origin of this software must not be misrepresented; you must
  18  *    not claim that you wrote the original software.  If you use this
  19  *    software in a product, an acknowledgement in the product
  20  *    documentation would be appreciated but is not required.
  21  * 2. Altered source versions must be plainly marked as such, and must
  22  *    not be misrepresented as being the original software.
  23  * 3. This notice may not be removed or altered from any source
  24  *    distribution.
  25  *
  26  * The main reference is Unicode Standard Annex 29 (UAX #29):
  27  *      <URL:http://unicode.org/reports/tr29>
  28  *
  29  * When this library was designed, this annex was at Revision 17, for
  30  * Unicode 6.0.0:
  31  *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
  32  *
  33  * This library has been updated according to Revision 21, for
  34  * Unicode 6.2.0:
  35  *      <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
  36  *
  37  * The Unicode Terms of Use are available at
  38  *      <URL:http://www.unicode.org/copyright.html>
  39  */
  40
  41 /**
  42  * @file    wordbreak.c
  43  *
  44  * Implementation of the word breaking algorithm as described in Unicode
  45  * Standard Annex 29.
  46  *
  47  * @version 2.4, 2013/09/28
  48  * @author  Tom Hacohen
  49  */
  50
  51 #include <assert.h>
  52 #include <stddef.h>
  53 #include <string.h>
  54 #include "linebreak.h"
  55 #include "linebreakdef.h"
  56
  57 #include "wordbreak.h"
  58 #include "wordbreakdata.c"
  59
  60 #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
  61
  62 /**
  63  * Initializes the wordbreak internals.  It currently does nothing, but
  64  * it may in the future.
  65  */
  66 void init_wordbreak(void)
  67 {
  68 }
  69
  70 /**
  71  * Gets the word breaking class of a character.
  72  *
  73  * @param ch   character to check
  74  * @param wbp  pointer to the wbp breaking properties array
  75  * @param len  size of the wbp array in number of items
  76  * @return     the word breaking class if found; \c WBP_Any otherwise
  77  */
  78 static enum WordBreakClass get_char_wb_class(
  79         utf32_t ch,
  80         struct WordBreakProperties *wbp,
  81         size_t len)
  82 {
  83     int min = 0;
  84     int max = len - 1;
  85     int mid;
  86
  87     do
  88     {
  89         mid = (min + max) / 2;
  90
  91         if (ch < wbp[mid].start)
  92             max = mid - 1;
  93         else if (ch > wbp[mid].end)
  94             min = mid + 1;
  95         else
  96             return wbp[mid].prop;
  97     }
  98     while (min <= max);
  99
 100     return WBP_Any;
 101 }
 102
 103 /**
 104  * Sets the word break types to a specific value in a range.
 105  *
 106  * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
 107  * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
 108  * cells that we really don't want to break after.
 109  *
 110  * @param[in]  s             input string
 111  * @param[out] brks          breaks array to fill
 112  * @param[in]  posStart      start position
 113  * @param[in]  posEnd        end position (exclusive)
 114  * @param[in]  len           length of the string
 115  * @param[in]  brkType       breaks type to use
 116  * @param[in] get_next_char  function to get the next UTF-32 character
 117  */
 118 static void set_brks_to(
 119         const void *s,
 120         char *brks,
 121         size_t posStart,
 122         size_t posEnd,
 123         size_t len,
 124         char brkType,
 125         get_next_char_t get_next_char)
 126 {
 127     size_t posNext = posStart;
 128     while (posNext < posEnd)
 129     {
 130         utf32_t ch;
 131         ch = get_next_char(s, len, &posNext);
 132         assert(ch != EOS);
 133         for (; posStart < posNext - 1; ++posStart)
 134             brks[posStart] = WORDBREAK_INSIDEACHAR;
 135         assert(posStart == posNext - 1);
 136
 137         /* Only set it if we haven't set it not to break before. */
 138         if (brks[posStart] != WORDBREAK_NOBREAK)
 139             brks[posStart] = brkType;
 140         posStart = posNext;
 141     }
 142 }
 143
 144 /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
 145 #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
 146                        (cls == WBP_LF))
 147
 148 /**
 149  * Sets the word breaking information for a generic input string.
 150  *
 151  * @param[in]  s             input string
 152  * @param[in]  len           length of the input
 153  * @param[in]  lang          language of the input
 154  * @param[out] brks          pointer to the output breaking data, containing
 155  *                           #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 156  *                           #WORDBREAK_INSIDEACHAR
 157  * @param[in] get_next_char  function to get the next UTF-32 character
 158  */
 159 static void set_wordbreaks(
 160         const void *s,
 161         size_t len,
 162         const char *lang,
 163         char *brks,
 164         get_next_char_t get_next_char)
 165 {
 166     enum WordBreakClass wbcLast = WBP_Undefined;
 167     /* wbcSeqStart is the class that started the current sequence.
 168      * WBP_Undefined is a special case that means "sot".
 169      * This value is the class that is at the start of the current rule
 170      * matching sequence. For example, in case of Numeric+MidNum+Numeric
 171      * it'll be Numeric all the way.
 172      */
 173     enum WordBreakClass wbcSeqStart = WBP_Undefined;
 174     utf32_t ch;
 175     size_t posNext = 0;
 176     size_t posCur = 0;
 177     size_t posLast = 0;
 178
 179     /* TODO: Language-specific specialization. */
 180     (void) lang;
 181
 182     /* Init brks. */
 183     memset(brks, WORDBREAK_BREAK, len);
 184
 185     ch = get_next_char(s, len, &posNext);
 186
 187     while (ch != EOS)
 188     {
 189         enum WordBreakClass wbcCur;
 190         wbcCur = get_char_wb_class(ch, wb_prop_default,
 191                                    ARRAY_LEN(wb_prop_default));
 192
 193         switch (wbcCur)
 194         {
 195         case WBP_CR:
 196             /* WB3b */
 197             set_brks_to(s, brks, posLast, posCur, len,
 198                         WORDBREAK_BREAK, get_next_char);
 199             wbcSeqStart = wbcCur;
 200             posLast = posCur;
 201             break;
 202
 203         case WBP_LF:
 204             if (wbcSeqStart == WBP_CR) /* WB3 */
 205             {
 206                 set_brks_to(s, brks, posLast, posCur, len,
 207                             WORDBREAK_NOBREAK, get_next_char);
 208                 wbcSeqStart = wbcCur;
 209                 posLast = posCur;
 210                 break;
 211             }
 212             /* Fall off */
 213
 214         case WBP_Newline:
 215             /* WB3a,3b */
 216             set_brks_to(s, brks, posLast, posCur, len,
 217                         WORDBREAK_BREAK, get_next_char);
 218             wbcSeqStart = wbcCur;
 219             posLast = posCur;
 220             break;
 221
 222         case WBP_Extend:
 223         case WBP_Format:
 224             /* WB4 - If not the first char/after a newline (WB3a,3b), skip
 225              * this class, set it to be the same as the prev, and mark
 226              * brks not to break before them. */
 227             if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
 228             {
 229                 set_brks_to(s, brks, posLast, posCur, len,
 230                             WORDBREAK_BREAK, get_next_char);
 231                 wbcSeqStart = wbcCur;
 232             }
 233             else
 234             {
 235                 /* It's surely not the first */
 236                 brks[posCur - 1] = WORDBREAK_NOBREAK;
 237                 /* "inherit" the previous class. */
 238                 wbcCur = wbcLast;
 239             }
 240             break;
 241
 242         case WBP_Katakana:
 243             if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
 244                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 245             {
 246                 set_brks_to(s, brks, posLast, posCur, len,
 247                             WORDBREAK_NOBREAK, get_next_char);
 248             }
 249             /* No rule found, reset */
 250             else
 251             {
 252                 set_brks_to(s, brks, posLast, posCur, len,
 253                             WORDBREAK_BREAK, get_next_char);
 254             }
 255             wbcSeqStart = wbcCur;
 256             posLast = posCur;
 257             break;
 258
 259         case WBP_ALetter:
 260             if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
 261                     (wbcLast == WBP_Numeric) || /* WB10 */
 262                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 263             {
 264                 set_brks_to(s, brks, posLast, posCur, len,
 265                             WORDBREAK_NOBREAK, get_next_char);
 266             }
 267             /* No rule found, reset */
 268             else
 269             {
 270                 set_brks_to(s, brks, posLast, posCur, len,
 271                             WORDBREAK_BREAK, get_next_char);
 272             }
 273             wbcSeqStart = wbcCur;
 274             posLast = posCur;
 275             break;
 276
 277         case WBP_MidNumLet:
 278             if ((wbcLast == WBP_ALetter) || /* WB6,7 */
 279                     (wbcLast == WBP_Numeric)) /* WB11,12 */
 280             {
 281                 /* Go on */
 282             }
 283             else
 284             {
 285                 set_brks_to(s, brks, posLast, posCur, len,
 286                             WORDBREAK_BREAK, get_next_char);
 287                 wbcSeqStart = wbcCur;
 288                 posLast = posCur;
 289             }
 290             break;
 291
 292         case WBP_MidLetter:
 293             if (wbcLast == WBP_ALetter) /* WB6,7 */
 294             {
 295                 /* Go on */
 296             }
 297             else
 298             {
 299                 set_brks_to(s, brks, posLast, posCur, len,
 300                             WORDBREAK_BREAK, get_next_char);
 301                 wbcSeqStart = wbcCur;
 302                 posLast = posCur;
 303             }
 304             break;
 305
 306         case WBP_MidNum:
 307             if (wbcLast == WBP_Numeric) /* WB11,12 */
 308             {
 309                 /* Go on */
 310             }
 311             else
 312             {
 313                 set_brks_to(s, brks, posLast, posCur, len,
 314                             WORDBREAK_BREAK, get_next_char);
 315                 wbcSeqStart = wbcCur;
 316                 posLast = posCur;
 317             }
 318             break;
 319
 320         case WBP_Numeric:
 321             if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
 322                     (wbcLast == WBP_ALetter) || /* WB9 */
 323                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 324             {
 325                 set_brks_to(s, brks, posLast, posCur, len,
 326                             WORDBREAK_NOBREAK, get_next_char);
 327             }
 328             /* No rule found, reset */
 329             else
 330             {
 331                 set_brks_to(s, brks, posLast, posCur, len,
 332                             WORDBREAK_BREAK, get_next_char);
 333             }
 334             wbcSeqStart = wbcCur;
 335             posLast = posCur;
 336             break;
 337
 338         case WBP_ExtendNumLet:
 339             /* WB13a,13b */
 340             if ((wbcSeqStart == wbcLast) &&
 341                 ((wbcLast == WBP_ALetter) ||
 342                  (wbcLast == WBP_Numeric) ||
 343                  (wbcLast == WBP_Katakana) ||
 344                  (wbcLast == WBP_ExtendNumLet)))
 345             {
 346                 set_brks_to(s, brks, posLast, posCur, len,
 347                             WORDBREAK_NOBREAK, get_next_char);
 348             }
 349             /* No rule found, reset */
 350             else
 351             {
 352                 set_brks_to(s, brks, posLast, posCur, len,
 353                             WORDBREAK_BREAK, get_next_char);
 354             }
 355             wbcSeqStart = wbcCur;
 356             posLast = posCur;
 357             break;
 358
 359         case WBP_Regional:
 360             /* WB13c */
 361             if (wbcSeqStart == WBP_Regional)
 362             {
 363                 set_brks_to(s, brks, posLast, posCur, len,
 364                             WORDBREAK_NOBREAK, get_next_char);
 365             }
 366             wbcSeqStart = wbcCur;
 367             posLast = posCur;
 368             break;
 369
 370         case WBP_Any:
 371             /* Allow breaks and reset */
 372             set_brks_to(s, brks, posLast, posCur, len,
 373                         WORDBREAK_BREAK, get_next_char);
 374             wbcSeqStart = wbcCur;
 375             posLast = posCur;
 376             break;
 377
 378         default:
 379             /* Error, should never get here! */
 380             assert(0);
 381             break;
 382         }
 383
 384         wbcLast = wbcCur;
 385         posCur = posNext;
 386         ch = get_next_char(s, len, &posNext);
 387     }
 388
 389     /* WB2 */
 390     set_brks_to(s, brks, posLast, posNext, len,
 391                 WORDBREAK_BREAK, get_next_char);
 392 }
 393
 394 /**
 395  * Sets the word breaking information for a UTF-8 input string.
 396  *
 397  * @param[in]  s     input UTF-8 string
 398  * @param[in]  len   length of the input
 399  * @param[in]  lang  language of the input
 400  * @param[out] brks  pointer to the output breaking data, containing
 401  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 402  *                   #WORDBREAK_INSIDEACHAR
 403  */
 404 void set_wordbreaks_utf8(
 405         const utf8_t *s,
 406         size_t len,
 407         const char *lang,
 408         char *brks)
 409 {
 410     set_wordbreaks(s, len, lang, brks,
 411                    (get_next_char_t)lb_get_next_char_utf8);
 412 }
 413
 414 /**
 415  * Sets the word breaking information for a UTF-16 input string.
 416  *
 417  * @param[in]  s     input UTF-16 string
 418  * @param[in]  len   length of the input
 419  * @param[in]  lang  language of the input
 420  * @param[out] brks  pointer to the output breaking data, containing
 421  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 422  *                   #WORDBREAK_INSIDEACHAR
 423  */
 424 void set_wordbreaks_utf16(
 425         const utf16_t *s,
 426         size_t len,
 427         const char *lang,
 428         char *brks)
 429 {
 430     set_wordbreaks(s, len, lang, brks,
 431                    (get_next_char_t)lb_get_next_char_utf16);
 432 }
 433
 434 /**
 435  * Sets the word breaking information for a UTF-32 input string.
 436  *
 437  * @param[in]  s     input UTF-32 string
 438  * @param[in]  len   length of the input
 439  * @param[in]  lang  language of the input
 440  * @param[out] brks  pointer to the output breaking data, containing
 441  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 442  *                   #WORDBREAK_INSIDEACHAR
 443  */
 444 void set_wordbreaks_utf32(
 445         const utf32_t *s,
 446         size_t len,
 447         const char *lang,
 448         char *brks)
 449 {
 450     set_wordbreaks(s, len, lang, brks,
 451                    (get_next_char_t)lb_get_next_char_utf32);
 452 }