third-party/libunibreak/wordbreak.c

   1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
   2
   3 /*
   4  * Word breaking in a Unicode sequence.  Designed to be used in a
   5  * generic text renderer.
   6  *
   7  * Copyright (C) 2013-2015 Tom Hacohen <tom at stosb dot com>
   8  *
   9  * This software is provided 'as-is', without any express or implied
  10  * warranty.  In no event will the author be held liable for any damages
  11  * arising from the use of this software.
  12  *
  13  * Permission is granted to anyone to use this software for any purpose,
  14  * including commercial applications, and to alter it and redistribute
  15  * it freely, subject to the following restrictions:
  16  *
  17  * 1. The origin of this software must not be misrepresented; you must
  18  *    not claim that you wrote the original software.  If you use this
  19  *    software in a product, an acknowledgement in the product
  20  *    documentation would be appreciated but is not required.
  21  * 2. Altered source versions must be plainly marked as such, and must
  22  *    not be misrepresented as being the original software.
  23  * 3. This notice may not be removed or altered from any source
  24  *    distribution.
  25  *
  26  * The main reference is Unicode Standard Annex 29 (UAX #29):
  27  *      <URL:http://unicode.org/reports/tr29>
  28  *
  29  * When this library was designed, this annex was at Revision 17, for
  30  * Unicode 6.0.0:
  31  *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
  32  *
  33  * This library has been updated according to Revision 25, for
  34  * Unicode 7.0.0:
  35  *      <URL:http://www.unicode.org/reports/tr29/tr29-25.html>
  36  *
  37  * The Unicode Terms of Use are available at
  38  *      <URL:http://www.unicode.org/copyright.html>
  39  */
  40
  41 /**
  42  * @file    wordbreak.c
  43  *
  44  * Implementation of the word breaking algorithm as described in Unicode
  45  * Standard Annex 29.
  46  *
  47  * @version 2.6, 2015/04/18
  48  * @author  Tom Hacohen
  49  */
  50
  51 #include <assert.h>
  52 #include <stddef.h>
  53 #include <string.h>
  54 #include "unibreakdef.h"
  55 #include "wordbreak.h"
  56 #include "wordbreakdata.c"
  57
  58 #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
  59
  60 /**
  61  * Initializes the wordbreak internals.  It currently does nothing, but
  62  * it may in the future.
  63  */
  64 void init_wordbreak(void)
  65 {
  66 }
  67
  68 /**
  69  * Gets the word breaking class of a character.
  70  *
  71  * @param ch   character to check
  72  * @param wbp  pointer to the wbp breaking properties array
  73  * @param len  size of the wbp array in number of items
  74  * @return     the word breaking class if found; \c WBP_Any otherwise
  75  */
  76 static enum WordBreakClass get_char_wb_class(
  77         utf32_t ch,
  78         struct WordBreakProperties *wbp,
  79         size_t len)
  80 {
  81     int min = 0;
  82     int max = len - 1;
  83     int mid;
  84
  85     do
  86     {
  87         mid = (min + max) / 2;
  88
  89         if (ch < wbp[mid].start)
  90             max = mid - 1;
  91         else if (ch > wbp[mid].end)
  92             min = mid + 1;
  93         else
  94             return wbp[mid].prop;
  95     }
  96     while (min <= max);
  97
  98     return WBP_Any;
  99 }
 100
 101 /**
 102  * Sets the word break types to a specific value in a range.
 103  *
 104  * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
 105  * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
 106  * cells that we really don't want to break after.
 107  *
 108  * @param[in]  s             input string
 109  * @param[out] brks          breaks array to fill
 110  * @param[in]  posStart      start position
 111  * @param[in]  posEnd        end position (exclusive)
 112  * @param[in]  len           length of the string
 113  * @param[in]  brkType       breaks type to use
 114  * @param[in] get_next_char  function to get the next UTF-32 character
 115  */
 116 static void set_brks_to(
 117         const void *s,
 118         char *brks,
 119         size_t posStart,
 120         size_t posEnd,
 121         size_t len,
 122         char brkType,
 123         get_next_char_t get_next_char)
 124 {
 125     size_t posNext = posStart;
 126     while (posNext < posEnd)
 127     {
 128         utf32_t ch;
 129         ch = get_next_char(s, len, &posNext);
 130         assert(ch != EOS);
 131         for (; posStart < posNext - 1; ++posStart)
 132             brks[posStart] = WORDBREAK_INSIDEACHAR;
 133         assert(posStart == posNext - 1);
 134
 135         /* Only set it if we haven't set it not to break before. */
 136         if (brks[posStart] != WORDBREAK_NOBREAK)
 137             brks[posStart] = brkType;
 138         posStart = posNext;
 139     }
 140 }
 141
 142 /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
 143 #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
 144                        (cls == WBP_LF))
 145
 146 /**
 147  * Sets the word breaking information for a generic input string.
 148  *
 149  * @param[in]  s             input string
 150  * @param[in]  len           length of the input
 151  * @param[in]  lang          language of the input
 152  * @param[out] brks          pointer to the output breaking data, containing
 153  *                           #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 154  *                           #WORDBREAK_INSIDEACHAR
 155  * @param[in] get_next_char  function to get the next UTF-32 character
 156  */
 157 static void set_wordbreaks(
 158         const void *s,
 159         size_t len,
 160         const char *lang,
 161         char *brks,
 162         get_next_char_t get_next_char)
 163 {
 164     enum WordBreakClass wbcLast = WBP_Undefined;
 165     /* wbcSeqStart is the class that started the current sequence.
 166      * WBP_Undefined is a special case that means "sot".
 167      * This value is the class that is at the start of the current rule
 168      * matching sequence. For example, in case of Numeric+MidNum+Numeric
 169      * it'll be Numeric all the way.
 170      */
 171     enum WordBreakClass wbcSeqStart = WBP_Undefined;
 172     utf32_t ch;
 173     size_t posNext = 0;
 174     size_t posCur = 0;
 175     size_t posLast = 0;
 176
 177     /* TODO: Language-specific specialization. */
 178     (void) lang;
 179
 180     /* Init brks. */
 181     memset(brks, WORDBREAK_BREAK, len);
 182
 183     ch = get_next_char(s, len, &posNext);
 184
 185     while (ch != EOS)
 186     {
 187         enum WordBreakClass wbcCur;
 188         wbcCur = get_char_wb_class(ch, wb_prop_default,
 189                                    ARRAY_LEN(wb_prop_default));
 190
 191         switch (wbcCur)
 192         {
 193         case WBP_CR:
 194             /* WB3b */
 195             set_brks_to(s, brks, posLast, posCur, len,
 196                         WORDBREAK_BREAK, get_next_char);
 197             wbcSeqStart = wbcCur;
 198             posLast = posCur;
 199             break;
 200
 201         case WBP_LF:
 202             if (wbcSeqStart == WBP_CR) /* WB3 */
 203             {
 204                 set_brks_to(s, brks, posLast, posCur, len,
 205                             WORDBREAK_NOBREAK, get_next_char);
 206                 wbcSeqStart = wbcCur;
 207                 posLast = posCur;
 208                 break;
 209             }
 210             /* Fall off */
 211
 212         case WBP_Newline:
 213             /* WB3a,3b */
 214             set_brks_to(s, brks, posLast, posCur, len,
 215                         WORDBREAK_BREAK, get_next_char);
 216             wbcSeqStart = wbcCur;
 217             posLast = posCur;
 218             break;
 219
 220         case WBP_Extend:
 221         case WBP_Format:
 222             /* WB4 - If not the first char/after a newline (WB3a,3b), skip
 223              * this class, set it to be the same as the prev, and mark
 224              * brks not to break before them. */
 225             if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
 226             {
 227                 set_brks_to(s, brks, posLast, posCur, len,
 228                             WORDBREAK_BREAK, get_next_char);
 229                 wbcSeqStart = wbcCur;
 230             }
 231             else
 232             {
 233                 /* It's surely not the first */
 234                 brks[posCur - 1] = WORDBREAK_NOBREAK;
 235                 /* "inherit" the previous class. */
 236                 wbcCur = wbcLast;
 237             }
 238             break;
 239
 240         case WBP_Katakana:
 241             if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
 242                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 243             {
 244                 set_brks_to(s, brks, posLast, posCur, len,
 245                             WORDBREAK_NOBREAK, get_next_char);
 246             }
 247             /* No rule found, reset */
 248             else
 249             {
 250                 set_brks_to(s, brks, posLast, posCur, len,
 251                             WORDBREAK_BREAK, get_next_char);
 252             }
 253             wbcSeqStart = wbcCur;
 254             posLast = posCur;
 255             break;
 256
 257         case WBP_Hebrew_Letter:
 258         case WBP_ALetter:
 259             if ((wbcSeqStart == WBP_Hebrew_Letter) &&
 260                     (wbcLast == WBP_Double_Quote)) /* WB7b,c */
 261             {
 262                if (wbcCur == WBP_Hebrew_Letter)
 263                  {
 264                      set_brks_to(s, brks, posLast, posCur, len,
 265                              WORDBREAK_NOBREAK, get_next_char);
 266                  }
 267                else
 268                  {
 269                      set_brks_to(s, brks, posLast, posCur, len,
 270                              WORDBREAK_BREAK, get_next_char);
 271                  }
 272             }
 273             else if (((wbcSeqStart == WBP_ALetter) ||
 274                         (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
 275                     (wbcLast == WBP_Numeric) || /* WB10 */
 276                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 277             {
 278                 set_brks_to(s, brks, posLast, posCur, len,
 279                             WORDBREAK_NOBREAK, get_next_char);
 280             }
 281             /* No rule found, reset */
 282             else
 283             {
 284                 set_brks_to(s, brks, posLast, posCur, len,
 285                             WORDBREAK_BREAK, get_next_char);
 286             }
 287             wbcSeqStart = wbcCur;
 288             posLast = posCur;
 289             break;
 290
 291         case WBP_Single_Quote:
 292             if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
 293             {
 294                 set_brks_to(s, brks, posLast, posCur, len,
 295                             WORDBREAK_NOBREAK, get_next_char);
 296                 wbcSeqStart = wbcCur;
 297                 posLast = posCur;
 298             }
 299             /* No break on purpose */
 300         case WBP_MidNumLet:
 301             if (((wbcLast == WBP_ALetter) ||
 302                         (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
 303                     (wbcLast == WBP_Numeric)) /* WB11,12 */
 304             {
 305                 /* Go on */
 306             }
 307             else
 308             {
 309                 set_brks_to(s, brks, posLast, posCur, len,
 310                             WORDBREAK_BREAK, get_next_char);
 311                 wbcSeqStart = wbcCur;
 312                 posLast = posCur;
 313             }
 314             break;
 315
 316         case WBP_MidLetter:
 317             if ((wbcLast == WBP_ALetter) ||
 318                     (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
 319             {
 320                 /* Go on */
 321             }
 322             else
 323             {
 324                 set_brks_to(s, brks, posLast, posCur, len,
 325                             WORDBREAK_BREAK, get_next_char);
 326                 wbcSeqStart = wbcCur;
 327                 posLast = posCur;
 328             }
 329             break;
 330
 331         case WBP_MidNum:
 332             if (wbcLast == WBP_Numeric) /* WB11,12 */
 333             {
 334                 /* Go on */
 335             }
 336             else
 337             {
 338                 set_brks_to(s, brks, posLast, posCur, len,
 339                             WORDBREAK_BREAK, get_next_char);
 340                 wbcSeqStart = wbcCur;
 341                 posLast = posCur;
 342             }
 343             break;
 344
 345         case WBP_Numeric:
 346             if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
 347                     ((wbcLast == WBP_ALetter) ||
 348                      (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
 349                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
 350             {
 351                 set_brks_to(s, brks, posLast, posCur, len,
 352                             WORDBREAK_NOBREAK, get_next_char);
 353             }
 354             /* No rule found, reset */
 355             else
 356             {
 357                 set_brks_to(s, brks, posLast, posCur, len,
 358                             WORDBREAK_BREAK, get_next_char);
 359             }
 360             wbcSeqStart = wbcCur;
 361             posLast = posCur;
 362             break;
 363
 364         case WBP_ExtendNumLet:
 365             /* WB13a,13b */
 366             if ((wbcSeqStart == wbcLast) &&
 367                 ((wbcLast == WBP_ALetter) ||
 368                  (wbcLast == WBP_Hebrew_Letter) ||
 369                  (wbcLast == WBP_Numeric) ||
 370                  (wbcLast == WBP_Katakana) ||
 371                  (wbcLast == WBP_ExtendNumLet)))
 372             {
 373                 set_brks_to(s, brks, posLast, posCur, len,
 374                             WORDBREAK_NOBREAK, get_next_char);
 375             }
 376             /* No rule found, reset */
 377             else
 378             {
 379                 set_brks_to(s, brks, posLast, posCur, len,
 380                             WORDBREAK_BREAK, get_next_char);
 381             }
 382             wbcSeqStart = wbcCur;
 383             posLast = posCur;
 384             break;
 385
 386         case WBP_Regional_Indicator:
 387             /* WB13c */
 388             if (wbcSeqStart == WBP_Regional_Indicator)
 389             {
 390                 set_brks_to(s, brks, posLast, posCur, len,
 391                             WORDBREAK_NOBREAK, get_next_char);
 392             }
 393             wbcSeqStart = wbcCur;
 394             posLast = posCur;
 395             break;
 396
 397         case WBP_Double_Quote:
 398             if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
 399             {
 400                /* Go on */
 401             }
 402             else
 403             {
 404                 set_brks_to(s, brks, posLast, posCur, len,
 405                             WORDBREAK_BREAK, get_next_char);
 406                 wbcSeqStart = wbcCur;
 407                 posLast = posCur;
 408             }
 409             break;
 410
 411         case WBP_Any:
 412             /* Allow breaks and reset */
 413             set_brks_to(s, brks, posLast, posCur, len,
 414                         WORDBREAK_BREAK, get_next_char);
 415             wbcSeqStart = wbcCur;
 416             posLast = posCur;
 417             break;
 418
 419         default:
 420             /* Error, should never get here! */
 421             assert(0);
 422             break;
 423         }
 424
 425         wbcLast = wbcCur;
 426         posCur = posNext;
 427         ch = get_next_char(s, len, &posNext);
 428     }
 429
 430     /* WB2 */
 431     set_brks_to(s, brks, posLast, posNext, len,
 432                 WORDBREAK_BREAK, get_next_char);
 433 }
 434
 435 /**
 436  * Sets the word breaking information for a UTF-8 input string.
 437  *
 438  * @param[in]  s     input UTF-8 string
 439  * @param[in]  len   length of the input
 440  * @param[in]  lang  language of the input
 441  * @param[out] brks  pointer to the output breaking data, containing
 442  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 443  *                   #WORDBREAK_INSIDEACHAR
 444  */
 445 void set_wordbreaks_utf8(
 446         const utf8_t *s,
 447         size_t len,
 448         const char *lang,
 449         char *brks)
 450 {
 451     set_wordbreaks(s, len, lang, brks,
 452                    (get_next_char_t)ub_get_next_char_utf8);
 453 }
 454
 455 /**
 456  * Sets the word breaking information for a UTF-16 input string.
 457  *
 458  * @param[in]  s     input UTF-16 string
 459  * @param[in]  len   length of the input
 460  * @param[in]  lang  language of the input
 461  * @param[out] brks  pointer to the output breaking data, containing
 462  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 463  *                   #WORDBREAK_INSIDEACHAR
 464  */
 465 void set_wordbreaks_utf16(
 466         const utf16_t *s,
 467         size_t len,
 468         const char *lang,
 469         char *brks)
 470 {
 471     set_wordbreaks(s, len, lang, brks,
 472                    (get_next_char_t)ub_get_next_char_utf16);
 473 }
 474
 475 /**
 476  * Sets the word breaking information for a UTF-32 input string.
 477  *
 478  * @param[in]  s     input UTF-32 string
 479  * @param[in]  len   length of the input
 480  * @param[in]  lang  language of the input
 481  * @param[out] brks  pointer to the output breaking data, containing
 482  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 483  *                   #WORDBREAK_INSIDEACHAR
 484  */
 485 void set_wordbreaks_utf32(
 486         const utf32_t *s,
 487         size_t len,
 488         const char *lang,
 489         char *brks)
 490 {
 491     set_wordbreaks(s, len, lang, brks,
 492                    (get_next_char_t)ub_get_next_char_utf32);
 493 }