posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 #ifdef RE_ENABLE_I18N
  26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
  27                                  wint_t *last_wc) internal_function;
  28 #endif /* RE_ENABLE_I18N */
  29 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
  30                                               const re_node_set *nodes,
  31                                               unsigned int hash) internal_function;
  32 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
  33                                      unsigned int hash) internal_function;
  34 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
  35                                           const re_node_set *nodes,
  36                                           unsigned int hash) internal_function;
  37 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
  38                                           const re_node_set *nodes,
  39                                           unsigned int context,
  40                                           unsigned int hash) internal_function;
  41 static unsigned int inline calc_state_hash (const re_node_set *nodes,
  42                                             unsigned int context) internal_function;
  43 \f
  44 /* Functions for string operation.  */
  45
  46 /* This function allocate the buffers.  It is necessary to call
  47    re_string_reconstruct before using the object.  */
  48
  49 static reg_errcode_t
  50 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
  51      re_string_t *pstr;
  52      const char *str;
  53      int len, init_len, icase;
  54      RE_TRANSLATE_TYPE trans;
  55      const re_dfa_t *dfa;
  56 {
  57   reg_errcode_t ret;
  58   int init_buf_len;
  59
  60   /* Ensure at least one character fits into the buffers.  */
  61   if (init_len < dfa->mb_cur_max)
  62     init_len = dfa->mb_cur_max;
  63   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  64   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  65
  66   ret = re_string_realloc_buffers (pstr, init_buf_len);
  67   if (BE (ret != REG_NOERROR, 0))
  68     return ret;
  69
  70   pstr->word_char = dfa->word_char;
  71   pstr->word_ops_used = dfa->word_ops_used;
  72   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  73   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  74   pstr->valid_raw_len = pstr->valid_len;
  75   return REG_NOERROR;
  76 }
  77
  78 /* This function allocate the buffers, and initialize them.  */
  79
  80 static reg_errcode_t
  81 re_string_construct (pstr, str, len, trans, icase, dfa)
  82      re_string_t *pstr;
  83      const char *str;
  84      int len, icase;
  85      RE_TRANSLATE_TYPE trans;
  86      const re_dfa_t *dfa;
  87 {
  88   reg_errcode_t ret;
  89   memset (pstr, '\0', sizeof (re_string_t));
  90   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  91
  92   if (len > 0)
  93     {
  94       ret = re_string_realloc_buffers (pstr, len + 1);
  95       if (BE (ret != REG_NOERROR, 0))
  96         return ret;
  97     }
  98   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  99
 100   if (icase)
 101     {
 102 #ifdef RE_ENABLE_I18N
 103       if (dfa->mb_cur_max > 1)
 104         {
 105           while (1)
 106             {
 107               ret = build_wcs_upper_buffer (pstr);
 108               if (BE (ret != REG_NOERROR, 0))
 109                 return ret;
 110               if (pstr->valid_raw_len >= len)
 111                 break;
 112               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 113                 break;
 114               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 115               if (BE (ret != REG_NOERROR, 0))
 116                 return ret;
 117             }
 118         }
 119       else
 120 #endif /* RE_ENABLE_I18N  */
 121         build_upper_buffer (pstr);
 122     }
 123   else
 124     {
 125 #ifdef RE_ENABLE_I18N
 126       if (dfa->mb_cur_max > 1)
 127         build_wcs_buffer (pstr);
 128       else
 129 #endif /* RE_ENABLE_I18N  */
 130         {
 131           if (trans != NULL)
 132             re_string_translate_buffer (pstr);
 133           else
 134             {
 135               pstr->valid_len = pstr->bufs_len;
 136               pstr->valid_raw_len = pstr->bufs_len;
 137             }
 138         }
 139     }
 140
 141   return REG_NOERROR;
 142 }
 143
 144 /* Helper functions for re_string_allocate, and re_string_construct.  */
 145
 146 static reg_errcode_t
 147 re_string_realloc_buffers (pstr, new_buf_len)
 148      re_string_t *pstr;
 149      int new_buf_len;
 150 {
 151 #ifdef RE_ENABLE_I18N
 152   if (pstr->mb_cur_max > 1)
 153     {
 154       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
 155       if (BE (new_array == NULL, 0))
 156         return REG_ESPACE;
 157       pstr->wcs = new_array;
 158       if (pstr->offsets != NULL)
 159         {
 160           int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
 161           if (BE (new_array == NULL, 0))
 162             return REG_ESPACE;
 163           pstr->offsets = new_array;
 164         }
 165     }
 166 #endif /* RE_ENABLE_I18N  */
 167   if (pstr->mbs_allocated)
 168     {
 169       unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
 170                                              new_buf_len);
 171       if (BE (new_array == NULL, 0))
 172         return REG_ESPACE;
 173       pstr->mbs = new_array;
 174     }
 175   pstr->bufs_len = new_buf_len;
 176   return REG_NOERROR;
 177 }
 178
 179
 180 static void
 181 re_string_construct_common (str, len, pstr, trans, icase, dfa)
 182      const char *str;
 183      int len;
 184      re_string_t *pstr;
 185      RE_TRANSLATE_TYPE trans;
 186      int icase;
 187      const re_dfa_t *dfa;
 188 {
 189   pstr->raw_mbs = (const unsigned char *) str;
 190   pstr->len = len;
 191   pstr->raw_len = len;
 192   pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
 193   pstr->icase = icase ? 1 : 0;
 194   pstr->mbs_allocated = (trans != NULL || icase);
 195   pstr->mb_cur_max = dfa->mb_cur_max;
 196   pstr->is_utf8 = dfa->is_utf8;
 197   pstr->map_notascii = dfa->map_notascii;
 198   pstr->stop = pstr->len;
 199   pstr->raw_stop = pstr->stop;
 200 }
 201
 202 #ifdef RE_ENABLE_I18N
 203
 204 /* Build wide character buffer PSTR->WCS.
 205    If the byte sequence of the string are:
 206      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 207    Then wide character buffer will be:
 208      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 209    We use WEOF for padding, they indicate that the position isn't
 210    a first byte of a multibyte character.
 211
 212    Note that this function assumes PSTR->VALID_LEN elements are already
 213    built and starts from PSTR->VALID_LEN.  */
 214
 215 static void
 216 build_wcs_buffer (pstr)
 217      re_string_t *pstr;
 218 {
 219 #ifdef _LIBC
 220   unsigned char buf[pstr->mb_cur_max];
 221 #else
 222   unsigned char buf[64];
 223 #endif
 224   mbstate_t prev_st;
 225   int byte_idx, end_idx, mbclen, remain_len;
 226
 227   /* Build the buffers from pstr->valid_len to either pstr->len or
 228      pstr->bufs_len.  */
 229   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 230   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 231     {
 232       wchar_t wc;
 233       const char *p;
 234
 235       remain_len = end_idx - byte_idx;
 236       prev_st = pstr->cur_state;
 237       /* Apply the translation if we need.  */
 238       if (BE (pstr->trans != NULL, 0))
 239         {
 240           int i, ch;
 241
 242           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 243             {
 244               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 245               buf[i] = pstr->trans[ch];
 246             }
 247           p = (const char *) buf;
 248         }
 249       else
 250         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 251       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 252       if (BE (mbclen == (size_t) -2, 0))
 253         {
 254           /* The buffer doesn't have enough space, finish to build.  */
 255           pstr->cur_state = prev_st;
 256           break;
 257         }
 258       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 259         {
 260           /* We treat these cases as a singlebyte character.  */
 261           mbclen = 1;
 262           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 263           if (BE (pstr->trans != NULL, 0))
 264             wc = pstr->trans[wc];
 265           pstr->cur_state = prev_st;
 266         }
 267
 268       /* Write wide character and padding.  */
 269       pstr->wcs[byte_idx++] = wc;
 270       /* Write paddings.  */
 271       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 272         pstr->wcs[byte_idx++] = WEOF;
 273     }
 274   pstr->valid_len = byte_idx;
 275   pstr->valid_raw_len = byte_idx;
 276 }
 277
 278 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 279    but for REG_ICASE.  */
 280
 281 static int
 282 build_wcs_upper_buffer (pstr)
 283      re_string_t *pstr;
 284 {
 285   mbstate_t prev_st;
 286   int src_idx, byte_idx, end_idx, mbclen, remain_len;
 287 #ifdef _LIBC
 288   unsigned char buf[pstr->mb_cur_max];
 289 #else
 290   unsigned char buf[64];
 291 #endif
 292
 293   byte_idx = pstr->valid_len;
 294   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 295
 296 #ifdef _LIBC
 297   /* The following optimization assumes that the wchar_t encoding is
 298      always ISO 10646.  */
 299   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 300     {
 301       while (byte_idx < end_idx)
 302         {
 303           wchar_t wc;
 304
 305           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 306               && mbsinit (&pstr->cur_state))
 307             {
 308               /* In case of a singlebyte character.  */
 309               pstr->mbs[byte_idx]
 310                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 311               /* The next step uses the assumption that wchar_t is encoded
 312                  with ISO 10646: all ASCII values can be converted like
 313                  this.  */
 314               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 315               ++byte_idx;
 316               continue;
 317             }
 318
 319           remain_len = end_idx - byte_idx;
 320           prev_st = pstr->cur_state;
 321           mbclen = mbrtowc (&wc,
 322                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 323                              + byte_idx), remain_len, &pstr->cur_state);
 324           if (BE (mbclen > 0, 1))
 325             {
 326               wchar_t wcu = wc;
 327               if (iswlower (wc))
 328                 {
 329                   int mbcdlen;
 330
 331                   wcu = towupper (wc);
 332                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 333                   if (BE (mbclen == mbcdlen, 1))
 334                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 335                   else
 336                     {
 337                       src_idx = byte_idx;
 338                       goto offsets_needed;
 339                     }
 340                 }
 341               else
 342                 memcpy (pstr->mbs + byte_idx,
 343                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 344               pstr->wcs[byte_idx++] = wcu;
 345               /* Write paddings.  */
 346               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 347                 pstr->wcs[byte_idx++] = WEOF;
 348             }
 349           else if (mbclen == (size_t) -1 || mbclen == 0)
 350             {
 351               /* It is an invalid character or '\0'.  Just use the byte.  */
 352               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 353               pstr->mbs[byte_idx] = ch;
 354               /* And also cast it to wide char.  */
 355               pstr->wcs[byte_idx++] = (wchar_t) ch;
 356               if (BE (mbclen == (size_t) -1, 0))
 357                 pstr->cur_state = prev_st;
 358             }
 359           else
 360             {
 361               /* The buffer doesn't have enough space, finish to build.  */
 362               pstr->cur_state = prev_st;
 363               break;
 364             }
 365         }
 366       pstr->valid_len = byte_idx;
 367       pstr->valid_raw_len = byte_idx;
 368       return REG_NOERROR;
 369     }
 370   else
 371 #endif
 372     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 373       {
 374         wchar_t wc;
 375         const char *p;
 376 #ifdef _LIBC
 377 offsets_needed:
 378 #endif
 379         remain_len = end_idx - byte_idx;
 380         prev_st = pstr->cur_state;
 381         if (BE (pstr->trans != NULL, 0))
 382           {
 383             int i, ch;
 384
 385             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 386               {
 387                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 388                 buf[i] = pstr->trans[ch];
 389               }
 390             p = (const char *) buf;
 391           }
 392         else
 393           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 394         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 395         if (BE (mbclen > 0, 1))
 396           {
 397             wchar_t wcu = wc;
 398             if (iswlower (wc))
 399               {
 400                 int mbcdlen;
 401
 402                 wcu = towupper (wc);
 403                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 404                 if (BE (mbclen == mbcdlen, 1))
 405                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 406                 else
 407                   {
 408                     int i;
 409
 410                     if (byte_idx + mbcdlen > pstr->bufs_len)
 411                       {
 412                         pstr->cur_state = prev_st;
 413                         break;
 414                       }
 415
 416                     if (pstr->offsets == NULL)
 417                       {
 418                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 419
 420                         if (pstr->offsets == NULL)
 421                           return REG_ESPACE;
 422                       }
 423                     if (!pstr->offsets_needed)
 424                       {
 425                         for (i = 0; i < byte_idx; ++i)
 426                           pstr->offsets[i] = i;
 427                         pstr->offsets_needed = 1;
 428                       }
 429
 430                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 431                     pstr->wcs[byte_idx] = wcu;
 432                     pstr->offsets[byte_idx] = src_idx;
 433                     for (i = 1; i < mbcdlen; ++i)
 434                       {
 435                         pstr->offsets[byte_idx + i]
 436                           = src_idx + (i < mbclen ? i : mbclen - 1);
 437                         pstr->wcs[byte_idx + i] = WEOF;
 438                       }
 439                     pstr->len += mbcdlen - mbclen;
 440                     if (pstr->raw_stop > src_idx)
 441                       pstr->stop += mbcdlen - mbclen;
 442                     end_idx = (pstr->bufs_len > pstr->len)
 443                               ? pstr->len : pstr->bufs_len;
 444                     byte_idx += mbcdlen;
 445                     src_idx += mbclen;
 446                     continue;
 447                   }
 448               }
 449             else
 450               memcpy (pstr->mbs + byte_idx, p, mbclen);
 451
 452             if (BE (pstr->offsets_needed != 0, 0))
 453               {
 454                 int i;
 455                 for (i = 0; i < mbclen; ++i)
 456                   pstr->offsets[byte_idx + i] = src_idx + i;
 457               }
 458             src_idx += mbclen;
 459
 460             pstr->wcs[byte_idx++] = wcu;
 461             /* Write paddings.  */
 462             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 463               pstr->wcs[byte_idx++] = WEOF;
 464           }
 465         else if (mbclen == (size_t) -1 || mbclen == 0)
 466           {
 467             /* It is an invalid character or '\0'.  Just use the byte.  */
 468             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 469
 470             if (BE (pstr->trans != NULL, 0))
 471               ch = pstr->trans [ch];
 472             pstr->mbs[byte_idx] = ch;
 473
 474             if (BE (pstr->offsets_needed != 0, 0))
 475               pstr->offsets[byte_idx] = src_idx;
 476             ++src_idx;
 477
 478             /* And also cast it to wide char.  */
 479             pstr->wcs[byte_idx++] = (wchar_t) ch;
 480             if (BE (mbclen == (size_t) -1, 0))
 481               pstr->cur_state = prev_st;
 482           }
 483         else
 484           {
 485             /* The buffer doesn't have enough space, finish to build.  */
 486             pstr->cur_state = prev_st;
 487             break;
 488           }
 489       }
 490   pstr->valid_len = byte_idx;
 491   pstr->valid_raw_len = src_idx;
 492   return REG_NOERROR;
 493 }
 494
 495 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 496    Return the index.  */
 497
 498 static int
 499 re_string_skip_chars (pstr, new_raw_idx, last_wc)
 500      re_string_t *pstr;
 501      int new_raw_idx;
 502      wint_t *last_wc;
 503 {
 504   mbstate_t prev_st;
 505   int rawbuf_idx, mbclen;
 506   wchar_t wc = 0;
 507
 508   /* Skip the characters which are not necessary to check.  */
 509   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 510        rawbuf_idx < new_raw_idx;)
 511     {
 512       int remain_len;
 513       remain_len = pstr->len - rawbuf_idx;
 514       prev_st = pstr->cur_state;
 515       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 516                         remain_len, &pstr->cur_state);
 517       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 518         {
 519           /* We treat these cases as a singlebyte character.  */
 520           mbclen = 1;
 521           pstr->cur_state = prev_st;
 522         }
 523       /* Then proceed the next character.  */
 524       rawbuf_idx += mbclen;
 525     }
 526   *last_wc = (wint_t) wc;
 527   return rawbuf_idx;
 528 }
 529 #endif /* RE_ENABLE_I18N  */
 530
 531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 532    This function is used in case of REG_ICASE.  */
 533
 534 static void
 535 build_upper_buffer (pstr)
 536      re_string_t *pstr;
 537 {
 538   int char_idx, end_idx;
 539   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 540
 541   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 542     {
 543       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 544       if (BE (pstr->trans != NULL, 0))
 545         ch = pstr->trans[ch];
 546       if (islower (ch))
 547         pstr->mbs[char_idx] = toupper (ch);
 548       else
 549         pstr->mbs[char_idx] = ch;
 550     }
 551   pstr->valid_len = char_idx;
 552   pstr->valid_raw_len = char_idx;
 553 }
 554
 555 /* Apply TRANS to the buffer in PSTR.  */
 556
 557 static void
 558 re_string_translate_buffer (pstr)
 559      re_string_t *pstr;
 560 {
 561   int buf_idx, end_idx;
 562   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 563
 564   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 565     {
 566       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 567       pstr->mbs[buf_idx] = pstr->trans[ch];
 568     }
 569
 570   pstr->valid_len = buf_idx;
 571   pstr->valid_raw_len = buf_idx;
 572 }
 573
 574 /* This function re-construct the buffers.
 575    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 576    convert to upper case in case of REG_ICASE, apply translation.  */
 577
 578 static reg_errcode_t
 579 re_string_reconstruct (pstr, idx, eflags)
 580      re_string_t *pstr;
 581      int idx, eflags;
 582 {
 583   int offset = idx - pstr->raw_mbs_idx;
 584   if (offset < 0)
 585     {
 586       /* Reset buffer.  */
 587 #ifdef RE_ENABLE_I18N
 588       if (pstr->mb_cur_max > 1)
 589         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 590 #endif /* RE_ENABLE_I18N */
 591       pstr->len = pstr->raw_len;
 592       pstr->stop = pstr->raw_stop;
 593       pstr->valid_len = 0;
 594       pstr->raw_mbs_idx = 0;
 595       pstr->valid_raw_len = 0;
 596       pstr->offsets_needed = 0;
 597       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 598                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 599       if (!pstr->mbs_allocated)
 600         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 601       offset = idx;
 602     }
 603
 604   if (offset != 0)
 605     {
 606       /* Are the characters which are already checked remain?  */
 607       if (offset < pstr->valid_raw_len
 608 #ifdef RE_ENABLE_I18N
 609           /* Handling this would enlarge the code too much.
 610              Accept a slowdown in that case.  */
 611           && pstr->offsets_needed == 0
 612 #endif
 613          )
 614         {
 615           /* Yes, move them to the front of the buffer.  */
 616           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 617 #ifdef RE_ENABLE_I18N
 618           if (pstr->mb_cur_max > 1)
 619             memmove (pstr->wcs, pstr->wcs + offset,
 620                      (pstr->valid_len - offset) * sizeof (wint_t));
 621 #endif /* RE_ENABLE_I18N */
 622           if (pstr->mbs_allocated)
 623             memmove (pstr->mbs, pstr->mbs + offset,
 624                      pstr->valid_len - offset);
 625           pstr->valid_len -= offset;
 626           pstr->valid_raw_len -= offset;
 627 #if DEBUG
 628           assert (pstr->valid_len > 0);
 629 #endif
 630         }
 631       else
 632         {
 633           /* No, skip all characters until IDX.  */
 634 #ifdef RE_ENABLE_I18N
 635           if (BE (pstr->offsets_needed, 0))
 636             {
 637               pstr->len = pstr->raw_len - idx + offset;
 638               pstr->stop = pstr->raw_stop - idx + offset;
 639               pstr->offsets_needed = 0;
 640             }
 641 #endif
 642           pstr->valid_len = 0;
 643           pstr->valid_raw_len = 0;
 644 #ifdef RE_ENABLE_I18N
 645           if (pstr->mb_cur_max > 1)
 646             {
 647               int wcs_idx;
 648               wint_t wc = WEOF;
 649
 650 #ifdef _LIBC
 651               if (pstr->is_utf8)
 652                 {
 653                   const unsigned char *raw, *p, *q, *end;
 654
 655                   /* Special case UTF-8.  Multi-byte chars start with any
 656                      byte other than 0x80 - 0xbf.  */
 657                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 658                   end = raw + (offset - pstr->mb_cur_max);
 659                   for (p = raw + offset - 1; p >= end; --p)
 660                     if ((*p & 0xc0) != 0x80)
 661                       {
 662                         mbstate_t cur_state;
 663                         wchar_t wc2;
 664                         int mlen = raw + pstr->len - p;
 665                         unsigned char buf[6];
 666
 667                         q = p;
 668                         if (BE (pstr->trans != NULL, 0))
 669                           {
 670                             int i = mlen < 6 ? mlen : 6;
 671                             while (--i >= 0)
 672                               buf[i] = pstr->trans[p[i]];
 673                             q = buf;
 674                           }
 675                         /* XXX Don't use mbrtowc, we know which conversion
 676                            to use (UTF-8 -> UCS4).  */
 677                         memset (&cur_state, 0, sizeof (cur_state));
 678                         mlen = mbrtowc (&wc2, p, mlen, &cur_state)
 679                                - (raw + offset - p);
 680                         if (mlen >= 0)
 681                           {
 682                             memset (&pstr->cur_state, '\0',
 683                                     sizeof (mbstate_t));
 684                             pstr->valid_len = mlen;
 685                             wc = wc2;
 686                           }
 687                         break;
 688                       }
 689                 }
 690 #endif
 691               if (wc == WEOF)
 692                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 693               if (BE (pstr->valid_len, 0))
 694                 {
 695                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 696                     pstr->wcs[wcs_idx] = WEOF;
 697                   if (pstr->mbs_allocated)
 698                     memset (pstr->mbs, 255, pstr->valid_len);
 699                 }
 700               pstr->valid_raw_len = pstr->valid_len;
 701               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 702                                     && IS_WIDE_WORD_CHAR (wc))
 703                                    ? CONTEXT_WORD
 704                                    : ((IS_WIDE_NEWLINE (wc)
 705                                        && pstr->newline_anchor)
 706                                       ? CONTEXT_NEWLINE : 0));
 707             }
 708           else
 709 #endif /* RE_ENABLE_I18N */
 710             {
 711               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 712               if (pstr->trans)
 713                 c = pstr->trans[c];
 714               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 715                                    ? CONTEXT_WORD
 716                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 717                                       ? CONTEXT_NEWLINE : 0));
 718             }
 719         }
 720       if (!pstr->mbs_allocated)
 721         pstr->mbs += offset;
 722     }
 723   pstr->raw_mbs_idx = idx;
 724   pstr->len -= offset;
 725   pstr->stop -= offset;
 726
 727   /* Then build the buffers.  */
 728 #ifdef RE_ENABLE_I18N
 729   if (pstr->mb_cur_max > 1)
 730     {
 731       if (pstr->icase)
 732         {
 733           int ret = build_wcs_upper_buffer (pstr);
 734           if (BE (ret != REG_NOERROR, 0))
 735             return ret;
 736         }
 737       else
 738         build_wcs_buffer (pstr);
 739     }
 740   else
 741 #endif /* RE_ENABLE_I18N */
 742     {
 743       if (pstr->icase)
 744         build_upper_buffer (pstr);
 745       else if (pstr->trans != NULL)
 746         re_string_translate_buffer (pstr);
 747       else
 748         pstr->valid_len = pstr->len;
 749     }
 750   pstr->cur_idx = 0;
 751
 752   return REG_NOERROR;
 753 }
 754
 755 static unsigned char
 756 re_string_peek_byte_case (pstr, idx)
 757      const re_string_t *pstr;
 758      int idx;
 759 {
 760   int ch, off;
 761
 762   /* Handle the common (easiest) cases first.  */
 763   if (BE (!pstr->mbs_allocated, 1))
 764     return re_string_peek_byte (pstr, idx);
 765
 766 #ifdef RE_ENABLE_I18N
 767   if (pstr->mb_cur_max > 1
 768       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 769     return re_string_peek_byte (pstr, idx);
 770 #endif
 771
 772   off = pstr->cur_idx + idx;
 773 #ifdef RE_ENABLE_I18N
 774   if (pstr->offsets_needed)
 775     off = pstr->offsets[off];
 776 #endif
 777
 778   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 779
 780 #ifdef RE_ENABLE_I18N
 781   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 782      this function returns CAPITAL LETTER I instead of first byte of
 783      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 784      since peek_byte_case doesn't advance cur_idx in any way.  */
 785   if (pstr->offsets_needed && !isascii (ch))
 786     return re_string_peek_byte (pstr, idx);
 787 #endif
 788
 789   return ch;
 790 }
 791
 792 static unsigned char
 793 re_string_fetch_byte_case (pstr)
 794      re_string_t *pstr;
 795 {
 796   if (BE (!pstr->mbs_allocated, 1))
 797     return re_string_fetch_byte (pstr);
 798
 799 #ifdef RE_ENABLE_I18N
 800   if (pstr->offsets_needed)
 801     {
 802       int off, ch;
 803
 804       /* For tr_TR.UTF-8 [[:islower:]] there is
 805          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 806          in that case the whole multi-byte character and return
 807          the original letter.  On the other side, with
 808          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 809          anything else would complicate things too much.  */
 810
 811       if (!re_string_first_byte (pstr, pstr->cur_idx))
 812         return re_string_fetch_byte (pstr);
 813
 814       off = pstr->offsets[pstr->cur_idx];
 815       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 816
 817       if (! isascii (ch))
 818         return re_string_fetch_byte (pstr);
 819
 820       re_string_skip_bytes (pstr,
 821                             re_string_char_size_at (pstr, pstr->cur_idx));
 822       return ch;
 823     }
 824 #endif
 825
 826   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 827 }
 828
 829 static void
 830 re_string_destruct (pstr)
 831      re_string_t *pstr;
 832 {
 833 #ifdef RE_ENABLE_I18N
 834   re_free (pstr->wcs);
 835   re_free (pstr->offsets);
 836 #endif /* RE_ENABLE_I18N  */
 837   if (pstr->mbs_allocated)
 838     re_free (pstr->mbs);
 839 }
 840
 841 /* Return the context at IDX in INPUT.  */
 842
 843 static unsigned int
 844 re_string_context_at (input, idx, eflags)
 845      const re_string_t *input;
 846      int idx, eflags;
 847 {
 848   int c;
 849   if (idx < 0 || idx == input->len)
 850     {
 851       if (idx < 0)
 852         /* In this case, we use the value stored in input->tip_context,
 853            since we can't know the character in input->mbs[-1] here.  */
 854         return input->tip_context;
 855       else /* (idx == input->len) */
 856         return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 857                 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 858     }
 859 #ifdef RE_ENABLE_I18N
 860   if (input->mb_cur_max > 1)
 861     {
 862       wint_t wc;
 863       int wc_idx = idx;
 864       while(input->wcs[wc_idx] == WEOF)
 865         {
 866 #ifdef DEBUG
 867           /* It must not happen.  */
 868           assert (wc_idx >= 0);
 869 #endif
 870           --wc_idx;
 871           if (wc_idx < 0)
 872             return input->tip_context;
 873         }
 874       wc = input->wcs[wc_idx];
 875       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 876         return CONTEXT_WORD;
 877       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 878               ? CONTEXT_NEWLINE : 0);
 879     }
 880   else
 881 #endif
 882     {
 883       c = re_string_byte_at (input, idx);
 884       if (bitset_contain (input->word_char, c))
 885         return CONTEXT_WORD;
 886       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 887     }
 888 }
 889 \f
 890 /* Functions for set operation.  */
 891
 892 static reg_errcode_t
 893 re_node_set_alloc (set, size)
 894      re_node_set *set;
 895      int size;
 896 {
 897   set->alloc = size;
 898   set->nelem = 0;
 899   set->elems = re_malloc (int, size);
 900   if (BE (set->elems == NULL, 0))
 901     return REG_ESPACE;
 902   return REG_NOERROR;
 903 }
 904
 905 static reg_errcode_t
 906 re_node_set_init_1 (set, elem)
 907      re_node_set *set;
 908      int elem;
 909 {
 910   set->alloc = 1;
 911   set->nelem = 1;
 912   set->elems = re_malloc (int, 1);
 913   if (BE (set->elems == NULL, 0))
 914     {
 915       set->alloc = set->nelem = 0;
 916       return REG_ESPACE;
 917     }
 918   set->elems[0] = elem;
 919   return REG_NOERROR;
 920 }
 921
 922 static reg_errcode_t
 923 re_node_set_init_2 (set, elem1, elem2)
 924      re_node_set *set;
 925      int elem1, elem2;
 926 {
 927   set->alloc = 2;
 928   set->elems = re_malloc (int, 2);
 929   if (BE (set->elems == NULL, 0))
 930     return REG_ESPACE;
 931   if (elem1 == elem2)
 932     {
 933       set->nelem = 1;
 934       set->elems[0] = elem1;
 935     }
 936   else
 937     {
 938       set->nelem = 2;
 939       if (elem1 < elem2)
 940         {
 941           set->elems[0] = elem1;
 942           set->elems[1] = elem2;
 943         }
 944       else
 945         {
 946           set->elems[0] = elem2;
 947           set->elems[1] = elem1;
 948         }
 949     }
 950   return REG_NOERROR;
 951 }
 952
 953 static reg_errcode_t
 954 re_node_set_init_copy (dest, src)
 955      re_node_set *dest;
 956      const re_node_set *src;
 957 {
 958   dest->nelem = src->nelem;
 959   if (src->nelem > 0)
 960     {
 961       dest->alloc = dest->nelem;
 962       dest->elems = re_malloc (int, dest->alloc);
 963       if (BE (dest->elems == NULL, 0))
 964         {
 965           dest->alloc = dest->nelem = 0;
 966           return REG_ESPACE;
 967         }
 968       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 969     }
 970   else
 971     re_node_set_init_empty (dest);
 972   return REG_NOERROR;
 973 }
 974
 975 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 976    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 977    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 978
 979 static reg_errcode_t
 980 re_node_set_add_intersect (dest, src1, src2)
 981      re_node_set *dest;
 982      const re_node_set *src1, *src2;
 983 {
 984   int i1, i2, is, id, delta, sbase;
 985   if (src1->nelem == 0 || src2->nelem == 0)
 986     return REG_NOERROR;
 987
 988   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 989      conservative estimate.  */
 990   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 991     {
 992       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 993       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 994       if (BE (new_elems == NULL, 0))
 995         return REG_ESPACE;
 996       dest->elems = new_elems;
 997       dest->alloc = new_alloc;
 998     }
 999
1000   /* Find the items in the intersection of SRC1 and SRC2, and copy
1001      into the top of DEST those that are not already in DEST itself.  */
1002   sbase = dest->nelem + src1->nelem + src2->nelem;
1003   i1 = src1->nelem - 1;
1004   i2 = src2->nelem - 1;
1005   id = dest->nelem - 1;
1006   for (;;)
1007     {
1008       if (src1->elems[i1] == src2->elems[i2])
1009         {
1010           /* Try to find the item in DEST.  Maybe we could binary search?  */
1011           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1012             --id;
1013
1014           if (id < 0 || dest->elems[id] != src1->elems[i1])
1015             dest->elems[--sbase] = src1->elems[i1];
1016
1017           if (--i1 < 0 || --i2 < 0)
1018             break;
1019         }
1020
1021       /* Lower the highest of the two items.  */
1022       else if (src1->elems[i1] < src2->elems[i2])
1023         {
1024           if (--i2 < 0)
1025             break;
1026         }
1027       else
1028         {
1029           if (--i1 < 0)
1030             break;
1031         }
1032     }
1033
1034   id = dest->nelem - 1;
1035   is = dest->nelem + src1->nelem + src2->nelem - 1;
1036   delta = is - sbase + 1;
1037
1038   /* Now copy.  When DELTA becomes zero, the remaining
1039      DEST elements are already in place; this is more or
1040      less the same loop that is in re_node_set_merge.  */
1041   dest->nelem += delta;
1042   if (delta > 0 && id >= 0)
1043     for (;;)
1044       {
1045         if (dest->elems[is] > dest->elems[id])
1046           {
1047             /* Copy from the top.  */
1048             dest->elems[id + delta--] = dest->elems[is--];
1049             if (delta == 0)
1050               break;
1051           }
1052         else
1053           {
1054             /* Slide from the bottom.  */
1055             dest->elems[id + delta] = dest->elems[id];
1056             if (--id < 0)
1057               break;
1058           }
1059       }
1060
1061   /* Copy remaining SRC elements.  */
1062   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1063
1064   return REG_NOERROR;
1065 }
1066
1067 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1068    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1069
1070 static reg_errcode_t
1071 re_node_set_init_union (dest, src1, src2)
1072      re_node_set *dest;
1073      const re_node_set *src1, *src2;
1074 {
1075   int i1, i2, id;
1076   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1077     {
1078       dest->alloc = src1->nelem + src2->nelem;
1079       dest->elems = re_malloc (int, dest->alloc);
1080       if (BE (dest->elems == NULL, 0))
1081         return REG_ESPACE;
1082     }
1083   else
1084     {
1085       if (src1 != NULL && src1->nelem > 0)
1086         return re_node_set_init_copy (dest, src1);
1087       else if (src2 != NULL && src2->nelem > 0)
1088         return re_node_set_init_copy (dest, src2);
1089       else
1090         re_node_set_init_empty (dest);
1091       return REG_NOERROR;
1092     }
1093   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1094     {
1095       if (src1->elems[i1] > src2->elems[i2])
1096         {
1097           dest->elems[id++] = src2->elems[i2++];
1098           continue;
1099         }
1100       if (src1->elems[i1] == src2->elems[i2])
1101         ++i2;
1102       dest->elems[id++] = src1->elems[i1++];
1103     }
1104   if (i1 < src1->nelem)
1105     {
1106       memcpy (dest->elems + id, src1->elems + i1,
1107              (src1->nelem - i1) * sizeof (int));
1108       id += src1->nelem - i1;
1109     }
1110   else if (i2 < src2->nelem)
1111     {
1112       memcpy (dest->elems + id, src2->elems + i2,
1113              (src2->nelem - i2) * sizeof (int));
1114       id += src2->nelem - i2;
1115     }
1116   dest->nelem = id;
1117   return REG_NOERROR;
1118 }
1119
1120 /* Calculate the union set of the sets DEST and SRC. And store it to
1121    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1122
1123 static reg_errcode_t
1124 re_node_set_merge (dest, src)
1125      re_node_set *dest;
1126      const re_node_set *src;
1127 {
1128   int is, id, sbase, delta;
1129   if (src == NULL || src->nelem == 0)
1130     return REG_NOERROR;
1131   if (dest->alloc < 2 * src->nelem + dest->nelem)
1132     {
1133       int new_alloc = 2 * (src->nelem + dest->alloc);
1134       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1135       if (BE (new_buffer == NULL, 0))
1136         return REG_ESPACE;
1137       dest->elems = new_buffer;
1138       dest->alloc = new_alloc;
1139     }
1140
1141   if (BE (dest->nelem == 0, 0))
1142     {
1143       dest->nelem = src->nelem;
1144       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1145       return REG_NOERROR;
1146     }
1147
1148   /* Copy into the top of DEST the items of SRC that are not
1149      found in DEST.  Maybe we could binary search in DEST?  */
1150   for (sbase = dest->nelem + 2 * src->nelem,
1151        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1152     {
1153       if (dest->elems[id] == src->elems[is])
1154         is--, id--;
1155       else if (dest->elems[id] < src->elems[is])
1156         dest->elems[--sbase] = src->elems[is--];
1157       else /* if (dest->elems[id] > src->elems[is]) */
1158         --id;
1159     }
1160
1161   if (is >= 0)
1162     {
1163       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1164       sbase -= is + 1;
1165       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1166     }
1167
1168   id = dest->nelem - 1;
1169   is = dest->nelem + 2 * src->nelem - 1;
1170   delta = is - sbase + 1;
1171   if (delta == 0)
1172     return REG_NOERROR;
1173
1174   /* Now copy.  When DELTA becomes zero, the remaining
1175      DEST elements are already in place.  */
1176   dest->nelem += delta;
1177   for (;;)
1178     {
1179       if (dest->elems[is] > dest->elems[id])
1180         {
1181           /* Copy from the top.  */
1182           dest->elems[id + delta--] = dest->elems[is--];
1183           if (delta == 0)
1184             break;
1185         }
1186       else
1187         {
1188           /* Slide from the bottom.  */
1189           dest->elems[id + delta] = dest->elems[id];
1190           if (--id < 0)
1191             {
1192               /* Copy remaining SRC elements.  */
1193               memcpy (dest->elems, dest->elems + sbase,
1194                       delta * sizeof (int));
1195               break;
1196             }
1197         }
1198     }
1199
1200   return REG_NOERROR;
1201 }
1202
1203 /* Insert the new element ELEM to the re_node_set* SET.
1204    SET should not already have ELEM.
1205    return -1 if an error is occured, return 1 otherwise.  */
1206
1207 static int
1208 re_node_set_insert (set, elem)
1209      re_node_set *set;
1210      int elem;
1211 {
1212   int idx;
1213   /* In case the set is empty.  */
1214   if (set->alloc == 0)
1215     {
1216       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1217         return 1;
1218       else
1219         return -1;
1220     }
1221
1222   if (BE (set->nelem, 0) == 0)
1223     {
1224       /* We already guaranteed above that set->alloc != 0.  */
1225       set->elems[0] = elem;
1226       ++set->nelem;
1227       return 1;
1228     }
1229
1230   /* Realloc if we need.  */
1231   if (set->alloc == set->nelem)
1232     {
1233       int *new_array;
1234       set->alloc = set->alloc * 2;
1235       new_array = re_realloc (set->elems, int, set->alloc);
1236       if (BE (new_array == NULL, 0))
1237         return -1;
1238       set->elems = new_array;
1239     }
1240
1241   /* Move the elements which follows the new element.  Test the
1242      first element separately to skip a check in the inner loop.  */
1243   if (elem < set->elems[0])
1244     {
1245       idx = 0;
1246       for (idx = set->nelem; idx > 0; idx--)
1247         set->elems[idx] = set->elems[idx - 1];
1248     }
1249   else
1250     {
1251       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1252         set->elems[idx] = set->elems[idx - 1];
1253     }
1254
1255   /* Insert the new element.  */
1256   set->elems[idx] = elem;
1257   ++set->nelem;
1258   return 1;
1259 }
1260
1261 /* Compare two node sets SET1 and SET2.
1262    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1263
1264 static int
1265 re_node_set_compare (set1, set2)
1266      const re_node_set *set1, *set2;
1267 {
1268   int i;
1269   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1270     return 0;
1271   for (i = set1->nelem ; --i >= 0 ; )
1272     if (set1->elems[i] != set2->elems[i])
1273       return 0;
1274   return 1;
1275 }
1276
1277 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1278
1279 static int
1280 re_node_set_contains (set, elem)
1281      const re_node_set *set;
1282      int elem;
1283 {
1284   int idx, right, mid;
1285   if (set->nelem <= 0)
1286     return 0;
1287
1288   /* Binary search the element.  */
1289   idx = 0;
1290   right = set->nelem - 1;
1291   while (idx < right)
1292     {
1293       mid = (idx + right) / 2;
1294       if (set->elems[mid] < elem)
1295         idx = mid + 1;
1296       else
1297         right = mid;
1298     }
1299   return set->elems[idx] == elem ? idx + 1 : 0;
1300 }
1301
1302 static void
1303 re_node_set_remove_at (set, idx)
1304      re_node_set *set;
1305      int idx;
1306 {
1307   if (idx < 0 || idx >= set->nelem)
1308     return;
1309   --set->nelem;
1310   for (; idx < set->nelem; idx++)
1311     set->elems[idx] = set->elems[idx + 1];
1312 }
1313 \f
1314
1315 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1316    Or return -1, if an error will be occured.  */
1317
1318 static int
1319 re_dfa_add_node (dfa, token, mode)
1320      re_dfa_t *dfa;
1321      re_token_t token;
1322      int mode;
1323 {
1324   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1325     {
1326       int new_nodes_alloc = dfa->nodes_alloc * 2;
1327       re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1328                                           new_nodes_alloc);
1329       if (BE (new_array == NULL, 0))
1330         return -1;
1331       dfa->nodes = new_array;
1332       if (mode)
1333         {
1334           int *new_nexts, *new_indices;
1335           re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1336
1337           new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1338           new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1339           new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1340           new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1341                                       new_nodes_alloc);
1342           new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1343                                          new_nodes_alloc);
1344           if (BE (new_nexts == NULL || new_indices == NULL
1345                   || new_edests == NULL || new_eclosures == NULL
1346                   || new_inveclosures == NULL, 0))
1347             return -1;
1348           dfa->nexts = new_nexts;
1349           dfa->org_indices = new_indices;
1350           dfa->edests = new_edests;
1351           dfa->eclosures = new_eclosures;
1352           dfa->inveclosures = new_inveclosures;
1353         }
1354       dfa->nodes_alloc = new_nodes_alloc;
1355     }
1356   dfa->nodes[dfa->nodes_len] = token;
1357   dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1358   dfa->nodes[dfa->nodes_len].duplicated = 0;
1359   dfa->nodes[dfa->nodes_len].constraint = 0;
1360   return dfa->nodes_len++;
1361 }
1362
1363 static unsigned int inline
1364 calc_state_hash (nodes, context)
1365      const re_node_set *nodes;
1366      unsigned int context;
1367 {
1368   unsigned int hash = nodes->nelem + context;
1369   int i;
1370   for (i = 0 ; i < nodes->nelem ; i++)
1371     hash += nodes->elems[i];
1372   return hash;
1373 }
1374
1375 /* Search for the state whose node_set is equivalent to NODES.
1376    Return the pointer to the state, if we found it in the DFA.
1377    Otherwise create the new one and return it.  In case of an error
1378    return NULL and set the error code in ERR.
1379    Note: - We assume NULL as the invalid state, then it is possible that
1380            return value is NULL and ERR is REG_NOERROR.
1381          - We never return non-NULL value in case of any errors, it is for
1382            optimization.  */
1383
1384 static re_dfastate_t*
1385 re_acquire_state (err, dfa, nodes)
1386      reg_errcode_t *err;
1387      re_dfa_t *dfa;
1388      const re_node_set *nodes;
1389 {
1390   unsigned int hash;
1391   re_dfastate_t *new_state;
1392   struct re_state_table_entry *spot;
1393   int i;
1394   if (BE (nodes->nelem == 0, 0))
1395     {
1396       *err = REG_NOERROR;
1397       return NULL;
1398     }
1399   hash = calc_state_hash (nodes, 0);
1400   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1401
1402   for (i = 0 ; i < spot->num ; i++)
1403     {
1404       re_dfastate_t *state = spot->array[i];
1405       if (hash != state->hash)
1406         continue;
1407       if (re_node_set_compare (&state->nodes, nodes))
1408         return state;
1409     }
1410
1411   /* There are no appropriate state in the dfa, create the new one.  */
1412   new_state = create_ci_newstate (dfa, nodes, hash);
1413   if (BE (new_state != NULL, 1))
1414     return new_state;
1415   else
1416     {
1417       *err = REG_ESPACE;
1418       return NULL;
1419     }
1420 }
1421
1422 /* Search for the state whose node_set is equivalent to NODES and
1423    whose context is equivalent to CONTEXT.
1424    Return the pointer to the state, if we found it in the DFA.
1425    Otherwise create the new one and return it.  In case of an error
1426    return NULL and set the error code in ERR.
1427    Note: - We assume NULL as the invalid state, then it is possible that
1428            return value is NULL and ERR is REG_NOERROR.
1429          - We never return non-NULL value in case of any errors, it is for
1430            optimization.  */
1431
1432 static re_dfastate_t*
1433 re_acquire_state_context (err, dfa, nodes, context)
1434      reg_errcode_t *err;
1435      re_dfa_t *dfa;
1436      const re_node_set *nodes;
1437      unsigned int context;
1438 {
1439   unsigned int hash;
1440   re_dfastate_t *new_state;
1441   struct re_state_table_entry *spot;
1442   int i;
1443   if (nodes->nelem == 0)
1444     {
1445       *err = REG_NOERROR;
1446       return NULL;
1447     }
1448   hash = calc_state_hash (nodes, context);
1449   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1450
1451   for (i = 0 ; i < spot->num ; i++)
1452     {
1453       re_dfastate_t *state = spot->array[i];
1454       if (state->hash == hash
1455           && state->context == context
1456           && re_node_set_compare (state->entrance_nodes, nodes))
1457         return state;
1458     }
1459   /* There are no appropriate state in `dfa', create the new one.  */
1460   new_state = create_cd_newstate (dfa, nodes, context, hash);
1461   if (BE (new_state != NULL, 1))
1462     return new_state;
1463   else
1464     {
1465       *err = REG_ESPACE;
1466       return NULL;
1467     }
1468 }
1469
1470 /* Allocate memory for DFA state and initialize common properties.
1471    Return the new state if succeeded, otherwise return NULL.  */
1472
1473 static re_dfastate_t *
1474 create_newstate_common (dfa, nodes, hash)
1475      re_dfa_t *dfa;
1476      const re_node_set *nodes;
1477      unsigned int hash;
1478 {
1479   re_dfastate_t *newstate;
1480   reg_errcode_t err;
1481   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1482   if (BE (newstate == NULL, 0))
1483     return NULL;
1484   err = re_node_set_init_copy (&newstate->nodes, nodes);
1485   if (BE (err != REG_NOERROR, 0))
1486     {
1487       re_free (newstate);
1488       return NULL;
1489     }
1490   newstate->trtable = NULL;
1491   newstate->hash = hash;
1492   return newstate;
1493 }
1494
1495 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1496    position.  Return value indicate the error code if failed.  */
1497
1498 static reg_errcode_t
1499 register_state (dfa, newstate, hash)
1500      re_dfa_t *dfa;
1501      re_dfastate_t *newstate;
1502      unsigned int hash;
1503 {
1504   struct re_state_table_entry *spot;
1505   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1506
1507   if (BE (spot->alloc <= spot->num, 0))
1508     {
1509       int new_alloc = 2 * spot->num + 2;
1510       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1511                                               new_alloc);
1512       if (BE (new_array == NULL, 0))
1513         return REG_ESPACE;
1514       spot->array = new_array;
1515       spot->alloc = new_alloc;
1516     }
1517   spot->array[spot->num++] = newstate;
1518   return REG_NOERROR;
1519 }
1520
1521 /* Create the new state which is independ of contexts.
1522    Return the new state if succeeded, otherwise return NULL.  */
1523
1524 static re_dfastate_t *
1525 create_ci_newstate (dfa, nodes, hash)
1526      re_dfa_t *dfa;
1527      const re_node_set *nodes;
1528      unsigned int hash;
1529 {
1530   int i;
1531   reg_errcode_t err;
1532   re_dfastate_t *newstate;
1533   newstate = create_newstate_common (dfa, nodes, hash);
1534   if (BE (newstate == NULL, 0))
1535     return NULL;
1536   newstate->entrance_nodes = &newstate->nodes;
1537
1538   for (i = 0 ; i < nodes->nelem ; i++)
1539     {
1540       re_token_t *node = dfa->nodes + nodes->elems[i];
1541       re_token_type_t type = node->type;
1542       if (type == CHARACTER && !node->constraint)
1543         continue;
1544
1545       /* If the state has the halt node, the state is a halt state.  */
1546       else if (type == END_OF_RE)
1547         newstate->halt = 1;
1548 #ifdef RE_ENABLE_I18N
1549       else if (type == COMPLEX_BRACKET
1550                || type == OP_UTF8_PERIOD
1551                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1552         newstate->accept_mb = 1;
1553 #endif /* RE_ENABLE_I18N */
1554       else if (type == OP_BACK_REF)
1555         newstate->has_backref = 1;
1556       else if (type == ANCHOR || node->constraint)
1557         newstate->has_constraint = 1;
1558     }
1559   err = register_state (dfa, newstate, hash);
1560   if (BE (err != REG_NOERROR, 0))
1561     {
1562       free_state (newstate);
1563       newstate = NULL;
1564     }
1565   return newstate;
1566 }
1567
1568 /* Create the new state which is depend on the context CONTEXT.
1569    Return the new state if succeeded, otherwise return NULL.  */
1570
1571 static re_dfastate_t *
1572 create_cd_newstate (dfa, nodes, context, hash)
1573      re_dfa_t *dfa;
1574      const re_node_set *nodes;
1575      unsigned int context, hash;
1576 {
1577   int i, nctx_nodes = 0;
1578   reg_errcode_t err;
1579   re_dfastate_t *newstate;
1580
1581   newstate = create_newstate_common (dfa, nodes, hash);
1582   if (BE (newstate == NULL, 0))
1583     return NULL;
1584   newstate->context = context;
1585   newstate->entrance_nodes = &newstate->nodes;
1586
1587   for (i = 0 ; i < nodes->nelem ; i++)
1588     {
1589       unsigned int constraint = 0;
1590       re_token_t *node = dfa->nodes + nodes->elems[i];
1591       re_token_type_t type = node->type;
1592       if (node->constraint)
1593         constraint = node->constraint;
1594
1595       if (type == CHARACTER && !constraint)
1596         continue;
1597       /* If the state has the halt node, the state is a halt state.  */
1598       else if (type == END_OF_RE)
1599         newstate->halt = 1;
1600 #ifdef RE_ENABLE_I18N
1601       else if (type == COMPLEX_BRACKET
1602                || type == OP_UTF8_PERIOD
1603                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1604         newstate->accept_mb = 1;
1605 #endif /* RE_ENABLE_I18N */
1606       else if (type == OP_BACK_REF)
1607         newstate->has_backref = 1;
1608       else if (type == ANCHOR)
1609         constraint = node->opr.ctx_type;
1610
1611       if (constraint)
1612         {
1613           if (newstate->entrance_nodes == &newstate->nodes)
1614             {
1615               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1616               if (BE (newstate->entrance_nodes == NULL, 0))
1617                 {
1618                   free_state (newstate);
1619                   return NULL;
1620                 }
1621               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1622               nctx_nodes = 0;
1623               newstate->has_constraint = 1;
1624             }
1625
1626           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1627             {
1628               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1629               ++nctx_nodes;
1630             }
1631         }
1632     }
1633   err = register_state (dfa, newstate, hash);
1634   if (BE (err != REG_NOERROR, 0))
1635     {
1636       free_state (newstate);
1637       newstate = NULL;
1638     }
1639   return  newstate;
1640 }
1641
1642 static void
1643 free_state (state)
1644      re_dfastate_t *state;
1645 {
1646   if (state->entrance_nodes != &state->nodes)
1647     {
1648       re_node_set_free (state->entrance_nodes);
1649       re_free (state->entrance_nodes);
1650     }
1651   re_node_set_free (&state->nodes);
1652   re_free (state->trtable);
1653   re_free (state);
1654 }