compat/regex/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, int len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, int icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           unsigned int hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           unsigned int hash) internal_function;
  31
  32 #ifdef GAWK
  33 #undef MAX      /* safety */
  34 static int
  35 MAX(size_t a, size_t b)
  36 {
  37         return (a > b ? a : b);
  38 }
  39 #endif
  40 \f
  41 /* Functions for string operation.  */
  42
  43 /* This function allocate the buffers.  It is necessary to call
  44    re_string_reconstruct before using the object.  */
  45
  46 static reg_errcode_t
  47 internal_function
  48 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  49                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  50 {
  51   reg_errcode_t ret;
  52   int init_buf_len;
  53
  54   /* Ensure at least one character fits into the buffers.  */
  55   if (init_len < dfa->mb_cur_max)
  56     init_len = dfa->mb_cur_max;
  57   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  58   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  59
  60   ret = re_string_realloc_buffers (pstr, init_buf_len);
  61   if (BE (ret != REG_NOERROR, 0))
  62     return ret;
  63
  64   pstr->word_char = dfa->word_char;
  65   pstr->word_ops_used = dfa->word_ops_used;
  66   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  67   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  68   pstr->valid_raw_len = pstr->valid_len;
  69   return REG_NOERROR;
  70 }
  71
  72 /* This function allocate the buffers, and initialize them.  */
  73
  74 static reg_errcode_t
  75 internal_function
  76 re_string_construct (re_string_t *pstr, const char *str, int len,
  77                      RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  78 {
  79   reg_errcode_t ret;
  80   memset (pstr, '\0', sizeof (re_string_t));
  81   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  82
  83   if (len > 0)
  84     {
  85       ret = re_string_realloc_buffers (pstr, len + 1);
  86       if (BE (ret != REG_NOERROR, 0))
  87         return ret;
  88     }
  89   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  90
  91   if (icase)
  92     {
  93 #ifdef RE_ENABLE_I18N
  94       if (dfa->mb_cur_max > 1)
  95         {
  96           while (1)
  97             {
  98               ret = build_wcs_upper_buffer (pstr);
  99               if (BE (ret != REG_NOERROR, 0))
 100                 return ret;
 101               if (pstr->valid_raw_len >= len)
 102                 break;
 103               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 104                 break;
 105               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 106               if (BE (ret != REG_NOERROR, 0))
 107                 return ret;
 108             }
 109         }
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         build_upper_buffer (pstr);
 113     }
 114   else
 115     {
 116 #ifdef RE_ENABLE_I18N
 117       if (dfa->mb_cur_max > 1)
 118         build_wcs_buffer (pstr);
 119       else
 120 #endif /* RE_ENABLE_I18N  */
 121         {
 122           if (trans != NULL)
 123             re_string_translate_buffer (pstr);
 124           else
 125             {
 126               pstr->valid_len = pstr->bufs_len;
 127               pstr->valid_raw_len = pstr->bufs_len;
 128             }
 129         }
 130     }
 131
 132   return REG_NOERROR;
 133 }
 134
 135 /* Helper functions for re_string_allocate, and re_string_construct.  */
 136
 137 static reg_errcode_t
 138 internal_function
 139 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 140 {
 141 #ifdef RE_ENABLE_I18N
 142   if (pstr->mb_cur_max > 1)
 143     {
 144       wint_t *new_wcs;
 145
 146       /* Avoid overflow in realloc.  */
 147       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
 148       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 149         return REG_ESPACE;
 150
 151       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 152       if (BE (new_wcs == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->wcs = new_wcs;
 155       if (pstr->offsets != NULL)
 156         {
 157           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 158           if (BE (new_offsets == NULL, 0))
 159             return REG_ESPACE;
 160           pstr->offsets = new_offsets;
 161         }
 162     }
 163 #endif /* RE_ENABLE_I18N  */
 164   if (pstr->mbs_allocated)
 165     {
 166       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 167                                            new_buf_len);
 168       if (BE (new_mbs == NULL, 0))
 169         return REG_ESPACE;
 170       pstr->mbs = new_mbs;
 171     }
 172   pstr->bufs_len = new_buf_len;
 173   return REG_NOERROR;
 174 }
 175
 176
 177 static void
 178 internal_function
 179 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 180                             RE_TRANSLATE_TYPE trans, int icase,
 181                             const re_dfa_t *dfa)
 182 {
 183   pstr->raw_mbs = (const unsigned char *) str;
 184   pstr->len = len;
 185   pstr->raw_len = len;
 186   pstr->trans = trans;
 187   pstr->icase = icase ? 1 : 0;
 188   pstr->mbs_allocated = (trans != NULL || icase);
 189   pstr->mb_cur_max = dfa->mb_cur_max;
 190   pstr->is_utf8 = dfa->is_utf8;
 191   pstr->map_notascii = dfa->map_notascii;
 192   pstr->stop = pstr->len;
 193   pstr->raw_stop = pstr->stop;
 194 }
 195
 196 #ifdef RE_ENABLE_I18N
 197
 198 /* Build wide character buffer PSTR->WCS.
 199    If the byte sequence of the string are:
 200      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 201    Then wide character buffer will be:
 202      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 203    We use WEOF for padding, they indicate that the position isn't
 204    a first byte of a multibyte character.
 205
 206    Note that this function assumes PSTR->VALID_LEN elements are already
 207    built and starts from PSTR->VALID_LEN.  */
 208
 209 static void
 210 internal_function
 211 build_wcs_buffer (re_string_t *pstr)
 212 {
 213 #ifdef _LIBC
 214   unsigned char buf[MB_LEN_MAX];
 215   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 216 #else
 217   unsigned char buf[64];
 218 #endif
 219   mbstate_t prev_st;
 220   int byte_idx, end_idx, remain_len;
 221   size_t mbclen;
 222
 223   /* Build the buffers from pstr->valid_len to either pstr->len or
 224      pstr->bufs_len.  */
 225   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 226   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 227     {
 228       wchar_t wc;
 229       const char *p;
 230
 231       remain_len = end_idx - byte_idx;
 232       prev_st = pstr->cur_state;
 233       /* Apply the translation if we need.  */
 234       if (BE (pstr->trans != NULL, 0))
 235         {
 236           int i, ch;
 237
 238           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 239             {
 240               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 241               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 242             }
 243           p = (const char *) buf;
 244         }
 245       else
 246         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 247       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 248       if (BE (mbclen == (size_t) -2, 0))
 249         {
 250           /* The buffer doesn't have enough space, finish to build.  */
 251           pstr->cur_state = prev_st;
 252           break;
 253         }
 254       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 255         {
 256           /* We treat these cases as a singlebyte character.  */
 257           mbclen = 1;
 258           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 259           if (BE (pstr->trans != NULL, 0))
 260             wc = pstr->trans[wc];
 261           pstr->cur_state = prev_st;
 262         }
 263
 264       /* Write wide character and padding.  */
 265       pstr->wcs[byte_idx++] = wc;
 266       /* Write paddings.  */
 267       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 268         pstr->wcs[byte_idx++] = WEOF;
 269     }
 270   pstr->valid_len = byte_idx;
 271   pstr->valid_raw_len = byte_idx;
 272 }
 273
 274 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 275    but for REG_ICASE.  */
 276
 277 static reg_errcode_t
 278 internal_function
 279 build_wcs_upper_buffer (re_string_t *pstr)
 280 {
 281   mbstate_t prev_st;
 282   int src_idx, byte_idx, end_idx, remain_len;
 283   size_t mbclen;
 284 #ifdef _LIBC
 285   char buf[MB_LEN_MAX];
 286   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 287 #else
 288   char buf[64];
 289 #endif
 290
 291   byte_idx = pstr->valid_len;
 292   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 293
 294   /* The following optimization assumes that ASCII characters can be
 295      mapped to wide characters with a simple cast.  */
 296   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 297     {
 298       while (byte_idx < end_idx)
 299         {
 300           wchar_t wc;
 301
 302           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 303               && mbsinit (&pstr->cur_state))
 304             {
 305               /* In case of a singlebyte character.  */
 306               pstr->mbs[byte_idx]
 307                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 308               /* The next step uses the assumption that wchar_t is encoded
 309                  ASCII-safe: all ASCII values can be converted like this.  */
 310               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 311               ++byte_idx;
 312               continue;
 313             }
 314
 315           remain_len = end_idx - byte_idx;
 316           prev_st = pstr->cur_state;
 317           mbclen = __mbrtowc (&wc,
 318                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 319                                + byte_idx), remain_len, &pstr->cur_state);
 320           if (BE (mbclen + 2 > 2, 1))
 321             {
 322               wchar_t wcu = wc;
 323               if (iswlower (wc))
 324                 {
 325                   size_t mbcdlen;
 326
 327                   wcu = towupper (wc);
 328                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 329                   if (BE (mbclen == mbcdlen, 1))
 330                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 331                   else
 332                     {
 333                       src_idx = byte_idx;
 334                       goto offsets_needed;
 335                     }
 336                 }
 337               else
 338                 memcpy (pstr->mbs + byte_idx,
 339                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 340               pstr->wcs[byte_idx++] = wcu;
 341               /* Write paddings.  */
 342               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 343                 pstr->wcs[byte_idx++] = WEOF;
 344             }
 345           else if (mbclen == (size_t) -1 || mbclen == 0)
 346             {
 347               /* It is an invalid character or '\0'.  Just use the byte.  */
 348               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 349               pstr->mbs[byte_idx] = ch;
 350               /* And also cast it to wide char.  */
 351               pstr->wcs[byte_idx++] = (wchar_t) ch;
 352               if (BE (mbclen == (size_t) -1, 0))
 353                 pstr->cur_state = prev_st;
 354             }
 355           else
 356             {
 357               /* The buffer doesn't have enough space, finish to build.  */
 358               pstr->cur_state = prev_st;
 359               break;
 360             }
 361         }
 362       pstr->valid_len = byte_idx;
 363       pstr->valid_raw_len = byte_idx;
 364       return REG_NOERROR;
 365     }
 366   else
 367     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 368       {
 369         wchar_t wc;
 370         const char *p;
 371       offsets_needed:
 372         remain_len = end_idx - byte_idx;
 373         prev_st = pstr->cur_state;
 374         if (BE (pstr->trans != NULL, 0))
 375           {
 376             int i, ch;
 377
 378             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 379               {
 380                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 381                 buf[i] = pstr->trans[ch];
 382               }
 383             p = (const char *) buf;
 384           }
 385         else
 386           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 387         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 388         if (BE (mbclen + 2 > 2, 1))
 389           {
 390             wchar_t wcu = wc;
 391             if (iswlower (wc))
 392               {
 393                 size_t mbcdlen;
 394
 395                 wcu = towupper (wc);
 396                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 397                 if (BE (mbclen == mbcdlen, 1))
 398                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 399                 else if (mbcdlen != (size_t) -1)
 400                   {
 401                     size_t i;
 402
 403                     if (byte_idx + mbcdlen > pstr->bufs_len)
 404                       {
 405                         pstr->cur_state = prev_st;
 406                         break;
 407                       }
 408
 409                     if (pstr->offsets == NULL)
 410                       {
 411                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 412
 413                         if (pstr->offsets == NULL)
 414                           return REG_ESPACE;
 415                       }
 416                     if (!pstr->offsets_needed)
 417                       {
 418                         for (i = 0; i < (size_t) byte_idx; ++i)
 419                           pstr->offsets[i] = i;
 420                         pstr->offsets_needed = 1;
 421                       }
 422
 423                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 424                     pstr->wcs[byte_idx] = wcu;
 425                     pstr->offsets[byte_idx] = src_idx;
 426                     for (i = 1; i < mbcdlen; ++i)
 427                       {
 428                         pstr->offsets[byte_idx + i]
 429                           = src_idx + (i < mbclen ? i : mbclen - 1);
 430                         pstr->wcs[byte_idx + i] = WEOF;
 431                       }
 432                     pstr->len += mbcdlen - mbclen;
 433                     if (pstr->raw_stop > src_idx)
 434                       pstr->stop += mbcdlen - mbclen;
 435                     end_idx = (pstr->bufs_len > pstr->len)
 436                               ? pstr->len : pstr->bufs_len;
 437                     byte_idx += mbcdlen;
 438                     src_idx += mbclen;
 439                     continue;
 440                   }
 441                 else
 442                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 443               }
 444             else
 445               memcpy (pstr->mbs + byte_idx, p, mbclen);
 446
 447             if (BE (pstr->offsets_needed != 0, 0))
 448               {
 449                 size_t i;
 450                 for (i = 0; i < mbclen; ++i)
 451                   pstr->offsets[byte_idx + i] = src_idx + i;
 452               }
 453             src_idx += mbclen;
 454
 455             pstr->wcs[byte_idx++] = wcu;
 456             /* Write paddings.  */
 457             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 458               pstr->wcs[byte_idx++] = WEOF;
 459           }
 460         else if (mbclen == (size_t) -1 || mbclen == 0)
 461           {
 462             /* It is an invalid character or '\0'.  Just use the byte.  */
 463             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 464
 465             if (BE (pstr->trans != NULL, 0))
 466               ch = pstr->trans [ch];
 467             pstr->mbs[byte_idx] = ch;
 468
 469             if (BE (pstr->offsets_needed != 0, 0))
 470               pstr->offsets[byte_idx] = src_idx;
 471             ++src_idx;
 472
 473             /* And also cast it to wide char.  */
 474             pstr->wcs[byte_idx++] = (wchar_t) ch;
 475             if (BE (mbclen == (size_t) -1, 0))
 476               pstr->cur_state = prev_st;
 477           }
 478         else
 479           {
 480             /* The buffer doesn't have enough space, finish to build.  */
 481             pstr->cur_state = prev_st;
 482             break;
 483           }
 484       }
 485   pstr->valid_len = byte_idx;
 486   pstr->valid_raw_len = src_idx;
 487   return REG_NOERROR;
 488 }
 489
 490 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 491    Return the index.  */
 492
 493 static int
 494 internal_function
 495 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 496 {
 497   mbstate_t prev_st;
 498   int rawbuf_idx;
 499   size_t mbclen;
 500   wint_t wc = WEOF;
 501
 502   /* Skip the characters which are not necessary to check.  */
 503   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 504        rawbuf_idx < new_raw_idx;)
 505     {
 506       wchar_t wc2;
 507       int remain_len = pstr->len - rawbuf_idx;
 508       prev_st = pstr->cur_state;
 509       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 510                           remain_len, &pstr->cur_state);
 511       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 512         {
 513           /* We treat these cases as a single byte character.  */
 514           if (mbclen == 0 || remain_len == 0)
 515             wc = L'\0';
 516           else
 517             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 518           mbclen = 1;
 519           pstr->cur_state = prev_st;
 520         }
 521       else
 522         wc = (wint_t) wc2;
 523       /* Then proceed the next character.  */
 524       rawbuf_idx += mbclen;
 525     }
 526   *last_wc = (wint_t) wc;
 527   return rawbuf_idx;
 528 }
 529 #endif /* RE_ENABLE_I18N  */
 530
 531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 532    This function is used in case of REG_ICASE.  */
 533
 534 static void
 535 internal_function
 536 build_upper_buffer (re_string_t *pstr)
 537 {
 538   int char_idx, end_idx;
 539   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 540
 541   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 542     {
 543       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 544       if (BE (pstr->trans != NULL, 0))
 545         ch = pstr->trans[ch];
 546       if (islower (ch))
 547         pstr->mbs[char_idx] = toupper (ch);
 548       else
 549         pstr->mbs[char_idx] = ch;
 550     }
 551   pstr->valid_len = char_idx;
 552   pstr->valid_raw_len = char_idx;
 553 }
 554
 555 /* Apply TRANS to the buffer in PSTR.  */
 556
 557 static void
 558 internal_function
 559 re_string_translate_buffer (re_string_t *pstr)
 560 {
 561   int buf_idx, end_idx;
 562   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 563
 564   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 565     {
 566       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 567       pstr->mbs[buf_idx] = pstr->trans[ch];
 568     }
 569
 570   pstr->valid_len = buf_idx;
 571   pstr->valid_raw_len = buf_idx;
 572 }
 573
 574 /* This function re-construct the buffers.
 575    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 576    convert to upper case in case of REG_ICASE, apply translation.  */
 577
 578 static reg_errcode_t
 579 internal_function
 580 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 581 {
 582   int offset = idx - pstr->raw_mbs_idx;
 583   if (BE (offset < 0, 0))
 584     {
 585       /* Reset buffer.  */
 586 #ifdef RE_ENABLE_I18N
 587       if (pstr->mb_cur_max > 1)
 588         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 589 #endif /* RE_ENABLE_I18N */
 590       pstr->len = pstr->raw_len;
 591       pstr->stop = pstr->raw_stop;
 592       pstr->valid_len = 0;
 593       pstr->raw_mbs_idx = 0;
 594       pstr->valid_raw_len = 0;
 595       pstr->offsets_needed = 0;
 596       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 597                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 598       if (!pstr->mbs_allocated)
 599         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 600       offset = idx;
 601     }
 602
 603   if (BE (offset != 0, 1))
 604     {
 605       /* Should the already checked characters be kept?  */
 606       if (BE (offset < pstr->valid_raw_len, 1))
 607         {
 608           /* Yes, move them to the front of the buffer.  */
 609 #ifdef RE_ENABLE_I18N
 610           if (BE (pstr->offsets_needed, 0))
 611             {
 612               int low = 0, high = pstr->valid_len, mid;
 613               do
 614                 {
 615                   mid = low + (high - low) / 2;
 616                   if (pstr->offsets[mid] > offset)
 617                     high = mid;
 618                   else if (pstr->offsets[mid] < offset)
 619                     low = mid + 1;
 620                   else
 621                     break;
 622                 }
 623               while (low < high);
 624               if (pstr->offsets[mid] < offset)
 625                 ++mid;
 626               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 627                                                         eflags);
 628               /* This can be quite complicated, so handle specially
 629                  only the common and easy case where the character with
 630                  different length representation of lower and upper
 631                  case is present at or after offset.  */
 632               if (pstr->valid_len > offset
 633                   && mid == offset && pstr->offsets[mid] == offset)
 634                 {
 635                   memmove (pstr->wcs, pstr->wcs + offset,
 636                            (pstr->valid_len - offset) * sizeof (wint_t));
 637                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 638                   pstr->valid_len -= offset;
 639                   pstr->valid_raw_len -= offset;
 640                   for (low = 0; low < pstr->valid_len; low++)
 641                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 642                 }
 643               else
 644                 {
 645                   /* Otherwise, just find out how long the partial multibyte
 646                      character at offset is and fill it with WEOF/255.  */
 647                   pstr->len = pstr->raw_len - idx + offset;
 648                   pstr->stop = pstr->raw_stop - idx + offset;
 649                   pstr->offsets_needed = 0;
 650                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 651                     --mid;
 652                   while (mid < pstr->valid_len)
 653                     if (pstr->wcs[mid] != WEOF)
 654                       break;
 655                     else
 656                       ++mid;
 657                   if (mid == pstr->valid_len)
 658                     pstr->valid_len = 0;
 659                   else
 660                     {
 661                       pstr->valid_len = pstr->offsets[mid] - offset;
 662                       if (pstr->valid_len)
 663                         {
 664                           for (low = 0; low < pstr->valid_len; ++low)
 665                             pstr->wcs[low] = WEOF;
 666                           memset (pstr->mbs, 255, pstr->valid_len);
 667                         }
 668                     }
 669                   pstr->valid_raw_len = pstr->valid_len;
 670                 }
 671             }
 672           else
 673 #endif
 674             {
 675               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 676                                                         eflags);
 677 #ifdef RE_ENABLE_I18N
 678               if (pstr->mb_cur_max > 1)
 679                 memmove (pstr->wcs, pstr->wcs + offset,
 680                          (pstr->valid_len - offset) * sizeof (wint_t));
 681 #endif /* RE_ENABLE_I18N */
 682               if (BE (pstr->mbs_allocated, 0))
 683                 memmove (pstr->mbs, pstr->mbs + offset,
 684                          pstr->valid_len - offset);
 685               pstr->valid_len -= offset;
 686               pstr->valid_raw_len -= offset;
 687 #if DEBUG
 688               assert (pstr->valid_len > 0);
 689 #endif
 690             }
 691         }
 692       else
 693         {
 694 #ifdef RE_ENABLE_I18N
 695           /* No, skip all characters until IDX.  */
 696           int prev_valid_len = pstr->valid_len;
 697
 698           if (BE (pstr->offsets_needed, 0))
 699             {
 700               pstr->len = pstr->raw_len - idx + offset;
 701               pstr->stop = pstr->raw_stop - idx + offset;
 702               pstr->offsets_needed = 0;
 703             }
 704 #endif
 705           pstr->valid_len = 0;
 706 #ifdef RE_ENABLE_I18N
 707           if (pstr->mb_cur_max > 1)
 708             {
 709               int wcs_idx;
 710               wint_t wc = WEOF;
 711
 712               if (pstr->is_utf8)
 713                 {
 714                   const unsigned char *raw, *p, *end;
 715
 716                   /* Special case UTF-8.  Multi-byte chars start with any
 717                      byte other than 0x80 - 0xbf.  */
 718                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 719                   end = raw + (offset - pstr->mb_cur_max);
 720                   if (end < pstr->raw_mbs)
 721                     end = pstr->raw_mbs;
 722                   p = raw + offset - 1;
 723 #ifdef _LIBC
 724                   /* We know the wchar_t encoding is UCS4, so for the simple
 725                      case, ASCII characters, skip the conversion step.  */
 726                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 727                     {
 728                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 729                       /* pstr->valid_len = 0; */
 730                       wc = (wchar_t) *p;
 731                     }
 732                   else
 733 #endif
 734                     for (; p >= end; --p)
 735                       if ((*p & 0xc0) != 0x80)
 736                         {
 737                           mbstate_t cur_state;
 738                           wchar_t wc2;
 739                           int mlen = raw + pstr->len - p;
 740                           unsigned char buf[6];
 741                           size_t mbclen;
 742
 743                           if (BE (pstr->trans != NULL, 0))
 744                             {
 745                               int i = mlen < 6 ? mlen : 6;
 746                               while (--i >= 0)
 747                                 buf[i] = pstr->trans[p[i]];
 748                             }
 749                           /* XXX Don't use mbrtowc, we know which conversion
 750                              to use (UTF-8 -> UCS4).  */
 751                           memset (&cur_state, 0, sizeof (cur_state));
 752                           mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
 753                                               &cur_state);
 754                           if (raw + offset - p <= mbclen
 755                               && mbclen < (size_t) -2)
 756                             {
 757                               memset (&pstr->cur_state, '\0',
 758                                       sizeof (mbstate_t));
 759                               pstr->valid_len = mbclen - (raw + offset - p);
 760                               wc = wc2;
 761                             }
 762                           break;
 763                         }
 764                 }
 765
 766               if (wc == WEOF)
 767                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 768               if (wc == WEOF)
 769                 pstr->tip_context
 770                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 771               else
 772                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 773                                       && IS_WIDE_WORD_CHAR (wc))
 774                                      ? CONTEXT_WORD
 775                                      : ((IS_WIDE_NEWLINE (wc)
 776                                          && pstr->newline_anchor)
 777                                         ? CONTEXT_NEWLINE : 0));
 778               if (BE (pstr->valid_len, 0))
 779                 {
 780                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 781                     pstr->wcs[wcs_idx] = WEOF;
 782                   if (pstr->mbs_allocated)
 783                     memset (pstr->mbs, 255, pstr->valid_len);
 784                 }
 785               pstr->valid_raw_len = pstr->valid_len;
 786             }
 787           else
 788 #endif /* RE_ENABLE_I18N */
 789             {
 790               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 791               pstr->valid_raw_len = 0;
 792               if (pstr->trans)
 793                 c = pstr->trans[c];
 794               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 795                                    ? CONTEXT_WORD
 796                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 797                                       ? CONTEXT_NEWLINE : 0));
 798             }
 799         }
 800       if (!BE (pstr->mbs_allocated, 0))
 801         pstr->mbs += offset;
 802     }
 803   pstr->raw_mbs_idx = idx;
 804   pstr->len -= offset;
 805   pstr->stop -= offset;
 806
 807   /* Then build the buffers.  */
 808 #ifdef RE_ENABLE_I18N
 809   if (pstr->mb_cur_max > 1)
 810     {
 811       if (pstr->icase)
 812         {
 813           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 814           if (BE (ret != REG_NOERROR, 0))
 815             return ret;
 816         }
 817       else
 818         build_wcs_buffer (pstr);
 819     }
 820   else
 821 #endif /* RE_ENABLE_I18N */
 822     if (BE (pstr->mbs_allocated, 0))
 823       {
 824         if (pstr->icase)
 825           build_upper_buffer (pstr);
 826         else if (pstr->trans != NULL)
 827           re_string_translate_buffer (pstr);
 828       }
 829     else
 830       pstr->valid_len = pstr->len;
 831
 832   pstr->cur_idx = 0;
 833   return REG_NOERROR;
 834 }
 835
 836 static unsigned char
 837 internal_function __attribute ((pure))
 838 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 839 {
 840   int ch, off;
 841
 842   /* Handle the common (easiest) cases first.  */
 843   if (BE (!pstr->mbs_allocated, 1))
 844     return re_string_peek_byte (pstr, idx);
 845
 846 #ifdef RE_ENABLE_I18N
 847   if (pstr->mb_cur_max > 1
 848       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 849     return re_string_peek_byte (pstr, idx);
 850 #endif
 851
 852   off = pstr->cur_idx + idx;
 853 #ifdef RE_ENABLE_I18N
 854   if (pstr->offsets_needed)
 855     off = pstr->offsets[off];
 856 #endif
 857
 858   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 859
 860 #ifdef RE_ENABLE_I18N
 861   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 862      this function returns CAPITAL LETTER I instead of first byte of
 863      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 864      since peek_byte_case doesn't advance cur_idx in any way.  */
 865   if (pstr->offsets_needed && !isascii (ch))
 866     return re_string_peek_byte (pstr, idx);
 867 #endif
 868
 869   return ch;
 870 }
 871
 872 static unsigned char
 873 internal_function __attribute ((pure))
 874 re_string_fetch_byte_case (re_string_t *pstr)
 875 {
 876   if (BE (!pstr->mbs_allocated, 1))
 877     return re_string_fetch_byte (pstr);
 878
 879 #ifdef RE_ENABLE_I18N
 880   if (pstr->offsets_needed)
 881     {
 882       int off, ch;
 883
 884       /* For tr_TR.UTF-8 [[:islower:]] there is
 885          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 886          in that case the whole multi-byte character and return
 887          the original letter.  On the other side, with
 888          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 889          anything else would complicate things too much.  */
 890
 891       if (!re_string_first_byte (pstr, pstr->cur_idx))
 892         return re_string_fetch_byte (pstr);
 893
 894       off = pstr->offsets[pstr->cur_idx];
 895       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 896
 897       if (! isascii (ch))
 898         return re_string_fetch_byte (pstr);
 899
 900       re_string_skip_bytes (pstr,
 901                             re_string_char_size_at (pstr, pstr->cur_idx));
 902       return ch;
 903     }
 904 #endif
 905
 906   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 907 }
 908
 909 static void
 910 internal_function
 911 re_string_destruct (re_string_t *pstr)
 912 {
 913 #ifdef RE_ENABLE_I18N
 914   re_free (pstr->wcs);
 915   re_free (pstr->offsets);
 916 #endif /* RE_ENABLE_I18N  */
 917   if (pstr->mbs_allocated)
 918     re_free (pstr->mbs);
 919 }
 920
 921 /* Return the context at IDX in INPUT.  */
 922
 923 static unsigned int
 924 internal_function
 925 re_string_context_at (const re_string_t *input, int idx, int eflags)
 926 {
 927   int c;
 928   if (BE (idx < 0, 0))
 929     /* In this case, we use the value stored in input->tip_context,
 930        since we can't know the character in input->mbs[-1] here.  */
 931     return input->tip_context;
 932   if (BE (idx == input->len, 0))
 933     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 934             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 935 #ifdef RE_ENABLE_I18N
 936   if (input->mb_cur_max > 1)
 937     {
 938       wint_t wc;
 939       int wc_idx = idx;
 940       while(input->wcs[wc_idx] == WEOF)
 941         {
 942 #ifdef DEBUG
 943           /* It must not happen.  */
 944           assert (wc_idx >= 0);
 945 #endif
 946           --wc_idx;
 947           if (wc_idx < 0)
 948             return input->tip_context;
 949         }
 950       wc = input->wcs[wc_idx];
 951       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 952         return CONTEXT_WORD;
 953       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 954               ? CONTEXT_NEWLINE : 0);
 955     }
 956   else
 957 #endif
 958     {
 959       c = re_string_byte_at (input, idx);
 960       if (bitset_contain (input->word_char, c))
 961         return CONTEXT_WORD;
 962       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 963     }
 964 }
 965 \f
 966 /* Functions for set operation.  */
 967
 968 static reg_errcode_t
 969 internal_function
 970 re_node_set_alloc (re_node_set *set, int size)
 971 {
 972   /*
 973    * ADR: valgrind says size can be 0, which then doesn't
 974    * free the block of size 0.  Harumph. This seems
 975    * to work ok, though.
 976    */
 977   if (size == 0)
 978     {
 979        memset(set, 0, sizeof(*set));
 980        return REG_NOERROR;
 981     }
 982   set->alloc = size;
 983   set->nelem = 0;
 984   set->elems = re_malloc (int, size);
 985   if (BE (set->elems == NULL, 0))
 986     return REG_ESPACE;
 987   return REG_NOERROR;
 988 }
 989
 990 static reg_errcode_t
 991 internal_function
 992 re_node_set_init_1 (re_node_set *set, int elem)
 993 {
 994   set->alloc = 1;
 995   set->nelem = 1;
 996   set->elems = re_malloc (int, 1);
 997   if (BE (set->elems == NULL, 0))
 998     {
 999       set->alloc = set->nelem = 0;
1000       return REG_ESPACE;
1001     }
1002   set->elems[0] = elem;
1003   return REG_NOERROR;
1004 }
1005
1006 static reg_errcode_t
1007 internal_function
1008 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
1009 {
1010   set->alloc = 2;
1011   set->elems = re_malloc (int, 2);
1012   if (BE (set->elems == NULL, 0))
1013     return REG_ESPACE;
1014   if (elem1 == elem2)
1015     {
1016       set->nelem = 1;
1017       set->elems[0] = elem1;
1018     }
1019   else
1020     {
1021       set->nelem = 2;
1022       if (elem1 < elem2)
1023         {
1024           set->elems[0] = elem1;
1025           set->elems[1] = elem2;
1026         }
1027       else
1028         {
1029           set->elems[0] = elem2;
1030           set->elems[1] = elem1;
1031         }
1032     }
1033   return REG_NOERROR;
1034 }
1035
1036 static reg_errcode_t
1037 internal_function
1038 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1039 {
1040   dest->nelem = src->nelem;
1041   if (src->nelem > 0)
1042     {
1043       dest->alloc = dest->nelem;
1044       dest->elems = re_malloc (int, dest->alloc);
1045       if (BE (dest->elems == NULL, 0))
1046         {
1047           dest->alloc = dest->nelem = 0;
1048           return REG_ESPACE;
1049         }
1050       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1051     }
1052   else
1053     re_node_set_init_empty (dest);
1054   return REG_NOERROR;
1055 }
1056
1057 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1058    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1059    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1060
1061 static reg_errcode_t
1062 internal_function
1063 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1064                            const re_node_set *src2)
1065 {
1066   int i1, i2, is, id, delta, sbase;
1067   if (src1->nelem == 0 || src2->nelem == 0)
1068     return REG_NOERROR;
1069
1070   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1071      conservative estimate.  */
1072   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1073     {
1074       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1075       int *new_elems = re_realloc (dest->elems, int, new_alloc);
1076       if (BE (new_elems == NULL, 0))
1077         return REG_ESPACE;
1078       dest->elems = new_elems;
1079       dest->alloc = new_alloc;
1080     }
1081
1082   /* Find the items in the intersection of SRC1 and SRC2, and copy
1083      into the top of DEST those that are not already in DEST itself.  */
1084   sbase = dest->nelem + src1->nelem + src2->nelem;
1085   i1 = src1->nelem - 1;
1086   i2 = src2->nelem - 1;
1087   id = dest->nelem - 1;
1088   for (;;)
1089     {
1090       if (src1->elems[i1] == src2->elems[i2])
1091         {
1092           /* Try to find the item in DEST.  Maybe we could binary search?  */
1093           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1094             --id;
1095
1096           if (id < 0 || dest->elems[id] != src1->elems[i1])
1097             dest->elems[--sbase] = src1->elems[i1];
1098
1099           if (--i1 < 0 || --i2 < 0)
1100             break;
1101         }
1102
1103       /* Lower the highest of the two items.  */
1104       else if (src1->elems[i1] < src2->elems[i2])
1105         {
1106           if (--i2 < 0)
1107             break;
1108         }
1109       else
1110         {
1111           if (--i1 < 0)
1112             break;
1113         }
1114     }
1115
1116   id = dest->nelem - 1;
1117   is = dest->nelem + src1->nelem + src2->nelem - 1;
1118   delta = is - sbase + 1;
1119
1120   /* Now copy.  When DELTA becomes zero, the remaining
1121      DEST elements are already in place; this is more or
1122      less the same loop that is in re_node_set_merge.  */
1123   dest->nelem += delta;
1124   if (delta > 0 && id >= 0)
1125     for (;;)
1126       {
1127         if (dest->elems[is] > dest->elems[id])
1128           {
1129             /* Copy from the top.  */
1130             dest->elems[id + delta--] = dest->elems[is--];
1131             if (delta == 0)
1132               break;
1133           }
1134         else
1135           {
1136             /* Slide from the bottom.  */
1137             dest->elems[id + delta] = dest->elems[id];
1138             if (--id < 0)
1139               break;
1140           }
1141       }
1142
1143   /* Copy remaining SRC elements.  */
1144   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1145
1146   return REG_NOERROR;
1147 }
1148
1149 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1150    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1151
1152 static reg_errcode_t
1153 internal_function
1154 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1155                         const re_node_set *src2)
1156 {
1157   int i1, i2, id;
1158   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1159     {
1160       dest->alloc = src1->nelem + src2->nelem;
1161       dest->elems = re_malloc (int, dest->alloc);
1162       if (BE (dest->elems == NULL, 0))
1163         return REG_ESPACE;
1164     }
1165   else
1166     {
1167       if (src1 != NULL && src1->nelem > 0)
1168         return re_node_set_init_copy (dest, src1);
1169       else if (src2 != NULL && src2->nelem > 0)
1170         return re_node_set_init_copy (dest, src2);
1171       else
1172         re_node_set_init_empty (dest);
1173       return REG_NOERROR;
1174     }
1175   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1176     {
1177       if (src1->elems[i1] > src2->elems[i2])
1178         {
1179           dest->elems[id++] = src2->elems[i2++];
1180           continue;
1181         }
1182       if (src1->elems[i1] == src2->elems[i2])
1183         ++i2;
1184       dest->elems[id++] = src1->elems[i1++];
1185     }
1186   if (i1 < src1->nelem)
1187     {
1188       memcpy (dest->elems + id, src1->elems + i1,
1189              (src1->nelem - i1) * sizeof (int));
1190       id += src1->nelem - i1;
1191     }
1192   else if (i2 < src2->nelem)
1193     {
1194       memcpy (dest->elems + id, src2->elems + i2,
1195              (src2->nelem - i2) * sizeof (int));
1196       id += src2->nelem - i2;
1197     }
1198   dest->nelem = id;
1199   return REG_NOERROR;
1200 }
1201
1202 /* Calculate the union set of the sets DEST and SRC. And store it to
1203    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1204
1205 static reg_errcode_t
1206 internal_function
1207 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1208 {
1209   int is, id, sbase, delta;
1210   if (src == NULL || src->nelem == 0)
1211     return REG_NOERROR;
1212   if (dest->alloc < 2 * src->nelem + dest->nelem)
1213     {
1214       int new_alloc = 2 * (src->nelem + dest->alloc);
1215       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1216       if (BE (new_buffer == NULL, 0))
1217         return REG_ESPACE;
1218       dest->elems = new_buffer;
1219       dest->alloc = new_alloc;
1220     }
1221
1222   if (BE (dest->nelem == 0, 0))
1223     {
1224       dest->nelem = src->nelem;
1225       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1226       return REG_NOERROR;
1227     }
1228
1229   /* Copy into the top of DEST the items of SRC that are not
1230      found in DEST.  Maybe we could binary search in DEST?  */
1231   for (sbase = dest->nelem + 2 * src->nelem,
1232        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1233     {
1234       if (dest->elems[id] == src->elems[is])
1235         is--, id--;
1236       else if (dest->elems[id] < src->elems[is])
1237         dest->elems[--sbase] = src->elems[is--];
1238       else /* if (dest->elems[id] > src->elems[is]) */
1239         --id;
1240     }
1241
1242   if (is >= 0)
1243     {
1244       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1245       sbase -= is + 1;
1246       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1247     }
1248
1249   id = dest->nelem - 1;
1250   is = dest->nelem + 2 * src->nelem - 1;
1251   delta = is - sbase + 1;
1252   if (delta == 0)
1253     return REG_NOERROR;
1254
1255   /* Now copy.  When DELTA becomes zero, the remaining
1256      DEST elements are already in place.  */
1257   dest->nelem += delta;
1258   for (;;)
1259     {
1260       if (dest->elems[is] > dest->elems[id])
1261         {
1262           /* Copy from the top.  */
1263           dest->elems[id + delta--] = dest->elems[is--];
1264           if (delta == 0)
1265             break;
1266         }
1267       else
1268         {
1269           /* Slide from the bottom.  */
1270           dest->elems[id + delta] = dest->elems[id];
1271           if (--id < 0)
1272             {
1273               /* Copy remaining SRC elements.  */
1274               memcpy (dest->elems, dest->elems + sbase,
1275                       delta * sizeof (int));
1276               break;
1277             }
1278         }
1279     }
1280
1281   return REG_NOERROR;
1282 }
1283
1284 /* Insert the new element ELEM to the re_node_set* SET.
1285    SET should not already have ELEM.
1286    return -1 if an error has occurred, return 1 otherwise.  */
1287
1288 static int
1289 internal_function
1290 re_node_set_insert (re_node_set *set, int elem)
1291 {
1292   int idx;
1293   /* In case the set is empty.  */
1294   if (set->alloc == 0)
1295     {
1296       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1297         return 1;
1298       else
1299         return -1;
1300     }
1301
1302   if (BE (set->nelem, 0) == 0)
1303     {
1304       /* We already guaranteed above that set->alloc != 0.  */
1305       set->elems[0] = elem;
1306       ++set->nelem;
1307       return 1;
1308     }
1309
1310   /* Realloc if we need.  */
1311   if (set->alloc == set->nelem)
1312     {
1313       int *new_elems;
1314       set->alloc = set->alloc * 2;
1315       new_elems = re_realloc (set->elems, int, set->alloc);
1316       if (BE (new_elems == NULL, 0))
1317         return -1;
1318       set->elems = new_elems;
1319     }
1320
1321   /* Move the elements which follows the new element.  Test the
1322      first element separately to skip a check in the inner loop.  */
1323   if (elem < set->elems[0])
1324     {
1325       idx = 0;
1326       for (idx = set->nelem; idx > 0; idx--)
1327         set->elems[idx] = set->elems[idx - 1];
1328     }
1329   else
1330     {
1331       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1332         set->elems[idx] = set->elems[idx - 1];
1333     }
1334
1335   /* Insert the new element.  */
1336   set->elems[idx] = elem;
1337   ++set->nelem;
1338   return 1;
1339 }
1340
1341 /* Insert the new element ELEM to the re_node_set* SET.
1342    SET should not already have any element greater than or equal to ELEM.
1343    Return -1 if an error has occurred, return 1 otherwise.  */
1344
1345 static int
1346 internal_function
1347 re_node_set_insert_last (re_node_set *set, int elem)
1348 {
1349   /* Realloc if we need.  */
1350   if (set->alloc == set->nelem)
1351     {
1352       int *new_elems;
1353       set->alloc = (set->alloc + 1) * 2;
1354       new_elems = re_realloc (set->elems, int, set->alloc);
1355       if (BE (new_elems == NULL, 0))
1356         return -1;
1357       set->elems = new_elems;
1358     }
1359
1360   /* Insert the new element.  */
1361   set->elems[set->nelem++] = elem;
1362   return 1;
1363 }
1364
1365 /* Compare two node sets SET1 and SET2.
1366    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1367
1368 static int
1369 internal_function __attribute ((pure))
1370 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1371 {
1372   int i;
1373   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1374     return 0;
1375   for (i = set1->nelem ; --i >= 0 ; )
1376     if (set1->elems[i] != set2->elems[i])
1377       return 0;
1378   return 1;
1379 }
1380
1381 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1382
1383 static int
1384 internal_function __attribute ((pure))
1385 re_node_set_contains (const re_node_set *set, int elem)
1386 {
1387   unsigned int idx, right, mid;
1388   if (set->nelem <= 0)
1389     return 0;
1390
1391   /* Binary search the element.  */
1392   idx = 0;
1393   right = set->nelem - 1;
1394   while (idx < right)
1395     {
1396       mid = idx + (right - idx) / 2;
1397       if (set->elems[mid] < elem)
1398         idx = mid + 1;
1399       else
1400         right = mid;
1401     }
1402   return set->elems[idx] == elem ? idx + 1 : 0;
1403 }
1404
1405 static void
1406 internal_function
1407 re_node_set_remove_at (re_node_set *set, int idx)
1408 {
1409   if (idx < 0 || idx >= set->nelem)
1410     return;
1411   --set->nelem;
1412   for (; idx < set->nelem; idx++)
1413     set->elems[idx] = set->elems[idx + 1];
1414 }
1415 \f
1416
1417 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1418    Or return -1, if an error has occurred.  */
1419
1420 static int
1421 internal_function
1422 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1423 {
1424   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1425     {
1426       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1427       int *new_nexts, *new_indices;
1428       re_node_set *new_edests, *new_eclosures;
1429       re_token_t *new_nodes;
1430
1431       /* Avoid overflows in realloc.  */
1432       const size_t max_object_size = MAX (sizeof (re_token_t),
1433                                           MAX (sizeof (re_node_set),
1434                                                sizeof (int)));
1435       if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1436         return -1;
1437
1438       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1439       if (BE (new_nodes == NULL, 0))
1440         return -1;
1441       dfa->nodes = new_nodes;
1442       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1443       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1444       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1445       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1446       if (BE (new_nexts == NULL || new_indices == NULL
1447               || new_edests == NULL || new_eclosures == NULL, 0))
1448         return -1;
1449       dfa->nexts = new_nexts;
1450       dfa->org_indices = new_indices;
1451       dfa->edests = new_edests;
1452       dfa->eclosures = new_eclosures;
1453       dfa->nodes_alloc = new_nodes_alloc;
1454     }
1455   dfa->nodes[dfa->nodes_len] = token;
1456   dfa->nodes[dfa->nodes_len].constraint = 0;
1457 #ifdef RE_ENABLE_I18N
1458   dfa->nodes[dfa->nodes_len].accept_mb =
1459     (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1460 #endif
1461   dfa->nexts[dfa->nodes_len] = -1;
1462   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1463   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1464   return dfa->nodes_len++;
1465 }
1466
1467 static inline unsigned int
1468 internal_function
1469 calc_state_hash (const re_node_set *nodes, unsigned int context)
1470 {
1471   unsigned int hash = nodes->nelem + context;
1472   int i;
1473   for (i = 0 ; i < nodes->nelem ; i++)
1474     hash += nodes->elems[i];
1475   return hash;
1476 }
1477
1478 /* Search for the state whose node_set is equivalent to NODES.
1479    Return the pointer to the state, if we found it in the DFA.
1480    Otherwise create the new one and return it.  In case of an error
1481    return NULL and set the error code in ERR.
1482    Note: - We assume NULL as the invalid state, then it is possible that
1483            return value is NULL and ERR is REG_NOERROR.
1484          - We never return non-NULL value in case of any errors, it is for
1485            optimization.  */
1486
1487 static re_dfastate_t *
1488 internal_function
1489 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1490                   const re_node_set *nodes)
1491 {
1492   unsigned int hash;
1493   re_dfastate_t *new_state;
1494   struct re_state_table_entry *spot;
1495   int i;
1496   if (BE (nodes->nelem == 0, 0))
1497     {
1498       *err = REG_NOERROR;
1499       return NULL;
1500     }
1501   hash = calc_state_hash (nodes, 0);
1502   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1503
1504   for (i = 0 ; i < spot->num ; i++)
1505     {
1506       re_dfastate_t *state = spot->array[i];
1507       if (hash != state->hash)
1508         continue;
1509       if (re_node_set_compare (&state->nodes, nodes))
1510         return state;
1511     }
1512
1513   /* There are no appropriate state in the dfa, create the new one.  */
1514   new_state = create_ci_newstate (dfa, nodes, hash);
1515   if (BE (new_state == NULL, 0))
1516     *err = REG_ESPACE;
1517
1518   return new_state;
1519 }
1520
1521 /* Search for the state whose node_set is equivalent to NODES and
1522    whose context is equivalent to CONTEXT.
1523    Return the pointer to the state, if we found it in the DFA.
1524    Otherwise create the new one and return it.  In case of an error
1525    return NULL and set the error code in ERR.
1526    Note: - We assume NULL as the invalid state, then it is possible that
1527            return value is NULL and ERR is REG_NOERROR.
1528          - We never return non-NULL value in case of any errors, it is for
1529            optimization.  */
1530
1531 static re_dfastate_t *
1532 internal_function
1533 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1534                           const re_node_set *nodes, unsigned int context)
1535 {
1536   unsigned int hash;
1537   re_dfastate_t *new_state;
1538   struct re_state_table_entry *spot;
1539   int i;
1540   if (nodes->nelem == 0)
1541     {
1542       *err = REG_NOERROR;
1543       return NULL;
1544     }
1545   hash = calc_state_hash (nodes, context);
1546   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547
1548   for (i = 0 ; i < spot->num ; i++)
1549     {
1550       re_dfastate_t *state = spot->array[i];
1551       if (state->hash == hash
1552           && state->context == context
1553           && re_node_set_compare (state->entrance_nodes, nodes))
1554         return state;
1555     }
1556   /* There are no appropriate state in `dfa', create the new one.  */
1557   new_state = create_cd_newstate (dfa, nodes, context, hash);
1558   if (BE (new_state == NULL, 0))
1559     *err = REG_ESPACE;
1560
1561   return new_state;
1562 }
1563
1564 /* Finish initialization of the new state NEWSTATE, and using its hash value
1565    HASH put in the appropriate bucket of DFA's state table.  Return value
1566    indicates the error code if failed.  */
1567
1568 static reg_errcode_t
1569 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1570                 unsigned int hash)
1571 {
1572   struct re_state_table_entry *spot;
1573   reg_errcode_t err;
1574   int i;
1575
1576   newstate->hash = hash;
1577   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578   if (BE (err != REG_NOERROR, 0))
1579     return REG_ESPACE;
1580   for (i = 0; i < newstate->nodes.nelem; i++)
1581     {
1582       int elem = newstate->nodes.elems[i];
1583       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584         if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1585           return REG_ESPACE;
1586     }
1587
1588   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589   if (BE (spot->alloc <= spot->num, 0))
1590     {
1591       int new_alloc = 2 * spot->num + 2;
1592       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1593                                               new_alloc);
1594       if (BE (new_array == NULL, 0))
1595         return REG_ESPACE;
1596       spot->array = new_array;
1597       spot->alloc = new_alloc;
1598     }
1599   spot->array[spot->num++] = newstate;
1600   return REG_NOERROR;
1601 }
1602
1603 static void
1604 free_state (re_dfastate_t *state)
1605 {
1606   re_node_set_free (&state->non_eps_nodes);
1607   re_node_set_free (&state->inveclosure);
1608   if (state->entrance_nodes != &state->nodes)
1609     {
1610       re_node_set_free (state->entrance_nodes);
1611       re_free (state->entrance_nodes);
1612     }
1613   re_node_set_free (&state->nodes);
1614   re_free (state->word_trtable);
1615   re_free (state->trtable);
1616   re_free (state);
1617 }
1618
1619 /* Create the new state which is independent of contexts.
1620    Return the new state if succeeded, otherwise return NULL.  */
1621
1622 static re_dfastate_t *
1623 internal_function
1624 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1625                     unsigned int hash)
1626 {
1627   int i;
1628   reg_errcode_t err;
1629   re_dfastate_t *newstate;
1630
1631   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632   if (BE (newstate == NULL, 0))
1633     return NULL;
1634   err = re_node_set_init_copy (&newstate->nodes, nodes);
1635   if (BE (err != REG_NOERROR, 0))
1636     {
1637       re_free (newstate);
1638       return NULL;
1639     }
1640
1641   newstate->entrance_nodes = &newstate->nodes;
1642   for (i = 0 ; i < nodes->nelem ; i++)
1643     {
1644       re_token_t *node = dfa->nodes + nodes->elems[i];
1645       re_token_type_t type = node->type;
1646       if (type == CHARACTER && !node->constraint)
1647         continue;
1648 #ifdef RE_ENABLE_I18N
1649       newstate->accept_mb |= node->accept_mb;
1650 #endif /* RE_ENABLE_I18N */
1651
1652       /* If the state has the halt node, the state is a halt state.  */
1653       if (type == END_OF_RE)
1654         newstate->halt = 1;
1655       else if (type == OP_BACK_REF)
1656         newstate->has_backref = 1;
1657       else if (type == ANCHOR || node->constraint)
1658         newstate->has_constraint = 1;
1659     }
1660   err = register_state (dfa, newstate, hash);
1661   if (BE (err != REG_NOERROR, 0))
1662     {
1663       free_state (newstate);
1664       newstate = NULL;
1665     }
1666   return newstate;
1667 }
1668
1669 /* Create the new state which is depend on the context CONTEXT.
1670    Return the new state if succeeded, otherwise return NULL.  */
1671
1672 static re_dfastate_t *
1673 internal_function
1674 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1675                     unsigned int context, unsigned int hash)
1676 {
1677   int i, nctx_nodes = 0;
1678   reg_errcode_t err;
1679   re_dfastate_t *newstate;
1680
1681   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682   if (BE (newstate == NULL, 0))
1683     return NULL;
1684   err = re_node_set_init_copy (&newstate->nodes, nodes);
1685   if (BE (err != REG_NOERROR, 0))
1686     {
1687       re_free (newstate);
1688       return NULL;
1689     }
1690
1691   newstate->context = context;
1692   newstate->entrance_nodes = &newstate->nodes;
1693
1694   for (i = 0 ; i < nodes->nelem ; i++)
1695     {
1696       re_token_t *node = dfa->nodes + nodes->elems[i];
1697       re_token_type_t type = node->type;
1698       unsigned int constraint = node->constraint;
1699
1700       if (type == CHARACTER && !constraint)
1701         continue;
1702 #ifdef RE_ENABLE_I18N
1703       newstate->accept_mb |= node->accept_mb;
1704 #endif /* RE_ENABLE_I18N */
1705
1706       /* If the state has the halt node, the state is a halt state.  */
1707       if (type == END_OF_RE)
1708         newstate->halt = 1;
1709       else if (type == OP_BACK_REF)
1710         newstate->has_backref = 1;
1711
1712       if (constraint)
1713         {
1714           if (newstate->entrance_nodes == &newstate->nodes)
1715             {
1716               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1717               if (BE (newstate->entrance_nodes == NULL, 0))
1718                 {
1719                   free_state (newstate);
1720                   return NULL;
1721                 }
1722               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1723                   != REG_NOERROR)
1724                 return NULL;
1725               nctx_nodes = 0;
1726               newstate->has_constraint = 1;
1727             }
1728
1729           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1730             {
1731               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1732               ++nctx_nodes;
1733             }
1734         }
1735     }
1736   err = register_state (dfa, newstate, hash);
1737   if (BE (err != REG_NOERROR, 0))
1738     {
1739       free_state (newstate);
1740       newstate = NULL;
1741     }
1742   return  newstate;
1743 }