lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2016 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include <assert.h>
  28 #include <ctype.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <limits.h>
  32 #include <string.h>
  33 #include <locale.h>
  34
  35 #define STREQ(a, b) (strcmp (a, b) == 0)
  36
  37 /* ISASCIIDIGIT differs from isdigit, as follows:
  38    - Its arg may be any int or unsigned int; it need not be an unsigned char.
  39    - It's guaranteed to evaluate its argument exactly once.
  40    - It's typically faster.
  41    Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
  42    only '0' through '9' are digits.  Prefer ISASCIIDIGIT to isdigit unless
  43    it's important to use the locale's definition of "digit" even when the
  44    host does not conform to Posix.  */
  45 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
  46
  47 #include "gettext.h"
  48 #define _(str) gettext (str)
  49
  50 #include <wchar.h>
  51
  52 #include "xalloc.h"
  53 #include "localeinfo.h"
  54
  55 /* HPUX defines these as macros in sys/param.h.  */
  56 #ifdef setbit
  57 # undef setbit
  58 #endif
  59 #ifdef clrbit
  60 # undef clrbit
  61 #endif
  62
  63 /* First integer value that is greater than any character code.  */
  64 enum { NOTCHAR = 1 << CHAR_BIT };
  65
  66 /* This represents part of a character class.  It must be unsigned and
  67    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
  68 typedef unsigned long int charclass_word;
  69
  70 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
  71    CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
  72    represents 64 bits' worth of a charclass, where LO and HI are the
  73    low and high-order 32 bits of the 64-bit quantity.  */
  74 #if ULONG_MAX >> 31 >> 31 < 3
  75 enum { CHARCLASS_WORD_BITS = 32 };
  76 # define CHARCLASS_PAIR(lo, hi) lo, hi
  77 #else
  78 enum { CHARCLASS_WORD_BITS = 64 };
  79 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
  80 #endif
  81
  82 /* An initializer for a charclass whose 32-bit words are A through H.  */
  83 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
  84     {                                                   \
  85       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
  86       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
  87     }
  88
  89 /* The maximum useful value of a charclass_word; all used bits are 1.  */
  90 static charclass_word const CHARCLASS_WORD_MASK
  91   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
  92
  93 /* Number of words required to hold a bit for every character.  */
  94 enum
  95 {
  96   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
  97 };
  98
  99 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 100 typedef charclass_word charclass[CHARCLASS_WORDS];
 101
 102 /* Convert a possibly-signed character to an unsigned character.  This is
 103    a bit safer than casting to unsigned char, since it catches some type
 104    errors that the cast doesn't.  */
 105 static unsigned char
 106 to_uchar (char ch)
 107 {
 108   return ch;
 109 }
 110
 111 /* Contexts tell us whether a character is a newline or a word constituent.
 112    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 113    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 114    a particular character class.
 115
 116    A state also stores a context value, which is a bitmask of CTX_* values.
 117    A state's context represents a set of characters that the state's
 118    predecessors must match.  For example, a state whose context does not
 119    include CTX_LETTER will never have transitions where the previous
 120    character is a word constituent.  A state whose context is CTX_ANY
 121    might have transitions from any character.  */
 122
 123 #define CTX_NONE        1
 124 #define CTX_LETTER      2
 125 #define CTX_NEWLINE     4
 126 #define CTX_ANY         7
 127
 128 /* Sometimes characters can only be matched depending on the surrounding
 129    context.  Such context decisions depend on what the previous character
 130    was, and the value of the current (lookahead) character.  Context
 131    dependent constraints are encoded as 12-bit integers.  Each bit that
 132    is set indicates that the constraint succeeds in the corresponding
 133    context.
 134
 135    bit 8-11 - valid contexts when next character is CTX_NEWLINE
 136    bit 4-7  - valid contexts when next character is CTX_LETTER
 137    bit 0-3  - valid contexts when next character is CTX_NONE
 138
 139    The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
 140    succeeds in a particular context.  Prev is a bitmask of possible
 141    context values for the previous character, curr is the (single-bit)
 142    context value for the lookahead character.  */
 143 #define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
 144 #define LETTER_CONSTRAINT(constraint)  (((constraint) >> 4) & 0xf)
 145 #define OTHER_CONSTRAINT(constraint)    ((constraint)       & 0xf)
 146
 147 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
 148   ((((curr) & CTX_NONE      ? OTHER_CONSTRAINT (constraint) : 0) \
 149     | ((curr) & CTX_LETTER  ? LETTER_CONSTRAINT (constraint) : 0) \
 150     | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) \
 151    & (prev))
 152
 153 /* The following macros describe what a constraint depends on.  */
 154 #define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
 155 #define PREV_LETTER_CONSTRAINT(constraint)  (((constraint) >> 1) & 0x111)
 156 #define PREV_OTHER_CONSTRAINT(constraint)    ((constraint)       & 0x111)
 157
 158 #define PREV_NEWLINE_DEPENDENT(constraint) \
 159   (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
 160 #define PREV_LETTER_DEPENDENT(constraint) \
 161   (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
 162
 163 /* Tokens that match the empty string subject to some constraint actually
 164    work by applying that constraint to determine what may follow them,
 165    taking into account what has gone before.  The following values are
 166    the constraints corresponding to the special tokens previously defined.  */
 167 #define NO_CONSTRAINT         0x777
 168 #define BEGLINE_CONSTRAINT    0x444
 169 #define ENDLINE_CONSTRAINT    0x700
 170 #define BEGWORD_CONSTRAINT    0x050
 171 #define ENDWORD_CONSTRAINT    0x202
 172 #define LIMWORD_CONSTRAINT    0x252
 173 #define NOTLIMWORD_CONSTRAINT 0x525
 174
 175 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 176    are operators and others are terminal symbols.  Most (but not all) of these
 177    codes are returned by the lexical analyzer.  */
 178
 179 typedef ptrdiff_t token;
 180
 181 /* States are indexed by state_num values.  These are normally
 182    nonnegative but -1 is used as a special value.  */
 183 typedef ptrdiff_t state_num;
 184
 185 /* Predefined token values.  */
 186 enum
 187 {
 188   END = -1,                     /* END is a terminal symbol that matches the
 189                                    end of input; any value of END or less in
 190                                    the parse tree is such a symbol.  Accepting
 191                                    states of the DFA are those that would have
 192                                    a transition on END.  */
 193
 194   /* Ordinary character values are terminal symbols that match themselves.  */
 195
 196   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 197                                    the empty string.  */
 198
 199   BACKREF,                      /* BACKREF is generated by \<digit>
 200                                    or by any other construct that
 201                                    is not completely handled.  If the scanner
 202                                    detects a transition on backref, it returns
 203                                    a kind of "semi-success" indicating that
 204                                    the match will have to be verified with
 205                                    a backtracking matcher.  */
 206
 207   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 208                                    the empty string at the beginning of a
 209                                    line.  */
 210
 211   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 212                                    the empty string at the end of a line.  */
 213
 214   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 215                                    the empty string at the beginning of a
 216                                    word.  */
 217
 218   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 219                                    the empty string at the end of a word.  */
 220
 221   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 222                                    the empty string at the beginning or the
 223                                    end of a word.  */
 224
 225   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 226                                    matches the empty string not at
 227                                    the beginning or end of a word.  */
 228
 229   QMARK,                        /* QMARK is an operator of one argument that
 230                                    matches zero or one occurrences of its
 231                                    argument.  */
 232
 233   STAR,                         /* STAR is an operator of one argument that
 234                                    matches the Kleene closure (zero or more
 235                                    occurrences) of its argument.  */
 236
 237   PLUS,                         /* PLUS is an operator of one argument that
 238                                    matches the positive closure (one or more
 239                                    occurrences) of its argument.  */
 240
 241   REPMN,                        /* REPMN is a lexical token corresponding
 242                                    to the {m,n} construct.  REPMN never
 243                                    appears in the compiled token vector.  */
 244
 245   CAT,                          /* CAT is an operator of two arguments that
 246                                    matches the concatenation of its
 247                                    arguments.  CAT is never returned by the
 248                                    lexical analyzer.  */
 249
 250   OR,                           /* OR is an operator of two arguments that
 251                                    matches either of its arguments.  */
 252
 253   LPAREN,                       /* LPAREN never appears in the parse tree,
 254                                    it is only a lexeme.  */
 255
 256   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 257
 258   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 259                                    a valid multibyte (or single byte) character.
 260                                    It is used only if MB_CUR_MAX > 1.  */
 261
 262   MBCSET,                       /* MBCSET is similar to CSET, but for
 263                                    multibyte characters.  */
 264
 265   WCHAR,                        /* Only returned by lex.  wctok contains
 266                                    the wide character representation.  */
 267
 268   CSET                          /* CSET and (and any value greater) is a
 269                                    terminal symbol that matches any of a
 270                                    class of characters.  */
 271 };
 272
 273
 274 /* States of the recognizer correspond to sets of positions in the parse
 275    tree, together with the constraints under which they may be matched.
 276    So a position is encoded as an index into the parse tree together with
 277    a constraint.  */
 278 typedef struct
 279 {
 280   size_t index;                 /* Index into the parse array.  */
 281   unsigned int constraint;      /* Constraint for matching this position.  */
 282 } position;
 283
 284 /* Sets of positions are stored as arrays.  */
 285 typedef struct
 286 {
 287   position *elems;              /* Elements of this position set.  */
 288   size_t nelem;                 /* Number of elements in this set.  */
 289   size_t alloc;                 /* Number of elements allocated in ELEMS.  */
 290 } position_set;
 291
 292 /* Sets of leaves are also stored as arrays.  */
 293 typedef struct
 294 {
 295   size_t *elems;                /* Elements of this position set.  */
 296   size_t nelem;                 /* Number of elements in this set.  */
 297 } leaf_set;
 298
 299 /* A state of the dfa consists of a set of positions, some flags,
 300    and the token value of the lowest-numbered position of the state that
 301    contains an END token.  */
 302 typedef struct
 303 {
 304   size_t hash;                  /* Hash of the positions of this state.  */
 305   position_set elems;           /* Positions this state could match.  */
 306   unsigned char context;        /* Context from previous state.  */
 307   unsigned short constraint;    /* Constraint for this state to accept.  */
 308   token first_end;              /* Token value of the first END in elems.  */
 309   position_set mbps;            /* Positions which can match multibyte
 310                                    characters or the follows, e.g., period.
 311                                    Used only if MB_CUR_MAX > 1.  */
 312   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 313                                    negative if the state does not have
 314                                    ANYCHAR.  */
 315 } dfa_state;
 316
 317 /* Maximum for any transition table count that exceeds min_trcount.  */
 318 enum { MAX_TRCOUNT = 1024 };
 319
 320 /* A bracket operator.
 321    e.g., [a-c], [[:alpha:]], etc.  */
 322 struct mb_char_classes
 323 {
 324   ptrdiff_t cset;
 325   bool invert;
 326   wchar_t *chars;               /* Normal characters.  */
 327   size_t nchars;
 328 };
 329
 330 struct regex_syntax
 331 {
 332   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 333   reg_syntax_t syntax_bits;
 334   bool syntax_bits_set;
 335
 336   /* Flag for case-folding letters into sets.  */
 337   bool case_fold;
 338
 339   /* True if ^ and $ match only the start and end of data, and do not match
 340      end-of-line within data.  */
 341   bool anchor;
 342
 343   /* End-of-line byte in data.  */
 344   unsigned char eolbyte;
 345
 346   /* Cache of char-context values.  */
 347   int sbit[NOTCHAR];
 348
 349   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 350      multibyte character.  */
 351   bool never_trail[NOTCHAR];
 352
 353   /* Set of characters considered letters.  */
 354   charclass letters;
 355
 356   /* Set of characters that are newline.  */
 357   charclass newline;
 358 };
 359
 360 /* Lexical analyzer.  All the dross that deals with the obnoxious
 361    GNU Regex syntax bits is located here.  The poor, suffering
 362    reader is referred to the GNU Regex documentation for the
 363    meaning of the @#%!@#%^!@ syntax bits.  */
 364 struct lexer_state
 365 {
 366   char const *ptr;      /* Pointer to next input character.  */
 367   size_t left;          /* Number of characters remaining.  */
 368   token lasttok;        /* Previous token returned; initially END.  */
 369   size_t parens;        /* Count of outstanding left parens.  */
 370   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 371
 372   /* Wide character representation of the current multibyte character,
 373      or WEOF if there was an encoding error.  Used only if
 374      MB_CUR_MAX > 1.  */
 375   wint_t wctok;
 376
 377   /* Length of the multibyte representation of wctok.  */
 378   int cur_mb_len;
 379
 380   /* We're separated from beginning or (, | only by zero-width characters.  */
 381   bool laststart;
 382 };
 383
 384 /* Recursive descent parser for regular expressions.  */
 385
 386 struct parser_state
 387 {
 388   token tok;               /* Lookahead token.  */
 389   size_t depth;            /* Current depth of a hypothetical stack
 390                               holding deferred productions.  This is
 391                               used to determine the depth that will be
 392                               required of the real stack later on in
 393                               dfaanalyze.  */
 394 };
 395
 396 /* A compiled regular expression.  */
 397 struct dfa
 398 {
 399   /* Syntax configuration */
 400   struct regex_syntax syntax;
 401
 402   /* Fields filled by the scanner.  */
 403   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 404   size_t cindex;                /* Index for adding new charclasses.  */
 405   size_t calloc;                /* Number of charclasses allocated.  */
 406   size_t canychar;              /* Index of anychar class, or (size_t) -1.  */
 407
 408   /* Scanner state */
 409   struct lexer_state lex;
 410
 411   /* Parser state */
 412   struct parser_state parse;
 413
 414   /* Fields filled by the parser.  */
 415   token *tokens;                /* Postfix parse array.  */
 416   size_t tindex;                /* Index for adding new tokens.  */
 417   size_t talloc;                /* Number of tokens currently allocated.  */
 418   size_t depth;                 /* Depth required of an evaluation stack
 419                                    used for depth-first traversal of the
 420                                    parse tree.  */
 421   size_t nleaves;               /* Number of leaves on the parse tree.  */
 422   size_t nregexps;              /* Count of parallel regexps being built
 423                                    with dfaparse.  */
 424   bool fast;                    /* The DFA is fast.  */
 425   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
 426   mbstate_t mbs;                /* Multibyte conversion state.  */
 427
 428   /* The following are valid only if MB_CUR_MAX > 1.  */
 429
 430   /* The value of multibyte_prop[i] is defined by following rule.
 431      if tokens[i] < NOTCHAR
 432      bit 0 : tokens[i] is the first byte of a character, including
 433      single-byte characters.
 434      bit 1 : tokens[i] is the last byte of a character, including
 435      single-byte characters.
 436
 437      if tokens[i] = MBCSET
 438      ("the index of mbcsets corresponding to this operator" << 2) + 3
 439
 440      e.g.
 441      tokens
 442      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 443      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 444      multibyte_prop
 445      = 3     , 1               ,  0              ,  2              , 3
 446    */
 447   int *multibyte_prop;
 448
 449   /* Array of the bracket expression in the DFA.  */
 450   struct mb_char_classes *mbcsets;
 451   size_t nmbcsets;
 452   size_t mbcsets_alloc;
 453
 454   /* Fields filled by the superset.  */
 455   struct dfa *superset;             /* Hint of the dfa.  */
 456
 457   /* Fields filled by the state builder.  */
 458   dfa_state *states;            /* States of the dfa.  */
 459   state_num sindex;             /* Index for adding new states.  */
 460   size_t salloc;                /* Number of states currently allocated.  */
 461
 462   /* Fields filled by the parse tree->NFA conversion.  */
 463   position_set *follows;        /* Array of follow sets, indexed by position
 464                                    index.  The follow of a position is the set
 465                                    of positions containing characters that
 466                                    could conceivably follow a character
 467                                    matching the given position in a string
 468                                    matching the regexp.  Allocated to the
 469                                    maximum possible position index.  */
 470   bool searchflag;              /* We are supposed to build a searching
 471                                    as opposed to an exact matcher.  A searching
 472                                    matcher finds the first and shortest string
 473                                    matching a regexp anywhere in the buffer,
 474                                    whereas an exact matcher finds the longest
 475                                    string matching, but anchored to the
 476                                    beginning of the buffer.  */
 477
 478   /* Fields filled by dfaexec.  */
 479   state_num tralloc;            /* Number of transition tables that have
 480                                    slots so far, not counting trans[-1].  */
 481   int trcount;                  /* Number of transition tables that have
 482                                    actually been built.  */
 483   int min_trcount;              /* Minimum of number of transition tables.
 484                                    Always keep the number, even after freeing
 485                                    the transition tables.  It is also the
 486                                    number of initial states.  */
 487   state_num **trans;            /* Transition tables for states that can
 488                                    never accept.  If the transitions for a
 489                                    state have not yet been computed, or the
 490                                    state could possibly accept, its entry in
 491                                    this table is NULL.  This points to one
 492                                    past the start of the allocated array,
 493                                    and trans[-1] is always NULL.  */
 494   state_num **fails;            /* Transition tables after failing to accept
 495                                    on a state that potentially could do so.  */
 496   int *success;                 /* Table of acceptance conditions used in
 497                                    dfaexec and computed in build_state.  */
 498   state_num *newlines;          /* Transitions on newlines.  The entry for a
 499                                    newline in any transition table is always
 500                                    -1 so we can count lines without wasting
 501                                    too many cycles.  The transition for a
 502                                    newline is stored separately and handled
 503                                    as a special case.  Newline is also used
 504                                    as a sentinel at the end of the buffer.  */
 505   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 506                                    context in multibyte locales, in which we
 507                                    do not distinguish between their contexts,
 508                                    as not supported word.  */
 509   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 510   state_num **mb_trans;      /* Transition tables for states with ANYCHAR.  */
 511   state_num mb_trcount;         /* Number of transition tables for states with
 512                                    ANYCHAR that have actually been built.  */
 513
 514   /* Information derived from the locale.  This is at the end so that
 515      a quick memset need not clear it specially.  */
 516
 517   /* dfaexec implementation.  */
 518   char *(*dfaexec) (struct dfa *, char const *, char *,
 519                     bool, size_t *, bool *);
 520
 521   /* The locale is simple, like the C locale.  These locales can be
 522      processed more efficiently, e.g., the relationship between lower-
 523      and upper-case letters is 1-1.  */
 524   bool simple_locale;
 525
 526   /* Other cached information derived from the locale.  */
 527   struct localeinfo localeinfo;
 528 };
 529
 530 /* Some macros for user access to dfa internals.  */
 531
 532 /* S could possibly be an accepting state of R.  */
 533 #define ACCEPTING(s, r) ((r).states[s].constraint)
 534
 535 /* STATE accepts in the specified context.  */
 536 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
 537   SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
 538
 539 static void regexp (struct dfa *dfa);
 540
 541 /* Store into *PWC the result of converting the leading bytes of the
 542    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 543    and updating the conversion state in *D.  On conversion error,
 544    convert just a single byte, to WEOF.  Return the number of bytes
 545    converted.
 546
 547    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 548
 549    * PWC points to wint_t, not to wchar_t.
 550    * The last arg is a dfa *D instead of merely a multibyte conversion
 551      state D->mbs.
 552    * N must be at least 1.
 553    * S[N - 1] must be a sentinel byte.
 554    * Shift encodings are not supported.
 555    * The return value is always in the range 1..N.
 556    * D->mbs is always valid afterwards.
 557    * *PWC is always set to something.  */
 558 static size_t
 559 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 560 {
 561   unsigned char uc = s[0];
 562   wint_t wc = d->localeinfo.sbctowc[uc];
 563
 564   if (wc == WEOF)
 565     {
 566       wchar_t wch;
 567       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 568       if (0 < nbytes && nbytes < (size_t) -2)
 569         {
 570           *pwc = wch;
 571           return nbytes;
 572         }
 573       memset (&d->mbs, 0, sizeof d->mbs);
 574     }
 575
 576   *pwc = wc;
 577   return 1;
 578 }
 579
 580 #ifdef DEBUG
 581
 582 static void
 583 prtok (token t)
 584 {
 585   char const *s;
 586
 587   if (t < 0)
 588     fprintf (stderr, "END");
 589   else if (t < NOTCHAR)
 590     {
 591       unsigned int ch = t;
 592       fprintf (stderr, "0x%02x", ch);
 593     }
 594   else
 595     {
 596       switch (t)
 597         {
 598         case EMPTY:
 599           s = "EMPTY";
 600           break;
 601         case BACKREF:
 602           s = "BACKREF";
 603           break;
 604         case BEGLINE:
 605           s = "BEGLINE";
 606           break;
 607         case ENDLINE:
 608           s = "ENDLINE";
 609           break;
 610         case BEGWORD:
 611           s = "BEGWORD";
 612           break;
 613         case ENDWORD:
 614           s = "ENDWORD";
 615           break;
 616         case LIMWORD:
 617           s = "LIMWORD";
 618           break;
 619         case NOTLIMWORD:
 620           s = "NOTLIMWORD";
 621           break;
 622         case QMARK:
 623           s = "QMARK";
 624           break;
 625         case STAR:
 626           s = "STAR";
 627           break;
 628         case PLUS:
 629           s = "PLUS";
 630           break;
 631         case CAT:
 632           s = "CAT";
 633           break;
 634         case OR:
 635           s = "OR";
 636           break;
 637         case LPAREN:
 638           s = "LPAREN";
 639           break;
 640         case RPAREN:
 641           s = "RPAREN";
 642           break;
 643         case ANYCHAR:
 644           s = "ANYCHAR";
 645           break;
 646         case MBCSET:
 647           s = "MBCSET";
 648           break;
 649         default:
 650           s = "CSET";
 651           break;
 652         }
 653       fprintf (stderr, "%s", s);
 654     }
 655 }
 656 #endif /* DEBUG */
 657
 658 /* Stuff pertaining to charclasses.  */
 659
 660 static bool
 661 tstbit (unsigned int b, charclass const c)
 662 {
 663   return c[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 664 }
 665
 666 static void
 667 setbit (unsigned int b, charclass c)
 668 {
 669   c[b / CHARCLASS_WORD_BITS] |= (charclass_word) 1 << b % CHARCLASS_WORD_BITS;
 670 }
 671
 672 static void
 673 clrbit (unsigned int b, charclass c)
 674 {
 675   c[b / CHARCLASS_WORD_BITS] &= ~((charclass_word) 1
 676                                   << b % CHARCLASS_WORD_BITS);
 677 }
 678
 679 static void
 680 copyset (charclass const src, charclass dst)
 681 {
 682   memcpy (dst, src, sizeof (charclass));
 683 }
 684
 685 static void
 686 zeroset (charclass s)
 687 {
 688   memset (s, 0, sizeof (charclass));
 689 }
 690
 691 static void
 692 notset (charclass s)
 693 {
 694   int i;
 695
 696   for (i = 0; i < CHARCLASS_WORDS; ++i)
 697     s[i] = CHARCLASS_WORD_MASK & ~s[i];
 698 }
 699
 700 static bool
 701 equal (charclass const s1, charclass const s2)
 702 {
 703   charclass_word w = 0;
 704   int i;
 705   for (i = 0; i < CHARCLASS_WORDS; i++)
 706     w |= s1[i] ^ s2[i];
 707   return w == 0;
 708 }
 709
 710 static bool
 711 emptyset (charclass const s)
 712 {
 713   charclass_word w = 0;
 714   int i;
 715   for (i = 0; i < CHARCLASS_WORDS; i++)
 716     w |= s[i];
 717   return w == 0;
 718 }
 719
 720 /* Ensure that the array addressed by PTR holds at least NITEMS +
 721    (PTR || !NITEMS) items.  Either return PTR, or reallocate the array
 722    and return its new address.  Although PTR may be null, the returned
 723    value is never null.
 724
 725    The array holds *NALLOC items; *NALLOC is updated on reallocation.
 726    ITEMSIZE is the size of one item.  Avoid O(N**2) behavior on arrays
 727    growing linearly.  */
 728 static void *
 729 maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
 730 {
 731   if (nitems < *nalloc)
 732     return ptr;
 733   *nalloc = nitems;
 734   return x2nrealloc (ptr, nalloc, itemsize);
 735 }
 736
 737 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 738 static size_t
 739 charclass_index (struct dfa *d, charclass const s)
 740 {
 741   size_t i;
 742
 743   for (i = 0; i < d->cindex; ++i)
 744     if (equal (s, d->charclasses[i]))
 745       return i;
 746   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 747                                   sizeof *d->charclasses);
 748   ++d->cindex;
 749   copyset (s, d->charclasses[i]);
 750   return i;
 751 }
 752
 753 static bool
 754 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 755 {
 756   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 757 }
 758
 759 static int
 760 char_context (struct dfa const *dfa, unsigned char c)
 761 {
 762   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 763     return CTX_NEWLINE;
 764   if (unibyte_word_constituent (dfa, c))
 765     return CTX_LETTER;
 766   return CTX_NONE;
 767 }
 768
 769 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 770    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 771    this may happen when folding case in weird Turkish locales where
 772    dotless i/dotted I are not included in the chosen character set.
 773    Return whether a bit was set in the charclass.  */
 774 static bool
 775 setbit_wc (wint_t wc, charclass c)
 776 {
 777   int b = wctob (wc);
 778   if (b == EOF)
 779     return false;
 780
 781   setbit (b, c);
 782   return true;
 783 }
 784
 785 /* Set a bit for B and its case variants in the charclass C.
 786    MB_CUR_MAX must be 1.  */
 787 static void
 788 setbit_case_fold_c (int b, charclass c)
 789 {
 790   int ub = toupper (b);
 791   int i;
 792   for (i = 0; i < NOTCHAR; i++)
 793     if (toupper (i) == ub)
 794       setbit (i, c);
 795 }
 796
 797 /* Return true if the locale compatible with the C locale.  */
 798
 799 static bool
 800 using_simple_locale (bool multibyte)
 801 {
 802   /* The native character set is known to be compatible with
 803      the C locale.  The following test isn't perfect, but it's good
 804      enough in practice, as only ASCII and EBCDIC are in common use
 805      and this test correctly accepts ASCII and rejects EBCDIC.  */
 806   enum { native_c_charset =
 807     ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
 808      && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
 809      && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
 810      && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
 811      && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
 812      && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
 813      && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
 814      && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
 815      && '}' == 125 && '~' == 126)
 816   };
 817
 818   if (native_c_charset && !multibyte)
 819     return true;
 820   else
 821     {
 822       /* Treat C and POSIX locales as being compatible.  Also, treat
 823          errors as compatible, as these are invariably from stubs.  */
 824       char const *loc = setlocale (LC_ALL, NULL);
 825       return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
 826     }
 827 }
 828
 829 /* Fetch the next lexical input character.  Set C (of type int) to the
 830    next input byte, except set C to EOF if the input is a multibyte
 831    character of length greater than 1.  Set WC (of type wint_t) to the
 832    value of the input if it is a valid multibyte character (possibly
 833    of length 1); otherwise set WC to WEOF.  If there is no more input,
 834    report EOFERR if EOFERR is not null, and return lasttok = END
 835    otherwise.  */
 836 # define FETCH_WC(dfa, c, wc, eoferr)           \
 837   do {                                          \
 838     if (! (dfa)->lex.left)                      \
 839       {                                         \
 840         if ((eoferr) != 0)                      \
 841           dfaerror (eoferr);                    \
 842         else                                    \
 843           return (dfa)->lex.lasttok = END;      \
 844       }                                         \
 845     else                                        \
 846       {                                         \
 847         wint_t _wc;                             \
 848         size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \
 849                                       (dfa)->lex.left, dfa); \
 850         (dfa)->lex.cur_mb_len = nbytes;         \
 851         (wc) = _wc;                             \
 852         (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \
 853         (dfa)->lex.ptr += nbytes;               \
 854         (dfa)->lex.left -= nbytes;              \
 855       }                                         \
 856   } while (false)
 857
 858 #ifndef MIN
 859 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 860 #endif
 861
 862 typedef int predicate (int);
 863
 864 /* The following list maps the names of the Posix named character classes
 865    to predicate functions that determine whether a given character is in
 866    the class.  The leading [ has already been eaten by the lexical
 867    analyzer.  */
 868 struct dfa_ctype
 869 {
 870   const char *name;
 871   predicate *func;
 872   bool single_byte_only;
 873 };
 874
 875 static const struct dfa_ctype prednames[] = {
 876   {"alpha", isalpha, false},
 877   {"upper", isupper, false},
 878   {"lower", islower, false},
 879   {"digit", isdigit, true},
 880   {"xdigit", isxdigit, false},
 881   {"space", isspace, false},
 882   {"punct", ispunct, false},
 883   {"alnum", isalnum, false},
 884   {"print", isprint, false},
 885   {"graph", isgraph, false},
 886   {"cntrl", iscntrl, false},
 887   {"blank", isblank, false},
 888   {NULL, NULL, false}
 889 };
 890
 891 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 892 find_pred (const char *str)
 893 {
 894   unsigned int i;
 895   for (i = 0; prednames[i].name; ++i)
 896     if (STREQ (str, prednames[i].name))
 897       return &prednames[i];
 898   return NULL;
 899 }
 900
 901 /* Multibyte character handling sub-routine for lex.
 902    Parse a bracket expression and build a struct mb_char_classes.  */
 903 static token
 904 parse_bracket_exp (struct dfa *dfa)
 905 {
 906   bool invert;
 907   int c, c1, c2;
 908   charclass ccl;
 909
 910   /* This is a bracket expression that dfaexec is known to
 911      process correctly.  */
 912   bool known_bracket_exp = true;
 913
 914   /* Used to warn about [:space:].
 915      Bit 0 = first character is a colon.
 916      Bit 1 = last character is a colon.
 917      Bit 2 = includes any other character but a colon.
 918      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
 919   int colon_warning_state;
 920
 921   wint_t wc;
 922   wint_t wc2;
 923   wint_t wc1 = 0;
 924
 925   /* Work area to build a mb_char_classes.  */
 926   struct mb_char_classes *work_mbc;
 927   size_t chars_al;
 928
 929   chars_al = 0;
 930   if (dfa->localeinfo.multibyte)
 931     {
 932       dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
 933                                     &dfa->mbcsets_alloc,
 934                                     sizeof *dfa->mbcsets);
 935
 936       /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
 937          We will update dfa->multibyte_prop[] in addtok, because we can't
 938          decide the index in dfa->tokens[].  */
 939
 940       /* Initialize work area.  */
 941       work_mbc = &dfa->mbcsets[dfa->nmbcsets++];
 942       memset (work_mbc, 0, sizeof *work_mbc);
 943     }
 944   else
 945     work_mbc = NULL;
 946
 947   memset (ccl, 0, sizeof ccl);
 948   FETCH_WC (dfa, c, wc, _("unbalanced ["));
 949   if (c == '^')
 950     {
 951       FETCH_WC (dfa, c, wc, _("unbalanced ["));
 952       invert = true;
 953       known_bracket_exp = dfa->simple_locale;
 954     }
 955   else
 956     invert = false;
 957
 958   colon_warning_state = (c == ':');
 959   do
 960     {
 961       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
 962       colon_warning_state &= ~2;
 963
 964       /* Note that if we're looking at some other [:...:] construct,
 965          we just treat it as a bunch of ordinary characters.  We can do
 966          this because we assume regex has checked for syntax errors before
 967          dfa is ever called.  */
 968       if (c == '[')
 969         {
 970           FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
 971
 972           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
 973               || c1 == '.' || c1 == '=')
 974             {
 975               enum { MAX_BRACKET_STRING_LEN = 32 };
 976               char str[MAX_BRACKET_STRING_LEN + 1];
 977               size_t len = 0;
 978               for (;;)
 979                 {
 980                   FETCH_WC (dfa, c, wc, _("unbalanced ["));
 981                   if (dfa->lex.left == 0
 982                       || (c == c1 && dfa->lex.ptr[0] == ']'))
 983                     break;
 984                   if (len < MAX_BRACKET_STRING_LEN)
 985                     str[len++] = c;
 986                   else
 987                     /* This is in any case an invalid class name.  */
 988                     str[0] = '\0';
 989                 }
 990               str[len] = '\0';
 991
 992               /* Fetch bracket.  */
 993               FETCH_WC (dfa, c, wc, _("unbalanced ["));
 994               if (c1 == ':')
 995                 /* Build character class.  POSIX allows character
 996                    classes to match multicharacter collating elements,
 997                    but the regex code does not support that, so do not
 998                    worry about that possibility.  */
 999                 {
1000                   char const *class
1001                     = (dfa->syntax.case_fold && (STREQ (str, "upper")
1002                                                  || STREQ (str, "lower"))
1003                        ? "alpha" : str);
1004                   const struct dfa_ctype *pred = find_pred (class);
1005                   if (!pred)
1006                     dfaerror (_("invalid character class"));
1007
1008                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1009                     known_bracket_exp = false;
1010                   else
1011                     for (c2 = 0; c2 < NOTCHAR; ++c2)
1012                       if (pred->func (c2))
1013                         setbit (c2, ccl);
1014                 }
1015               else
1016                 known_bracket_exp = false;
1017
1018               colon_warning_state |= 8;
1019
1020               /* Fetch new lookahead character.  */
1021               FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
1022               continue;
1023             }
1024
1025           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1026              are already set up.  */
1027         }
1028
1029       if (c == '\\' && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1030         FETCH_WC (dfa, c, wc, _("unbalanced ["));
1031
1032       if (c1 == NOTCHAR)
1033         FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
1034
1035       if (c1 == '-')
1036         /* build range characters.  */
1037         {
1038           FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
1039
1040           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1041              Treat it like [-a[.aa.]] while parsing it, and
1042              remember that the set is unknown.  */
1043           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1044             {
1045               known_bracket_exp = false;
1046               c2 = ']';
1047             }
1048
1049           if (c2 == ']')
1050             {
1051               /* In the case [x-], the - is an ordinary hyphen,
1052                  which is left in c1, the lookahead character.  */
1053               dfa->lex.ptr -= dfa->lex.cur_mb_len;
1054               dfa->lex.left += dfa->lex.cur_mb_len;
1055             }
1056           else
1057             {
1058               if (c2 == '\\' && (dfa->syntax.syntax_bits
1059                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1060                 FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
1061
1062               colon_warning_state |= 8;
1063               FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
1064
1065               /* Treat [x-y] as a range if x != y.  */
1066               if (wc != wc2 || wc == WEOF)
1067                 {
1068                   if (dfa->localeinfo.multibyte)
1069                     known_bracket_exp = false;
1070                   else if (dfa->simple_locale)
1071                     {
1072                       int ci;
1073                       for (ci = c; ci <= c2; ci++)
1074                         setbit (ci, ccl);
1075                       if (dfa->syntax.case_fold)
1076                         {
1077                           int uc = toupper (c);
1078                           int uc2 = toupper (c2);
1079                           for (ci = 0; ci < NOTCHAR; ci++)
1080                             {
1081                               int uci = toupper (ci);
1082                               if (uc <= uci && uci <= uc2)
1083                                 setbit (ci, ccl);
1084                             }
1085                         }
1086                     }
1087                   else
1088                     known_bracket_exp = false;
1089
1090                   continue;
1091                 }
1092             }
1093         }
1094
1095       colon_warning_state |= (c == ':') ? 2 : 4;
1096
1097       if (!dfa->localeinfo.multibyte)
1098         {
1099           if (dfa->syntax.case_fold)
1100             setbit_case_fold_c (c, ccl);
1101           else
1102             setbit (c, ccl);
1103           continue;
1104         }
1105
1106       if (wc == WEOF)
1107         known_bracket_exp = false;
1108       else
1109         {
1110           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1111           unsigned int i;
1112           unsigned int n = (dfa->syntax.case_fold
1113                             ? case_folded_counterparts (wc, folded + 1) + 1
1114                             : 1);
1115           folded[0] = wc;
1116           for (i = 0; i < n; i++)
1117             if (!setbit_wc (folded[i], ccl))
1118               {
1119                 work_mbc->chars
1120                   = maybe_realloc (work_mbc->chars, work_mbc->nchars,
1121                                    &chars_al, sizeof *work_mbc->chars);
1122                 work_mbc->chars[work_mbc->nchars++] = folded[i];
1123               }
1124         }
1125     }
1126   while ((wc = wc1, (c = c1) != ']'));
1127
1128   if (colon_warning_state == 7)
1129     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1130
1131   if (! known_bracket_exp)
1132     return BACKREF;
1133
1134   if (dfa->localeinfo.multibyte)
1135     {
1136       work_mbc->invert = invert;
1137       work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
1138       return MBCSET;
1139     }
1140
1141   if (invert)
1142     {
1143       assert (!dfa->localeinfo.multibyte);
1144       notset (ccl);
1145       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1146         clrbit ('\n', ccl);
1147     }
1148
1149   return CSET + charclass_index (dfa, ccl);
1150 }
1151
1152 struct lexptr
1153 {
1154   char const *ptr;
1155   size_t left;
1156 };
1157
1158 static void
1159 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1160 {
1161   ls->ptr = dfa->lex.ptr;
1162   ls->left = dfa->lex.left;
1163   dfa->lex.ptr = s;
1164   dfa->lex.left = strlen (s);
1165 }
1166
1167 static void
1168 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1169 {
1170   dfa->lex.ptr = ls->ptr;
1171   dfa->lex.left = ls->left;
1172 }
1173
1174 static token
1175 lex (struct dfa *dfa)
1176 {
1177   int c, c2;
1178   bool backslash = false;
1179   charclass ccl;
1180   int i;
1181
1182   /* Basic plan: We fetch a character.  If it's a backslash,
1183      we set the backslash flag and go through the loop again.
1184      On the plus side, this avoids having a duplicate of the
1185      main switch inside the backslash case.  On the minus side,
1186      it means that just about every case begins with
1187      "if (backslash) ...".  */
1188   for (i = 0; i < 2; ++i)
1189     {
1190       FETCH_WC (dfa, c, dfa->lex.wctok, NULL);
1191
1192       switch (c)
1193         {
1194         case '\\':
1195           if (backslash)
1196             goto normal_char;
1197           if (dfa->lex.left == 0)
1198             dfaerror (_("unfinished \\ escape"));
1199           backslash = true;
1200           break;
1201
1202         case '^':
1203           if (backslash)
1204             goto normal_char;
1205           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1206               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1207               || dfa->lex.lasttok == OR)
1208             return dfa->lex.lasttok = BEGLINE;
1209           goto normal_char;
1210
1211         case '$':
1212           if (backslash)
1213             goto normal_char;
1214           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1215               || dfa->lex.left == 0
1216               || ((dfa->lex.left
1217                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1218                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1219                                    & (dfa->lex.ptr[0] == '\\')]
1220                       == ')'))
1221               || ((dfa->lex.left
1222                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1223                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1224                                    & (dfa->lex.ptr[0] == '\\')]
1225                       == '|'))
1226               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1227                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1228             return dfa->lex.lasttok = ENDLINE;
1229           goto normal_char;
1230
1231         case '1':
1232         case '2':
1233         case '3':
1234         case '4':
1235         case '5':
1236         case '6':
1237         case '7':
1238         case '8':
1239         case '9':
1240           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1241             {
1242               dfa->lex.laststart = false;
1243               return dfa->lex.lasttok = BACKREF;
1244             }
1245           goto normal_char;
1246
1247         case '`':
1248           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1249             {
1250               /* FIXME: should be beginning of string */
1251               return dfa->lex.lasttok = BEGLINE;
1252             }
1253           goto normal_char;
1254
1255         case '\'':
1256           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1257             {
1258               /* FIXME: should be end of string */
1259               return dfa->lex.lasttok = ENDLINE;
1260             }
1261           goto normal_char;
1262
1263         case '<':
1264           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1265             return dfa->lex.lasttok = BEGWORD;
1266           goto normal_char;
1267
1268         case '>':
1269           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1270             return dfa->lex.lasttok = ENDWORD;
1271           goto normal_char;
1272
1273         case 'b':
1274           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1275             return dfa->lex.lasttok = LIMWORD;
1276           goto normal_char;
1277
1278         case 'B':
1279           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1280             return dfa->lex.lasttok = NOTLIMWORD;
1281           goto normal_char;
1282
1283         case '?':
1284           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1285             goto normal_char;
1286           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1287             goto normal_char;
1288           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1289               && dfa->lex.laststart)
1290             goto normal_char;
1291           return dfa->lex.lasttok = QMARK;
1292
1293         case '*':
1294           if (backslash)
1295             goto normal_char;
1296           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1297               && dfa->lex.laststart)
1298             goto normal_char;
1299           return dfa->lex.lasttok = STAR;
1300
1301         case '+':
1302           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1303             goto normal_char;
1304           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1305             goto normal_char;
1306           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1307               && dfa->lex.laststart)
1308             goto normal_char;
1309           return dfa->lex.lasttok = PLUS;
1310
1311         case '{':
1312           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1313             goto normal_char;
1314           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1315             goto normal_char;
1316           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1317               && dfa->lex.laststart)
1318             goto normal_char;
1319
1320           /* Cases:
1321              {M} - exact count
1322              {M,} - minimum count, maximum is infinity
1323              {,N} - 0 through N
1324              {,} - 0 to infinity (same as '*')
1325              {M,N} - M through N */
1326           {
1327             char const *p = dfa->lex.ptr;
1328             char const *lim = p + dfa->lex.left;
1329             dfa->lex.minrep = dfa->lex.maxrep = -1;
1330             for (; p != lim && ISASCIIDIGIT (*p); p++)
1331               dfa->lex.minrep = (dfa->lex.minrep < 0
1332                                  ? *p - '0'
1333                                  : MIN (RE_DUP_MAX + 1,
1334                                         dfa->lex.minrep * 10 + *p - '0'));
1335             if (p != lim)
1336               {
1337                 if (*p != ',')
1338                   dfa->lex.maxrep = dfa->lex.minrep;
1339                 else
1340                   {
1341                     if (dfa->lex.minrep < 0)
1342                       dfa->lex.minrep = 0;
1343                     while (++p != lim && ISASCIIDIGIT (*p))
1344                       dfa->lex.maxrep
1345                         = (dfa->lex.maxrep < 0
1346                            ? *p - '0'
1347                            : MIN (RE_DUP_MAX + 1,
1348                                   dfa->lex.maxrep * 10 + *p - '0'));
1349                   }
1350               }
1351             if (! ((! backslash || (p != lim && *p++ == '\\'))
1352                    && p != lim && *p++ == '}'
1353                    && 0 <= dfa->lex.minrep
1354                    && (dfa->lex.maxrep < 0
1355                        || dfa->lex.minrep <= dfa->lex.maxrep)))
1356               {
1357                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1358                   goto normal_char;
1359                 dfaerror (_("invalid content of \\{\\}"));
1360               }
1361             if (RE_DUP_MAX < dfa->lex.maxrep)
1362               dfaerror (_("regular expression too big"));
1363             dfa->lex.ptr = p;
1364             dfa->lex.left = lim - p;
1365           }
1366           dfa->lex.laststart = false;
1367           return dfa->lex.lasttok = REPMN;
1368
1369         case '|':
1370           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1371             goto normal_char;
1372           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1373             goto normal_char;
1374           dfa->lex.laststart = true;
1375           return dfa->lex.lasttok = OR;
1376
1377         case '\n':
1378           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1379               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1380             goto normal_char;
1381           dfa->lex.laststart = true;
1382           return dfa->lex.lasttok = OR;
1383
1384         case '(':
1385           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1386             goto normal_char;
1387           dfa->lex.parens++;
1388           dfa->lex.laststart = true;
1389           return dfa->lex.lasttok = LPAREN;
1390
1391         case ')':
1392           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1393             goto normal_char;
1394           if (dfa->lex.parens == 0
1395               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1396             goto normal_char;
1397           dfa->lex.parens--;
1398           dfa->lex.laststart = false;
1399           return dfa->lex.lasttok = RPAREN;
1400
1401         case '.':
1402           if (backslash)
1403             goto normal_char;
1404           if (dfa->canychar == (size_t) -1)
1405             {
1406               zeroset (ccl);
1407               notset (ccl);
1408               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1409                 clrbit ('\n', ccl);
1410               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1411                 clrbit ('\0', ccl);
1412               if (dfa->localeinfo.multibyte)
1413                 for (c2 = 0; c2 < NOTCHAR; c2++)
1414                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1415                     clrbit (c2, ccl);
1416               dfa->canychar = charclass_index (dfa, ccl);
1417             }
1418           dfa->lex.laststart = false;
1419           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1420                                      ? ANYCHAR
1421                                      : CSET + dfa->canychar);
1422
1423         case 's':
1424         case 'S':
1425           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1426             goto normal_char;
1427           if (!dfa->localeinfo.multibyte)
1428             {
1429               zeroset (ccl);
1430               for (c2 = 0; c2 < NOTCHAR; ++c2)
1431                 if (isspace (c2))
1432                   setbit (c2, ccl);
1433               if (c == 'S')
1434                 notset (ccl);
1435               dfa->lex.laststart = false;
1436               return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
1437             }
1438
1439           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1440              add_utf8_anychar, makes sense.  */
1441
1442           /* \s and \S are documented to be equivalent to [[:space:]] and
1443              [^[:space:]] respectively, so tell the lexer to process those
1444              strings, each minus its "already processed" '['.  */
1445           {
1446             struct lexptr ls;
1447             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1448             dfa->lex.lasttok = parse_bracket_exp (dfa);
1449             pop_lex_state (dfa, &ls);
1450           }
1451
1452           dfa->lex.laststart = false;
1453           return dfa->lex.lasttok;
1454
1455         case 'w':
1456         case 'W':
1457           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1458             goto normal_char;
1459
1460           if (!dfa->localeinfo.multibyte)
1461             {
1462               zeroset (ccl);
1463               for (c2 = 0; c2 < NOTCHAR; ++c2)
1464                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1465                   setbit (c2, ccl);
1466               if (c == 'W')
1467                 notset (ccl);
1468               dfa->lex.laststart = false;
1469               return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
1470             }
1471
1472           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1473              add_utf8_anychar, makes sense.  */
1474
1475           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1476              [^_[:alnum:]] respectively, so tell the lexer to process those
1477              strings, each minus its "already processed" '['.  */
1478           {
1479             struct lexptr ls;
1480             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1481             dfa->lex.lasttok = parse_bracket_exp (dfa);
1482             pop_lex_state (dfa, &ls);
1483           }
1484
1485           dfa->lex.laststart = false;
1486           return dfa->lex.lasttok;
1487
1488         case '[':
1489           if (backslash)
1490             goto normal_char;
1491           dfa->lex.laststart = false;
1492           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1493
1494         default:
1495         normal_char:
1496           dfa->lex.laststart = false;
1497           /* For multibyte character sets, folding is done in atom.  Always
1498              return WCHAR.  */
1499           if (dfa->localeinfo.multibyte)
1500             return dfa->lex.lasttok = WCHAR;
1501
1502           if (dfa->syntax.case_fold && isalpha (c))
1503             {
1504               zeroset (ccl);
1505               setbit_case_fold_c (c, ccl);
1506               return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
1507             }
1508
1509           return dfa->lex.lasttok = c;
1510         }
1511     }
1512
1513   /* The above loop should consume at most a backslash
1514      and some other character.  */
1515   abort ();
1516   return END;                   /* keeps pedantic compilers happy.  */
1517 }
1518
1519 static void
1520 addtok_mb (struct dfa *dfa, token t, int mbprop)
1521 {
1522   if (dfa->talloc == dfa->tindex)
1523     {
1524       dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
1525                                 sizeof *dfa->tokens);
1526       if (dfa->localeinfo.multibyte)
1527         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1528                                          sizeof *dfa->multibyte_prop);
1529     }
1530   if (dfa->localeinfo.multibyte)
1531     dfa->multibyte_prop[dfa->tindex] = mbprop;
1532   dfa->tokens[dfa->tindex++] = t;
1533
1534   switch (t)
1535     {
1536     case QMARK:
1537     case STAR:
1538     case PLUS:
1539       break;
1540
1541     case CAT:
1542     case OR:
1543       dfa->parse.depth--;
1544       break;
1545
1546     case BACKREF:
1547       dfa->fast = false;
1548       /* fallthrough */
1549     default:
1550       dfa->nleaves++;
1551       /* fallthrough */
1552     case EMPTY:
1553       dfa->parse.depth++;
1554       break;
1555     }
1556   if (dfa->parse.depth > dfa->depth)
1557     dfa->depth = dfa->parse.depth;
1558 }
1559
1560 static void addtok_wc (struct dfa *dfa, wint_t wc);
1561
1562 /* Add the given token to the parse tree, maintaining the depth count and
1563    updating the maximum depth if necessary.  */
1564 static void
1565 addtok (struct dfa *dfa, token t)
1566 {
1567   if (dfa->localeinfo.multibyte && t == MBCSET)
1568     {
1569       bool need_or = false;
1570       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1571       size_t i;
1572
1573       /* Extract wide characters into alternations for better performance.
1574          This does not require UTF-8.  */
1575       for (i = 0; i < work_mbc->nchars; i++)
1576         {
1577           addtok_wc (dfa, work_mbc->chars[i]);
1578           if (need_or)
1579             addtok (dfa, OR);
1580           need_or = true;
1581         }
1582       work_mbc->nchars = 0;
1583
1584       /* Characters have been handled above, so it is possible
1585          that the mbcset is empty now.  Do nothing in that case.  */
1586       if (work_mbc->cset != -1)
1587         {
1588           addtok (dfa, CSET + work_mbc->cset);
1589           if (need_or)
1590             addtok (dfa, OR);
1591         }
1592     }
1593   else
1594     {
1595       addtok_mb (dfa, t, 3);
1596     }
1597 }
1598
1599 /* We treat a multibyte character as a single atom, so that DFA
1600    can treat a multibyte character as a single expression.
1601
1602    e.g., we construct the following tree from "<mb1><mb2>".
1603    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1604    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1605 static void
1606 addtok_wc (struct dfa *dfa, wint_t wc)
1607 {
1608   unsigned char buf[MB_LEN_MAX];
1609   mbstate_t s = { 0 };
1610   int i;
1611   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1612
1613   if (stored_bytes != (size_t) -1)
1614     dfa->lex.cur_mb_len = stored_bytes;
1615   else
1616     {
1617       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1618          the addtok_mb call altogether can corrupt the heap.  */
1619       dfa->lex.cur_mb_len = 1;
1620       buf[0] = 0;
1621     }
1622
1623   addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
1624   for (i = 1; i < dfa->lex.cur_mb_len; i++)
1625     {
1626       addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
1627       addtok (dfa, CAT);
1628     }
1629 }
1630
1631 static void
1632 add_utf8_anychar (struct dfa *dfa)
1633 {
1634   static charclass const utf8_classes[5] = {
1635     /* 80-bf: non-leading bytes.  */
1636     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1637
1638     /* 00-7f: 1-byte sequence.  */
1639     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1640
1641     /* c2-df: 2-byte sequence.  */
1642     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1643
1644     /* e0-ef: 3-byte sequence.  */
1645     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1646
1647     /* f0-f7: 4-byte sequence.  */
1648     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1649   };
1650   const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1651   unsigned int i;
1652
1653   /* Define the five character classes that are needed below.  */
1654   if (dfa->utf8_anychar_classes[0] == 0)
1655     for (i = 0; i < n; i++)
1656       {
1657         charclass c;
1658         copyset (utf8_classes[i], c);
1659         if (i == 1)
1660           {
1661             if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1662               clrbit ('\n', c);
1663             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1664               clrbit ('\0', c);
1665           }
1666         dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
1667       }
1668
1669   /* A valid UTF-8 character is
1670
1671      ([0x00-0x7f]
1672      |[0xc2-0xdf][0x80-0xbf]
1673      |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1674      |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1675
1676      which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x00-0x7f]
1677      and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
1678      Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
1679   for (i = 1; i < n; i++)
1680     addtok (dfa, dfa->utf8_anychar_classes[i]);
1681   while (--i > 1)
1682     {
1683       addtok (dfa, dfa->utf8_anychar_classes[0]);
1684       addtok (dfa, CAT);
1685       addtok (dfa, OR);
1686     }
1687 }
1688
1689 /* The grammar understood by the parser is as follows.
1690
1691    regexp:
1692      regexp OR branch
1693      branch
1694
1695    branch:
1696      branch closure
1697      closure
1698
1699    closure:
1700      closure QMARK
1701      closure STAR
1702      closure PLUS
1703      closure REPMN
1704      atom
1705
1706    atom:
1707      <normal character>
1708      <multibyte character>
1709      ANYCHAR
1710      MBCSET
1711      CSET
1712      BACKREF
1713      BEGLINE
1714      ENDLINE
1715      BEGWORD
1716      ENDWORD
1717      LIMWORD
1718      NOTLIMWORD
1719      LPAREN regexp RPAREN
1720      <empty>
1721
1722    The parser builds a parse tree in postfix form in an array of tokens.  */
1723
1724 static void
1725 atom (struct dfa *dfa)
1726 {
1727   if (dfa->parse.tok == WCHAR)
1728     {
1729       if (dfa->lex.wctok == WEOF)
1730         addtok (dfa, BACKREF);
1731       else
1732         {
1733           addtok_wc (dfa, dfa->lex.wctok);
1734
1735           if (dfa->syntax.case_fold)
1736             {
1737               wchar_t folded[CASE_FOLDED_BUFSIZE];
1738               unsigned int i, n = case_folded_counterparts (dfa->lex.wctok,
1739                                                             folded);
1740               for (i = 0; i < n; i++)
1741                 {
1742                   addtok_wc (dfa, folded[i]);
1743                   addtok (dfa, OR);
1744                 }
1745             }
1746         }
1747
1748       dfa->parse.tok = lex (dfa);
1749     }
1750   else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1751     {
1752       /* For UTF-8 expand the period to a series of CSETs that define a valid
1753          UTF-8 character.  This avoids using the slow multibyte path.  I'm
1754          pretty sure it would be both profitable and correct to do it for
1755          any encoding; however, the optimization must be done manually as
1756          it is done above in add_utf8_anychar.  So, let's start with
1757          UTF-8: it is the most used, and the structure of the encoding
1758          makes the correctness more obvious.  */
1759       add_utf8_anychar (dfa);
1760       dfa->parse.tok = lex (dfa);
1761     }
1762   else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1763            || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
1764            || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1765            || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
1766            || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
1767            || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
1768     {
1769       addtok (dfa, dfa->parse.tok);
1770       dfa->parse.tok = lex (dfa);
1771     }
1772   else if (dfa->parse.tok == LPAREN)
1773     {
1774       dfa->parse.tok = lex (dfa);
1775       regexp (dfa);
1776       if (dfa->parse.tok != RPAREN)
1777         dfaerror (_("unbalanced ("));
1778       dfa->parse.tok = lex (dfa);
1779     }
1780   else
1781     addtok (dfa, EMPTY);
1782 }
1783
1784 /* Return the number of tokens in the given subexpression.  */
1785 static size_t _GL_ATTRIBUTE_PURE
1786 nsubtoks (struct dfa const *dfa, size_t tindex)
1787 {
1788   size_t ntoks1;
1789
1790   switch (dfa->tokens[tindex - 1])
1791     {
1792     default:
1793       return 1;
1794     case QMARK:
1795     case STAR:
1796     case PLUS:
1797       return 1 + nsubtoks (dfa, tindex - 1);
1798     case CAT:
1799     case OR:
1800       ntoks1 = nsubtoks (dfa, tindex - 1);
1801       return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1802     }
1803 }
1804
1805 /* Copy the given subexpression to the top of the tree.  */
1806 static void
1807 copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
1808 {
1809   size_t i;
1810
1811   if (dfa->localeinfo.multibyte)
1812     for (i = 0; i < ntokens; ++i)
1813       addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
1814   else
1815     for (i = 0; i < ntokens; ++i)
1816       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1817 }
1818
1819 static void
1820 closure (struct dfa *dfa)
1821 {
1822   int i;
1823   size_t tindex, ntokens;
1824
1825   atom (dfa);
1826   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1827          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1828     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1829       {
1830         ntokens = nsubtoks (dfa, dfa->tindex);
1831         tindex = dfa->tindex - ntokens;
1832         if (dfa->lex.maxrep < 0)
1833           addtok (dfa, PLUS);
1834         if (dfa->lex.minrep == 0)
1835           addtok (dfa, QMARK);
1836         for (i = 1; i < dfa->lex.minrep; i++)
1837           {
1838             copytoks (dfa, tindex, ntokens);
1839             addtok (dfa, CAT);
1840           }
1841         for (; i < dfa->lex.maxrep; i++)
1842           {
1843             copytoks (dfa, tindex, ntokens);
1844             addtok (dfa, QMARK);
1845             addtok (dfa, CAT);
1846           }
1847         dfa->parse.tok = lex (dfa);
1848       }
1849     else if (dfa->parse.tok == REPMN)
1850       {
1851         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1852         dfa->parse.tok = lex (dfa);
1853         closure (dfa);
1854       }
1855     else
1856       {
1857         addtok (dfa, dfa->parse.tok);
1858         dfa->parse.tok = lex (dfa);
1859       }
1860 }
1861
1862 static void
1863 branch (struct dfa* dfa)
1864 {
1865   closure (dfa);
1866   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1867          && dfa->parse.tok >= 0)
1868     {
1869       closure (dfa);
1870       addtok (dfa, CAT);
1871     }
1872 }
1873
1874 static void
1875 regexp (struct dfa *dfa)
1876 {
1877   branch (dfa);
1878   while (dfa->parse.tok == OR)
1879     {
1880       dfa->parse.tok = lex (dfa);
1881       branch (dfa);
1882       addtok (dfa, OR);
1883     }
1884 }
1885
1886 /* Main entry point for the parser.  S is a string to be parsed, len is the
1887    length of the string, so s can include NUL characters.  D is a pointer to
1888    the struct dfa to parse into.  */
1889 static void
1890 dfaparse (char const *s, size_t len, struct dfa *d)
1891 {
1892   d->lex.ptr = s;
1893   d->lex.left = len;
1894   d->lex.lasttok = END;
1895   d->lex.laststart = true;
1896   d->lex.parens = 0;
1897   if (d->localeinfo.multibyte)
1898     {
1899       d->lex.cur_mb_len = 0;
1900       memset (&d->mbs, 0, sizeof d->mbs);
1901     }
1902
1903   if (!d->syntax.syntax_bits_set)
1904     dfaerror (_("no syntax specified"));
1905
1906   d->parse.tok = lex (d);
1907   d->parse.depth = d->depth;
1908
1909   regexp (d);
1910
1911   if (d->parse.tok != END)
1912     dfaerror (_("unbalanced )"));
1913
1914   addtok (d, END - d->nregexps);
1915   addtok (d, CAT);
1916
1917   if (d->nregexps)
1918     addtok (d, OR);
1919
1920   ++d->nregexps;
1921 }
1922
1923 /* Some primitives for operating on sets of positions.  */
1924
1925 /* Copy one set to another.  */
1926 static void
1927 copy (position_set const *src, position_set *dst)
1928 {
1929   if (dst->alloc < src->nelem)
1930     {
1931       free (dst->elems);
1932       dst->alloc = src->nelem;
1933       dst->elems = x2nrealloc (NULL, &dst->alloc, sizeof *dst->elems);
1934     }
1935   memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
1936   dst->nelem = src->nelem;
1937 }
1938
1939 static void
1940 alloc_position_set (position_set *s, size_t size)
1941 {
1942   s->elems = xnmalloc (size, sizeof *s->elems);
1943   s->alloc = size;
1944   s->nelem = 0;
1945 }
1946
1947 /* Insert position P in set S.  S is maintained in sorted order on
1948    decreasing index.  If there is already an entry in S with P.index
1949    then merge (logically-OR) P's constraints into the one in S.
1950    S->elems must point to an array large enough to hold the resulting set.  */
1951 static void
1952 insert (position p, position_set *s)
1953 {
1954   size_t count = s->nelem;
1955   size_t lo = 0, hi = count;
1956   size_t i;
1957   while (lo < hi)
1958     {
1959       size_t mid = (lo + hi) >> 1;
1960       if (s->elems[mid].index > p.index)
1961         lo = mid + 1;
1962       else
1963         hi = mid;
1964     }
1965
1966   if (lo < count && p.index == s->elems[lo].index)
1967     {
1968       s->elems[lo].constraint |= p.constraint;
1969       return;
1970     }
1971
1972   s->elems = maybe_realloc (s->elems, count, &s->alloc, sizeof *s->elems);
1973   for (i = count; i > lo; i--)
1974     s->elems[i] = s->elems[i - 1];
1975   s->elems[lo] = p;
1976   ++s->nelem;
1977 }
1978
1979 /* Merge two sets of positions into a third.  The result is exactly as if
1980    the positions of both sets were inserted into an initially empty set.  */
1981 static void
1982 merge (position_set const *s1, position_set const *s2, position_set *m)
1983 {
1984   size_t i = 0, j = 0;
1985
1986   if (m->alloc < s1->nelem + s2->nelem)
1987     {
1988       free (m->elems);
1989       m->elems = maybe_realloc (NULL, s1->nelem + s2->nelem, &m->alloc,
1990                                 sizeof *m->elems);
1991     }
1992   m->nelem = 0;
1993   while (i < s1->nelem && j < s2->nelem)
1994     if (s1->elems[i].index > s2->elems[j].index)
1995       m->elems[m->nelem++] = s1->elems[i++];
1996     else if (s1->elems[i].index < s2->elems[j].index)
1997       m->elems[m->nelem++] = s2->elems[j++];
1998     else
1999       {
2000         m->elems[m->nelem] = s1->elems[i++];
2001         m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
2002       }
2003   while (i < s1->nelem)
2004     m->elems[m->nelem++] = s1->elems[i++];
2005   while (j < s2->nelem)
2006     m->elems[m->nelem++] = s2->elems[j++];
2007 }
2008
2009 /* Delete a position from a set.  */
2010 static void
2011 delete (position p, position_set *s)
2012 {
2013   size_t i;
2014
2015   for (i = 0; i < s->nelem; ++i)
2016     if (p.index == s->elems[i].index)
2017       break;
2018   if (i < s->nelem)
2019     for (--s->nelem; i < s->nelem; ++i)
2020       s->elems[i] = s->elems[i + 1];
2021 }
2022
2023 /* Find the index of the state corresponding to the given position set with
2024    the given preceding context, or create a new state if there is no such
2025    state.  Context tells whether we got here on a newline or letter.  */
2026 static state_num
2027 state_index (struct dfa *d, position_set const *s, int context)
2028 {
2029   size_t hash = 0;
2030   int constraint = 0;
2031   state_num i, j;
2032   token first_end = 0;
2033
2034   for (i = 0; i < s->nelem; ++i)
2035     hash ^= s->elems[i].index + s->elems[i].constraint;
2036
2037   /* Try to find a state that exactly matches the proposed one.  */
2038   for (i = 0; i < d->sindex; ++i)
2039     {
2040       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2041           || context != d->states[i].context)
2042         continue;
2043       for (j = 0; j < s->nelem; ++j)
2044         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2045             || s->elems[j].index != d->states[i].elems.elems[j].index)
2046           break;
2047       if (j == s->nelem)
2048         return i;
2049     }
2050
2051 #ifdef DEBUG
2052   fprintf (stderr, "new state %zd\n nextpos:", i);
2053   for (j = 0; j < s->nelem; ++j)
2054     {
2055       fprintf (stderr, " %zu:", s->elems[j].index);
2056       prtok (d->tokens[s->elems[j].index]);
2057     }
2058   fprintf (stderr, "\n context:");
2059   if (context ^ CTX_ANY)
2060     {
2061       if (context & CTX_NONE)
2062         fprintf (stderr, " CTX_NONE");
2063       if (context & CTX_LETTER)
2064         fprintf (stderr, " CTX_LETTER");
2065       if (context & CTX_NEWLINE)
2066         fprintf (stderr, " CTX_NEWLINE");
2067     }
2068   else
2069     fprintf (stderr, " CTX_ANY");
2070   fprintf (stderr, "\n");
2071 #endif
2072
2073   for (j = 0; j < s->nelem; ++j)
2074     {
2075       int c = s->elems[j].constraint;
2076       if (d->tokens[s->elems[j].index] < 0)
2077         {
2078           if (SUCCEEDS_IN_CONTEXT (c, context, CTX_ANY))
2079             constraint |= c;
2080           if (!first_end)
2081             first_end = d->tokens[s->elems[j].index];
2082         }
2083       else if (d->tokens[s->elems[j].index] == BACKREF)
2084         constraint = NO_CONSTRAINT;
2085     }
2086
2087
2088   /* Create a new state.  */
2089   d->states = maybe_realloc (d->states, d->sindex, &d->salloc,
2090                              sizeof *d->states);
2091   d->states[i].hash = hash;
2092   alloc_position_set (&d->states[i].elems, s->nelem);
2093   copy (s, &d->states[i].elems);
2094   d->states[i].context = context;
2095   d->states[i].constraint = constraint;
2096   d->states[i].first_end = first_end;
2097   d->states[i].mbps.nelem = 0;
2098   d->states[i].mbps.elems = NULL;
2099   d->states[i].mb_trindex = -1;
2100
2101   ++d->sindex;
2102
2103   return i;
2104 }
2105
2106 /* Find the epsilon closure of a set of positions.  If any position of the set
2107    contains a symbol that matches the empty string in some context, replace
2108    that position with the elements of its follow labeled with an appropriate
2109    constraint.  Repeat exhaustively until no funny positions are left.
2110    S->elems must be large enough to hold the result.  */
2111 static void
2112 epsclosure (position_set *s, struct dfa const *d, char *visited)
2113 {
2114   size_t i, j;
2115   position p, old;
2116   bool initialized = false;
2117
2118   for (i = 0; i < s->nelem; ++i)
2119     if (d->tokens[s->elems[i].index] >= NOTCHAR
2120         && d->tokens[s->elems[i].index] != BACKREF
2121         && d->tokens[s->elems[i].index] != ANYCHAR
2122         && d->tokens[s->elems[i].index] != MBCSET
2123         && d->tokens[s->elems[i].index] < CSET)
2124       {
2125         if (!initialized)
2126           {
2127             memset (visited, 0, d->tindex * sizeof (*visited));
2128             initialized = true;
2129           }
2130         old = s->elems[i];
2131         p.constraint = old.constraint;
2132         delete (s->elems[i], s);
2133         if (visited[old.index])
2134           {
2135             --i;
2136             continue;
2137           }
2138         visited[old.index] = 1;
2139         switch (d->tokens[old.index])
2140           {
2141           case BEGLINE:
2142             p.constraint &= BEGLINE_CONSTRAINT;
2143             break;
2144           case ENDLINE:
2145             p.constraint &= ENDLINE_CONSTRAINT;
2146             break;
2147           case BEGWORD:
2148             p.constraint &= BEGWORD_CONSTRAINT;
2149             break;
2150           case ENDWORD:
2151             p.constraint &= ENDWORD_CONSTRAINT;
2152             break;
2153           case LIMWORD:
2154             p.constraint &= LIMWORD_CONSTRAINT;
2155             break;
2156           case NOTLIMWORD:
2157             p.constraint &= NOTLIMWORD_CONSTRAINT;
2158             break;
2159           default:
2160             break;
2161           }
2162         for (j = 0; j < d->follows[old.index].nelem; ++j)
2163           {
2164             p.index = d->follows[old.index].elems[j].index;
2165             insert (p, s);
2166           }
2167         /* Force rescan to start at the beginning.  */
2168         i = -1;
2169       }
2170 }
2171
2172 /* Returns the set of contexts for which there is at least one
2173    character included in C.  */
2174
2175 static int
2176 charclass_context (struct dfa const *dfa, charclass c)
2177 {
2178   int context = 0;
2179   unsigned int j;
2180
2181   for (j = 0; j < CHARCLASS_WORDS; ++j)
2182     {
2183       if (c[j] & dfa->syntax.newline[j])
2184         context |= CTX_NEWLINE;
2185       if (c[j] & dfa->syntax.letters[j])
2186         context |= CTX_LETTER;
2187       if (c[j] & ~(dfa->syntax.letters[j] | dfa->syntax.newline[j]))
2188         context |= CTX_NONE;
2189     }
2190
2191   return context;
2192 }
2193
2194 /* Returns the contexts on which the position set S depends.  Each context
2195    in the set of returned contexts (let's call it SC) may have a different
2196    follow set than other contexts in SC, and also different from the
2197    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2198    in the complement set will have the same follow set.  */
2199
2200 static int _GL_ATTRIBUTE_PURE
2201 state_separate_contexts (position_set const *s)
2202 {
2203   int separate_contexts = 0;
2204   size_t j;
2205
2206   for (j = 0; j < s->nelem; ++j)
2207     {
2208       if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
2209         separate_contexts |= CTX_NEWLINE;
2210       if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
2211         separate_contexts |= CTX_LETTER;
2212     }
2213
2214   return separate_contexts;
2215 }
2216
2217
2218 /* Perform bottom-up analysis on the parse tree, computing various functions.
2219    Note that at this point, we're pretending constructs like \< are real
2220    characters rather than constraints on what can follow them.
2221
2222    Nullable:  A node is nullable if it is at the root of a regexp that can
2223    match the empty string.
2224    *  EMPTY leaves are nullable.
2225    * No other leaf is nullable.
2226    * A QMARK or STAR node is nullable.
2227    * A PLUS node is nullable if its argument is nullable.
2228    * A CAT node is nullable if both its arguments are nullable.
2229    * An OR node is nullable if either argument is nullable.
2230
2231    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2232    that could correspond to the first character of a string matching the
2233    regexp rooted at the given node.
2234    * EMPTY leaves have empty firstpos.
2235    * The firstpos of a nonempty leaf is that leaf itself.
2236    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2237      argument.
2238    * The firstpos of a CAT node is the firstpos of the left argument, union
2239      the firstpos of the right if the left argument is nullable.
2240    * The firstpos of an OR node is the union of firstpos of each argument.
2241
2242    Lastpos:  The lastpos of a node is the set of positions that could
2243    correspond to the last character of a string matching the regexp at
2244    the given node.
2245    * EMPTY leaves have empty lastpos.
2246    * The lastpos of a nonempty leaf is that leaf itself.
2247    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2248      argument.
2249    * The lastpos of a CAT node is the lastpos of its right argument, union
2250      the lastpos of the left if the right argument is nullable.
2251    * The lastpos of an OR node is the union of the lastpos of each argument.
2252
2253    Follow:  The follow of a position is the set of positions that could
2254    correspond to the character following a character matching the node in
2255    a string matching the regexp.  At this point we consider special symbols
2256    that match the empty string in some context to be just normal characters.
2257    Later, if we find that a special symbol is in a follow set, we will
2258    replace it with the elements of its follow, labeled with an appropriate
2259    constraint.
2260    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2261      the follow of every node in the lastpos.
2262    * Every node in the firstpos of the second argument of a CAT node is in
2263      the follow of every node in the lastpos of the first argument.
2264
2265    Because of the postfix representation of the parse tree, the depth-first
2266    analysis is conveniently done by a linear scan with the aid of a stack.
2267    Sets are stored as arrays of the elements, obeying a stack-like allocation
2268    scheme; the number of elements in each set deeper in the stack can be
2269    used to determine the address of a particular set's array.  */
2270 static void
2271 dfaanalyze (struct dfa *d, bool searchflag)
2272 {
2273   /* Array allocated to hold position sets.  */
2274   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2275   /* Firstpos and lastpos elements.  */
2276   position *firstpos = posalloc + d->nleaves;
2277   position *lastpos = firstpos + d->nleaves;
2278
2279   /* Stack for element counts and nullable flags.  */
2280   struct
2281   {
2282     /* Whether the entry is nullable.  */
2283     bool nullable;
2284
2285     /* Counts of firstpos and lastpos sets.  */
2286     size_t nfirstpos;
2287     size_t nlastpos;
2288   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2289
2290   position_set tmp;             /* Temporary set for merging sets.  */
2291   position_set merged;          /* Result of merging sets.  */
2292   int separate_contexts;        /* Context wanted by some position.  */
2293   size_t i, j;
2294   position *pos;
2295   char *visited = xnmalloc (d->tindex, sizeof *visited);
2296
2297 #ifdef DEBUG
2298   fprintf (stderr, "dfaanalyze:\n");
2299   for (i = 0; i < d->tindex; ++i)
2300     {
2301       fprintf (stderr, " %zu:", i);
2302       prtok (d->tokens[i]);
2303     }
2304   putc ('\n', stderr);
2305 #endif
2306
2307   d->searchflag = searchflag;
2308   alloc_position_set (&merged, d->nleaves);
2309   d->follows = xcalloc (d->tindex, sizeof *d->follows);
2310
2311   for (i = 0; i < d->tindex; ++i)
2312     {
2313       switch (d->tokens[i])
2314         {
2315         case EMPTY:
2316           /* The empty set is nullable.  */
2317           stk->nullable = true;
2318
2319           /* The firstpos and lastpos of the empty leaf are both empty.  */
2320           stk->nfirstpos = stk->nlastpos = 0;
2321           stk++;
2322           break;
2323
2324         case STAR:
2325         case PLUS:
2326           /* Every element in the firstpos of the argument is in the follow
2327              of every element in the lastpos.  */
2328           tmp.nelem = stk[-1].nfirstpos;
2329           tmp.elems = firstpos;
2330           pos = lastpos;
2331           for (j = 0; j < stk[-1].nlastpos; ++j)
2332             {
2333               merge (&tmp, &d->follows[pos[j].index], &merged);
2334               copy (&merged, &d->follows[pos[j].index]);
2335             }
2336           /* fallthrough */
2337
2338         case QMARK:
2339           /* A QMARK or STAR node is automatically nullable.  */
2340           if (d->tokens[i] != PLUS)
2341             stk[-1].nullable = true;
2342           break;
2343
2344         case CAT:
2345           /* Every element in the firstpos of the second argument is in the
2346              follow of every element in the lastpos of the first argument.  */
2347           tmp.nelem = stk[-1].nfirstpos;
2348           tmp.elems = firstpos;
2349           pos = lastpos + stk[-1].nlastpos;
2350           for (j = 0; j < stk[-2].nlastpos; ++j)
2351             {
2352               merge (&tmp, &d->follows[pos[j].index], &merged);
2353               copy (&merged, &d->follows[pos[j].index]);
2354             }
2355
2356           /* The firstpos of a CAT node is the firstpos of the first argument,
2357              union that of the second argument if the first is nullable.  */
2358           if (stk[-2].nullable)
2359             stk[-2].nfirstpos += stk[-1].nfirstpos;
2360           else
2361             firstpos += stk[-1].nfirstpos;
2362
2363           /* The lastpos of a CAT node is the lastpos of the second argument,
2364              union that of the first argument if the second is nullable.  */
2365           if (stk[-1].nullable)
2366             stk[-2].nlastpos += stk[-1].nlastpos;
2367           else
2368             {
2369               pos = lastpos + stk[-2].nlastpos;
2370               for (j = stk[-1].nlastpos; j-- > 0;)
2371                 pos[j] = lastpos[j];
2372               lastpos += stk[-2].nlastpos;
2373               stk[-2].nlastpos = stk[-1].nlastpos;
2374             }
2375
2376           /* A CAT node is nullable if both arguments are nullable.  */
2377           stk[-2].nullable &= stk[-1].nullable;
2378           stk--;
2379           break;
2380
2381         case OR:
2382           /* The firstpos is the union of the firstpos of each argument.  */
2383           stk[-2].nfirstpos += stk[-1].nfirstpos;
2384
2385           /* The lastpos is the union of the lastpos of each argument.  */
2386           stk[-2].nlastpos += stk[-1].nlastpos;
2387
2388           /* An OR node is nullable if either argument is nullable.  */
2389           stk[-2].nullable |= stk[-1].nullable;
2390           stk--;
2391           break;
2392
2393         default:
2394           /* Anything else is a nonempty position.  (Note that special
2395              constructs like \< are treated as nonempty strings here;
2396              an "epsilon closure" effectively makes them nullable later.
2397              Backreferences have to get a real position so we can detect
2398              transitions on them later.  But they are nullable.  */
2399           stk->nullable = d->tokens[i] == BACKREF;
2400
2401           /* This position is in its own firstpos and lastpos.  */
2402           stk->nfirstpos = stk->nlastpos = 1;
2403           stk++;
2404
2405           --firstpos, --lastpos;
2406           firstpos->index = lastpos->index = i;
2407           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2408
2409           /* Allocate the follow set for this position.  */
2410           alloc_position_set (&d->follows[i], 1);
2411           break;
2412         }
2413 #ifdef DEBUG
2414       /* ... balance the above nonsyntactic #ifdef goo...  */
2415       fprintf (stderr, "node %zu:", i);
2416       prtok (d->tokens[i]);
2417       putc ('\n', stderr);
2418       fprintf (stderr,
2419                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2420       fprintf (stderr, " firstpos:");
2421       for (j = stk[-1].nfirstpos; j-- > 0;)
2422         {
2423           fprintf (stderr, " %zu:", firstpos[j].index);
2424           prtok (d->tokens[firstpos[j].index]);
2425         }
2426       fprintf (stderr, "\n lastpos:");
2427       for (j = stk[-1].nlastpos; j-- > 0;)
2428         {
2429           fprintf (stderr, " %zu:", lastpos[j].index);
2430           prtok (d->tokens[lastpos[j].index]);
2431         }
2432       putc ('\n', stderr);
2433 #endif
2434     }
2435
2436   /* For each follow set that is the follow set of a real position, replace
2437      it with its epsilon closure.  */
2438   for (i = 0; i < d->tindex; ++i)
2439     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2440         || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2441         || d->tokens[i] >= CSET)
2442       {
2443 #ifdef DEBUG
2444         fprintf (stderr, "follows(%zu:", i);
2445         prtok (d->tokens[i]);
2446         fprintf (stderr, "):");
2447         for (j = d->follows[i].nelem; j-- > 0;)
2448           {
2449             fprintf (stderr, " %zu:", d->follows[i].elems[j].index);
2450             prtok (d->tokens[d->follows[i].elems[j].index]);
2451           }
2452         putc ('\n', stderr);
2453 #endif
2454         copy (&d->follows[i], &merged);
2455         epsclosure (&merged, d, visited);
2456         copy (&merged, &d->follows[i]);
2457       }
2458
2459   /* Get the epsilon closure of the firstpos of the regexp.  The result will
2460      be the set of positions of state 0.  */
2461   merged.nelem = 0;
2462   for (i = 0; i < stk[-1].nfirstpos; ++i)
2463     insert (firstpos[i], &merged);
2464   epsclosure (&merged, d, visited);
2465
2466   /* Build the initial state.  */
2467   separate_contexts = state_separate_contexts (&merged);
2468   if (separate_contexts & CTX_NEWLINE)
2469     state_index (d, &merged, CTX_NEWLINE);
2470   d->initstate_notbol = d->min_trcount
2471     = state_index (d, &merged, separate_contexts ^ CTX_ANY);
2472   if (separate_contexts & CTX_LETTER)
2473     d->min_trcount = state_index (d, &merged, CTX_LETTER);
2474   d->min_trcount++;
2475
2476   free (posalloc);
2477   free (stkalloc);
2478   free (merged.elems);
2479   free (visited);
2480 }
2481
2482
2483 /* Find, for each character, the transition out of state s of d, and store
2484    it in the appropriate slot of trans.
2485
2486    We divide the positions of s into groups (positions can appear in more
2487    than one group).  Each group is labeled with a set of characters that
2488    every position in the group matches (taking into account, if necessary,
2489    preceding context information of s).  For each group, find the union
2490    of the its elements' follows.  This set is the set of positions of the
2491    new state.  For each character in the group's label, set the transition
2492    on this character to be to a state corresponding to the set's positions,
2493    and its associated backward context information, if necessary.
2494
2495    If we are building a searching matcher, we include the positions of state
2496    0 in every state.
2497
2498    The collection of groups is constructed by building an equivalence-class
2499    partition of the positions of s.
2500
2501    For each position, find the set of characters C that it matches.  Eliminate
2502    any characters from C that fail on grounds of backward context.
2503
2504    Search through the groups, looking for a group whose label L has nonempty
2505    intersection with C.  If L - C is nonempty, create a new group labeled
2506    L - C and having the same positions as the current group, and set L to
2507    the intersection of L and C.  Insert the position in this group, set
2508    C = C - L, and resume scanning.
2509
2510    If after comparing with every group there are characters remaining in C,
2511    create a new group labeled with the characters of C and insert this
2512    position in that group.  */
2513 static void
2514 dfastate (state_num s, struct dfa *d, state_num trans[])
2515 {
2516   leaf_set grps[NOTCHAR];       /* As many as will ever be needed.  */
2517   charclass labels[NOTCHAR];    /* Labels corresponding to the groups.  */
2518   size_t ngrps = 0;             /* Number of groups actually used.  */
2519   position pos;                 /* Current position being considered.  */
2520   charclass matches;            /* Set of matching characters.  */
2521   charclass_word matchesf;      /* Nonzero if matches is nonempty.  */
2522   charclass intersect;          /* Intersection with some label set.  */
2523   charclass_word intersectf;    /* Nonzero if intersect is nonempty.  */
2524   charclass leftovers;          /* Stuff in the label that didn't match.  */
2525   charclass_word leftoversf;    /* Nonzero if leftovers is nonempty.  */
2526   position_set follows;         /* Union of the follows of some group.  */
2527   position_set tmp;             /* Temporary space for merging sets.  */
2528   int possible_contexts;        /* Contexts that this group can match.  */
2529   int separate_contexts;        /* Context that new state wants to know.  */
2530   state_num state;              /* New state.  */
2531   state_num state_newline;      /* New state on a newline transition.  */
2532   state_num state_letter;       /* New state on a letter transition.  */
2533   bool next_isnt_1st_byte = false; /* We can't add state0.  */
2534   size_t i, j, k;
2535
2536 #ifdef DEBUG
2537   fprintf (stderr, "build state %td\n", s);
2538 #endif
2539
2540   zeroset (matches);
2541
2542   for (i = 0; i < d->states[s].elems.nelem; ++i)
2543     {
2544       pos = d->states[s].elems.elems[i];
2545       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2546         setbit (d->tokens[pos.index], matches);
2547       else if (d->tokens[pos.index] >= CSET)
2548         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
2549       else if (d->tokens[pos.index] == ANYCHAR)
2550         {
2551           copyset (d->charclasses[d->canychar], matches);
2552
2553           /* ANYCHAR must match with a single character, so we must put
2554              it to D->states[s].mbps which contains the positions which
2555              can match with a single character not a byte.  If all
2556              positions which has ANYCHAR does not depend on context of
2557              next character, we put the follows instead of it to
2558              D->states[s].mbps to optimize.  */
2559           if (SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context,
2560                                    CTX_NONE))
2561             {
2562               if (d->states[s].mbps.nelem == 0)
2563                 alloc_position_set (&d->states[s].mbps,
2564                                     d->follows[pos.index].nelem);
2565               for (j = 0; j < d->follows[pos.index].nelem; j++)
2566                 insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
2567             }
2568         }
2569       else
2570         continue;
2571
2572       /* Some characters may need to be eliminated from matches because
2573          they fail in the current context.  */
2574       if (pos.constraint != NO_CONSTRAINT)
2575         {
2576           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2577                                     d->states[s].context, CTX_NEWLINE))
2578             for (j = 0; j < CHARCLASS_WORDS; ++j)
2579               matches[j] &= ~d->syntax.newline[j];
2580           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2581                                     d->states[s].context, CTX_LETTER))
2582             for (j = 0; j < CHARCLASS_WORDS; ++j)
2583               matches[j] &= ~d->syntax.letters[j];
2584           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2585                                     d->states[s].context, CTX_NONE))
2586             for (j = 0; j < CHARCLASS_WORDS; ++j)
2587               matches[j] &= d->syntax.letters[j] | d->syntax.newline[j];
2588
2589           /* If there are no characters left, there's no point in going on.  */
2590           for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
2591             continue;
2592           if (j == CHARCLASS_WORDS)
2593             continue;
2594         }
2595
2596 #ifdef DEBUG
2597       fprintf (stderr, " nextpos %zu:", pos.index);
2598       prtok (d->tokens[pos.index]);
2599       fprintf (stderr, " of");
2600       for (j = 0; j < NOTCHAR; j++)
2601         if (tstbit (j, matches))
2602           fprintf (stderr, " 0x%02zx", j);
2603       fprintf (stderr, "\n");
2604 #endif
2605
2606       for (j = 0; j < ngrps; ++j)
2607         {
2608           /* If matches contains a single character only, and the current
2609              group's label doesn't contain that character, go on to the
2610              next group.  */
2611           if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2612               && !tstbit (d->tokens[pos.index], labels[j]))
2613             continue;
2614
2615           /* Check if this group's label has a nonempty intersection with
2616              matches.  */
2617           intersectf = 0;
2618           for (k = 0; k < CHARCLASS_WORDS; ++k)
2619             intersectf |= intersect[k] = matches[k] & labels[j][k];
2620           if (!intersectf)
2621             continue;
2622
2623           /* It does; now find the set differences both ways.  */
2624           leftoversf = matchesf = 0;
2625           for (k = 0; k < CHARCLASS_WORDS; ++k)
2626             {
2627               /* Even an optimizing compiler can't know this for sure.  */
2628               charclass_word match = matches[k], label = labels[j][k];
2629
2630               leftoversf |= leftovers[k] = label & ~match;
2631               matchesf |= matches[k] = match & ~label;
2632             }
2633
2634           /* If there were leftovers, create a new group labeled with them.  */
2635           if (leftoversf)
2636             {
2637               copyset (leftovers, labels[ngrps]);
2638               copyset (intersect, labels[j]);
2639               grps[ngrps].elems = xnmalloc (d->nleaves,
2640                                             sizeof *grps[ngrps].elems);
2641               memcpy (grps[ngrps].elems, grps[j].elems,
2642                       sizeof (grps[j].elems[0]) * grps[j].nelem);
2643               grps[ngrps].nelem = grps[j].nelem;
2644               ++ngrps;
2645             }
2646
2647           /* Put the position in the current group.  The constraint is
2648              irrelevant here.  */
2649           grps[j].elems[grps[j].nelem++] = pos.index;
2650
2651           /* If every character matching the current position has been
2652              accounted for, we're done.  */
2653           if (!matchesf)
2654             break;
2655         }
2656
2657       /* If we've passed the last group, and there are still characters
2658          unaccounted for, then we'll have to create a new group.  */
2659       if (j == ngrps)
2660         {
2661           copyset (matches, labels[ngrps]);
2662           zeroset (matches);
2663           grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems);
2664           grps[ngrps].nelem = 1;
2665           grps[ngrps].elems[0] = pos.index;
2666           ++ngrps;
2667         }
2668     }
2669
2670   alloc_position_set (&follows, d->nleaves);
2671   alloc_position_set (&tmp, d->nleaves);
2672
2673   /* If we are a searching matcher, the default transition is to a state
2674      containing the positions of state 0, otherwise the default transition
2675      is to fail miserably.  */
2676   if (d->searchflag)
2677     {
2678       int c;
2679
2680       state_newline = 0;
2681       state_letter = d->min_trcount - 1;
2682       state = d->initstate_notbol;
2683
2684       for (c = 0; c < NOTCHAR; ++c)
2685         {
2686           switch (d->syntax.sbit[c])
2687             {
2688             case CTX_NEWLINE:
2689               trans[c] = state_newline;
2690               break;
2691             case CTX_LETTER:
2692               trans[c] = state_letter;
2693               break;
2694             default:
2695               trans[c] = state;
2696               break;
2697             }
2698         }
2699     }
2700   else
2701     for (i = 0; i < NOTCHAR; ++i)
2702       trans[i] = -1;
2703
2704   for (i = 0; i < ngrps; ++i)
2705     {
2706       follows.nelem = 0;
2707
2708       /* Find the union of the follows of the positions of the group.
2709          This is a hideously inefficient loop.  Fix it someday.  */
2710       for (j = 0; j < grps[i].nelem; ++j)
2711         for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2712           insert (d->follows[grps[i].elems[j]].elems[k], &follows);
2713
2714       if (d->localeinfo.multibyte)
2715         {
2716           /* If a token in follows.elems is not 1st byte of a multibyte
2717              character, or the states of follows must accept the bytes
2718              which are not 1st byte of the multibyte character.
2719              Then, if a state of follows encounter a byte, it must not be
2720              a 1st byte of a multibyte character nor single byte character.
2721              We cansel to add state[0].follows to next state, because
2722              state[0] must accept 1st-byte
2723
2724              For example, we assume <sb a> is a certain single byte
2725              character, <mb A> is a certain multibyte character, and the
2726              codepoint of <sb a> equals the 2nd byte of the codepoint of
2727              <mb A>.
2728              When state[0] accepts <sb a>, state[i] transit to state[i+1]
2729              by accepting accepts 1st byte of <mb A>, and state[i+1]
2730              accepts 2nd byte of <mb A>, if state[i+1] encounter the
2731              codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2732              <mb A>, so we cannot add state[0].  */
2733
2734           next_isnt_1st_byte = false;
2735           for (j = 0; j < follows.nelem; ++j)
2736             {
2737               if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2738                 {
2739                   next_isnt_1st_byte = true;
2740                   break;
2741                 }
2742             }
2743         }
2744
2745       /* If we are building a searching matcher, throw in the positions
2746          of state 0 as well.  */
2747       if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
2748         {
2749           merge (&d->states[0].elems, &follows, &tmp);
2750           copy (&tmp, &follows);
2751         }
2752
2753       /* Find out if the new state will want any context information.  */
2754       possible_contexts = charclass_context (d, labels[i]);
2755       separate_contexts = state_separate_contexts (&follows);
2756
2757       /* Find the state(s) corresponding to the union of the follows.  */
2758       if (possible_contexts & ~separate_contexts)
2759         state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2760       else
2761         state = -1;
2762       if (separate_contexts & possible_contexts & CTX_NEWLINE)
2763         state_newline = state_index (d, &follows, CTX_NEWLINE);
2764       else
2765         state_newline = state;
2766       if (separate_contexts & possible_contexts & CTX_LETTER)
2767         state_letter = state_index (d, &follows, CTX_LETTER);
2768       else
2769         state_letter = state;
2770
2771 #ifdef DEBUG
2772       fprintf (stderr, "group %zu\n nextpos:", i);
2773       for (j = 0; j < grps[i].nelem; ++j)
2774         {
2775           fprintf (stderr, " %zu:", grps[i].elems[j]);
2776           prtok (d->tokens[grps[i].elems[j]]);
2777         }
2778       fprintf (stderr, "\n follows:");
2779       for (j = 0; j < follows.nelem; ++j)
2780         {
2781           fprintf (stderr, " %zu:", follows.elems[j].index);
2782           prtok (d->tokens[follows.elems[j].index]);
2783         }
2784       fprintf (stderr, "\n states:");
2785       if (possible_contexts & CTX_NEWLINE)
2786         fprintf (stderr, " CTX_NEWLINE:%td", state_newline);
2787       if (possible_contexts & CTX_LETTER)
2788         fprintf (stderr, " CTX_LETTER:%td", state_letter);
2789       if (possible_contexts & CTX_NONE)
2790         fprintf (stderr, " CTX_NONE:%td", state);
2791       fprintf (stderr, "\n");
2792 #endif
2793
2794       /* Set the transitions for each character in the current label.  */
2795       for (j = 0; j < CHARCLASS_WORDS; ++j)
2796         for (k = 0; k < CHARCLASS_WORD_BITS; ++k)
2797           if (labels[i][j] >> k & 1)
2798             {
2799               int c = j * CHARCLASS_WORD_BITS + k;
2800
2801               switch (d->syntax.sbit[c])
2802                 {
2803                 case CTX_NEWLINE:
2804                   trans[c] = state_newline;
2805                   break;
2806                 case CTX_LETTER:
2807                   trans[c] = state_letter;
2808                   break;
2809                 default:
2810                   trans[c] = state;
2811                   break;
2812                 }
2813             }
2814     }
2815
2816 #ifdef DEBUG
2817   fprintf (stderr, "trans table %td", s);
2818   for (i = 0; i < NOTCHAR; ++i)
2819     {
2820       if (!(i & 0xf))
2821         fprintf (stderr, "\n");
2822       fprintf (stderr, " %2td", trans[i]);
2823     }
2824   fprintf (stderr, "\n");
2825 #endif
2826
2827   for (i = 0; i < ngrps; ++i)
2828     free (grps[i].elems);
2829   free (follows.elems);
2830   free (tmp.elems);
2831 }
2832
2833 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2834 static void
2835 realloc_trans_if_necessary (struct dfa *d, state_num new_state)
2836 {
2837   state_num oldalloc = d->tralloc;
2838   if (oldalloc <= new_state)
2839     {
2840       state_num **realtrans = d->trans ? d->trans - 1 : NULL;
2841       size_t newalloc, newalloc1;
2842       newalloc1 = new_state + 1;
2843       realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans);
2844       realtrans[0] = NULL;
2845       d->trans = realtrans + 1;
2846       d->tralloc = newalloc = newalloc1 - 1;
2847       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2848       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2849       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2850       if (d->localeinfo.multibyte)
2851         {
2852           realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
2853           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2854           if (oldalloc == 0)
2855             realtrans[0] = NULL;
2856           d->mb_trans = realtrans + 1;
2857         }
2858       for (; oldalloc < newalloc; oldalloc++)
2859         {
2860           d->trans[oldalloc] = NULL;
2861           d->fails[oldalloc] = NULL;
2862           if (d->localeinfo.multibyte)
2863             d->mb_trans[oldalloc] = NULL;
2864         }
2865     }
2866 }
2867
2868 /* Some routines for manipulating a compiled dfa's transition tables.
2869    Each state may or may not have a transition table; if it does, and it
2870    is a non-accepting state, then d->trans[state] points to its table.
2871    If it is an accepting state then d->fails[state] points to its table.
2872    If it has no table at all, then d->trans[state] is NULL.
2873    TODO: Improve this comment, get rid of the unnecessary redundancy.  */
2874
2875 static void
2876 build_state (state_num s, struct dfa *d)
2877 {
2878   state_num *trans;             /* The new transition table.  */
2879   state_num i, maxstate;
2880
2881   /* Set an upper limit on the number of transition tables that will ever
2882      exist at once.  MAX_TRCOUNT is arbitrary.  The idea is that the frequently
2883      used transition tables will be quickly rebuilt, whereas the ones that
2884      were only needed once or twice will be cleared away.  However, do not
2885      clear the initial D->min_trcount states, since they are always used.  */
2886   if (MAX_TRCOUNT <= d->trcount)
2887     {
2888       for (i = d->min_trcount; i < d->tralloc; ++i)
2889         {
2890           free (d->trans[i]);
2891           free (d->fails[i]);
2892           d->trans[i] = d->fails[i] = NULL;
2893         }
2894       d->trcount = d->min_trcount;
2895
2896       if (d->localeinfo.multibyte)
2897         {
2898           for (i = d->min_trcount; i < d->tralloc; i++)
2899             {
2900               free (d->mb_trans[i]);
2901               d->mb_trans[i] = NULL;
2902             }
2903           free (d->mb_trans[-1]);
2904           d->mb_trans[-1] = NULL;
2905         }
2906     }
2907
2908   ++d->trcount;
2909
2910   /* Set up the success bits for this state.  */
2911   d->success[s] = 0;
2912   if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
2913     d->success[s] |= CTX_NEWLINE;
2914   if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d))
2915     d->success[s] |= CTX_LETTER;
2916   if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
2917     d->success[s] |= CTX_NONE;
2918
2919   trans = xmalloc (NOTCHAR * sizeof *trans);
2920   dfastate (s, d, trans);
2921
2922   /* Now go through the new transition table, and make sure that the trans
2923      and fail arrays are allocated large enough to hold a pointer for the
2924      largest state mentioned in the table.  */
2925   maxstate = -1;
2926   for (i = 0; i < NOTCHAR; ++i)
2927     if (maxstate < trans[i])
2928       maxstate = trans[i];
2929   realloc_trans_if_necessary (d, maxstate);
2930
2931   /* Keep the newline transition in a special place so we can use it as
2932      a sentinel.  */
2933   d->newlines[s] = trans[d->syntax.eolbyte];
2934   trans[d->syntax.eolbyte] = -1;
2935
2936   if (ACCEPTING (s, *d))
2937     d->fails[s] = trans;
2938   else
2939     d->trans[s] = trans;
2940 }
2941
2942 /* Multibyte character handling sub-routines for dfaexec.  */
2943
2944 /* Consume a single byte and transit state from 's' to '*next_state'.
2945    This function is almost same as the state transition routin in dfaexec.
2946    But state transition is done just once, otherwise matching succeed or
2947    reach the end of the buffer.  */
2948 static state_num
2949 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
2950 {
2951   state_num *t;
2952
2953   if (d->trans[s])
2954     t = d->trans[s];
2955   else if (d->fails[s])
2956     t = d->fails[s];
2957   else
2958     {
2959       build_state (s, d);
2960       if (d->trans[s])
2961         t = d->trans[s];
2962       else
2963         {
2964           t = d->fails[s];
2965           assert (t);
2966         }
2967     }
2968
2969   return t[*(*pp)++];
2970 }
2971
2972 /* Transit state from s, then return new state and update the pointer of
2973    the buffer.  This function is for a period operator which can match a
2974    multi-byte character.  */
2975 static state_num
2976 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
2977                unsigned char const *end)
2978 {
2979   state_num s1, s2;
2980   wint_t wc;
2981   int separate_contexts;
2982   size_t i;
2983
2984   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
2985
2986   /* This state has some operators which can match a multibyte character.  */
2987   d->mb_follows.nelem = 0;
2988
2989   /* Calculate the state which can be reached from the state 's' by
2990      consuming 'mbclen' single bytes from the buffer.  */
2991   s1 = s;
2992   for (i = 0; i < mbclen && 0 <= s; i++)
2993     s = transit_state_singlebyte (d, s, pp);
2994   *pp += mbclen - i;
2995
2996   if (wc == WEOF)
2997     {
2998       /* It is an invalid character, so ANYCHAR is not accepted.  */
2999       return s;
3000     }
3001
3002   /* If all positions which have ANYCHAR do not depend on the context
3003      of the next character, calculate the next state with
3004      pre-calculated follows and cache the result.  */
3005   if (d->states[s1].mb_trindex < 0)
3006     {
3007       if (MAX_TRCOUNT <= d->mb_trcount)
3008         {
3009           state_num s3;
3010           for (s3 = -1; s3 < d->tralloc; s3++)
3011             {
3012               free (d->mb_trans[s3]);
3013               d->mb_trans[s3] = NULL;
3014             }
3015
3016           for (i = 0; i < d->sindex; i++)
3017             d->states[i].mb_trindex = -1;
3018           d->mb_trcount = 0;
3019         }
3020       d->states[s1].mb_trindex = d->mb_trcount++;
3021     }
3022
3023   if (! d->mb_trans[s])
3024     {
3025       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3026       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3027       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3028       for (i = 0; i < MAX_TRCOUNT; i++)
3029         d->mb_trans[s][i] = -1;
3030     }
3031   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3032     return d->mb_trans[s][d->states[s1].mb_trindex];
3033
3034   if (s < 0)
3035     copy (&d->states[s1].mbps, &d->mb_follows);
3036   else
3037     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3038
3039   separate_contexts = state_separate_contexts (&d->mb_follows);
3040   s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3041   realloc_trans_if_necessary (d, s2);
3042
3043   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3044
3045   return s2;
3046 }
3047
3048 /* The initial state may encounter a byte which is not a single byte character
3049    nor the first byte of a multibyte character.  But it is incorrect for the
3050    initial state to accept such a byte.  For example, in Shift JIS the regular
3051    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3052    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3053    that are not a single byte character nor the first byte of a multibyte
3054    character.
3055
3056    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3057    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3058    the result is greater than P, set *WCP to the final wide character
3059    processed, or to WEOF if no wide character is processed.  Otherwise,
3060    if WCP is non-NULL, *WCP may or may not be updated.
3061
3062    Both P and MBP must be no larger than END.  */
3063 static unsigned char const *
3064 skip_remains_mb (struct dfa *d, unsigned char const *p,
3065                  unsigned char const *mbp, char const *end)
3066 {
3067   wint_t wc;
3068   if (d->syntax.never_trail[*p])
3069     return p;
3070   while (mbp < p)
3071     mbp += mbs_to_wchar (&wc, (char const *) mbp,
3072                          end - (char const *) mbp, d);
3073   return mbp;
3074 }
3075
3076 /* Search through a buffer looking for a match to the struct dfa *D.
3077    Find the first occurrence of a string matching the regexp in the
3078    buffer, and the shortest possible version thereof.  Return a pointer to
3079    the first character after the match, or NULL if none is found.  BEGIN
3080    points to the beginning of the buffer, and END points to the first byte
3081    after its end.  Note however that we store a sentinel byte (usually
3082    newline) in *END, so the actual buffer must be one byte longer.
3083    When ALLOW_NL, newlines may appear in the matching string.
3084    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3085    If MULTIBYTE, the input consists of multibyte characters and/or
3086    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3087    Here is the list of features that make this DFA matcher punt:
3088     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3089     - [^...] in non-simple locale
3090     - [[=foo=]] or [[.foo.]]
3091     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3092     - back-reference: (.)\1
3093     - word-delimiter in multibyte locale: \<, \>, \b, \B
3094    See using_simple_locale for the definition of "simple locale".  */
3095
3096 static inline char *
3097 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3098               size_t *count, bool multibyte)
3099 {
3100   state_num s, s1;              /* Current state.  */
3101   unsigned char const *p, *mbp; /* Current input character.  */
3102   state_num **trans, *t;        /* Copy of d->trans so it can be optimized
3103                                    into a register.  */
3104   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3105   unsigned char saved_end;
3106   size_t nlcount = 0;
3107
3108   if (!d->tralloc)
3109     {
3110       realloc_trans_if_necessary (d, 1);
3111       build_state (0, d);
3112     }
3113
3114   s = s1 = 0;
3115   p = mbp = (unsigned char const *) begin;
3116   trans = d->trans;
3117   saved_end = *(unsigned char *) end;
3118   *end = eol;
3119
3120   if (multibyte)
3121     {
3122       memset (&d->mbs, 0, sizeof d->mbs);
3123       if (d->mb_follows.alloc == 0)
3124         alloc_position_set (&d->mb_follows, d->nleaves);
3125     }
3126
3127   for (;;)
3128     {
3129       while ((t = trans[s]) != NULL)
3130         {
3131           if (s < d->min_trcount)
3132             {
3133               if (!multibyte || d->states[s].mbps.nelem == 0)
3134                 {
3135                   while (t[*p] == s)
3136                     p++;
3137                 }
3138               if (multibyte)
3139                 p = mbp = skip_remains_mb (d, p, mbp, end);
3140             }
3141
3142           if (multibyte)
3143             {
3144               s1 = s;
3145
3146               if (d->states[s].mbps.nelem == 0
3147                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3148                 {
3149                   /* If an input character does not match ANYCHAR, do it
3150                      like a single-byte character.  */
3151                   s = t[*p++];
3152                 }
3153               else
3154                 {
3155                   s = transit_state (d, s, &p, (unsigned char *) end);
3156                   mbp = p;
3157                   trans = d->trans;
3158                 }
3159             }
3160           else
3161             {
3162               s1 = t[*p++];
3163               t = trans[s1];
3164               if (! t)
3165                 {
3166                   state_num tmp = s;
3167                   s = s1;
3168                   s1 = tmp;     /* swap */
3169                   break;
3170                 }
3171               if (s < d->min_trcount)
3172                 {
3173                   while (t[*p] == s1)
3174                     p++;
3175                 }
3176               s = t[*p++];
3177             }
3178         }
3179
3180       if (s < 0)
3181         {
3182           if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0)
3183             {
3184               p = NULL;
3185               goto done;
3186             }
3187
3188           /* The previous character was a newline, count it, and skip
3189              checking of multibyte character boundary until here.  */
3190           nlcount++;
3191           mbp = p;
3192
3193           s = (allow_nl ? d->newlines[s1]
3194                : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3195                : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3196                : d->initstate_notbol);
3197         }
3198       else if (d->fails[s])
3199         {
3200           if ((d->success[s] & d->syntax.sbit[*p])
3201               || ((char *) p == end
3202                   && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s,
3203                                          *d)))
3204             goto done;
3205
3206           if (multibyte && s < d->min_trcount)
3207             p = mbp = skip_remains_mb (d, p, mbp, end);
3208
3209           s1 = s;
3210           if (!multibyte || d->states[s].mbps.nelem == 0
3211               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3212             {
3213               /* If a input character does not match ANYCHAR, do it
3214                  like a single-byte character.  */
3215               s = d->fails[s][*p++];
3216             }
3217           else
3218             {
3219               s = transit_state (d, s, &p, (unsigned char *) end);
3220               mbp = p;
3221               trans = d->trans;
3222             }
3223         }
3224       else
3225         {
3226           build_state (s, d);
3227           trans = d->trans;
3228         }
3229     }
3230
3231  done:
3232   if (count)
3233     *count += nlcount;
3234   *end = saved_end;
3235   return (char *) p;
3236 }
3237
3238 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3239    This is for performance, as dfaexec_main is an inline function.  */
3240
3241 static char *
3242 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3243             bool allow_nl, size_t *count, bool *backref)
3244 {
3245   return dfaexec_main (d, begin, end, allow_nl, count, true);
3246 }
3247
3248 static char *
3249 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3250             bool allow_nl, size_t *count, bool *backref)
3251 {
3252   return dfaexec_main (d, begin, end, allow_nl, count, false);
3253 }
3254
3255 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3256    any regexp that uses a construct not supported by this code.  */
3257 static char *
3258 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3259               bool allow_nl, size_t *count, bool *backref)
3260 {
3261   *backref = true;
3262   return (char *) begin;
3263 }
3264
3265 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3266    but faster and set *BACKREF if the DFA code does not support this
3267    regexp usage.  */
3268
3269 char *
3270 dfaexec (struct dfa *d, char const *begin, char *end,
3271          bool allow_nl, size_t *count, bool *backref)
3272 {
3273   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3274 }
3275
3276 struct dfa *
3277 dfasuperset (struct dfa const *d)
3278 {
3279   return d->superset;
3280 }
3281
3282 bool
3283 dfaisfast (struct dfa const *d)
3284 {
3285   return d->fast;
3286 }
3287
3288 static void
3289 free_mbdata (struct dfa *d)
3290 {
3291   size_t i;
3292
3293   free (d->multibyte_prop);
3294
3295   for (i = 0; i < d->nmbcsets; ++i)
3296     free (d->mbcsets[i].chars);
3297
3298   free (d->mbcsets);
3299   free (d->mb_follows.elems);
3300
3301   if (d->mb_trans)
3302     {
3303       state_num s;
3304       for (s = -1; s < d->tralloc; s++)
3305         free (d->mb_trans[s]);
3306       free (d->mb_trans - 1);
3307     }
3308 }
3309
3310 /* Return true if every construct in D is supported by this DFA matcher.  */
3311 static bool _GL_ATTRIBUTE_PURE
3312 dfa_supported (struct dfa const *d)
3313 {
3314   size_t i;
3315   for (i = 0; i < d->tindex; i++)
3316     {
3317       switch (d->tokens[i])
3318         {
3319         case BEGWORD:
3320         case ENDWORD:
3321         case LIMWORD:
3322         case NOTLIMWORD:
3323           if (!d->localeinfo.multibyte)
3324             continue;
3325           /* fallthrough */
3326
3327         case BACKREF:
3328         case MBCSET:
3329           return false;
3330         }
3331     }
3332   return true;
3333 }
3334
3335 static void
3336 dfaoptimize (struct dfa *d)
3337 {
3338   size_t i;
3339   bool have_backref = false;
3340
3341   if (!d->localeinfo.using_utf8)
3342     return;
3343
3344   for (i = 0; i < d->tindex; ++i)
3345     {
3346       switch (d->tokens[i])
3347         {
3348         case ANYCHAR:
3349           /* Lowered.  */
3350           abort ();
3351         case BACKREF:
3352           have_backref = true;
3353           break;
3354         case MBCSET:
3355           /* Requires multi-byte algorithm.  */
3356           return;
3357         default:
3358           break;
3359         }
3360     }
3361
3362   if (!have_backref && d->superset)
3363     {
3364       /* The superset DFA is not likely to be much faster, so remove it.  */
3365       dfafree (d->superset);
3366       free (d->superset);
3367       d->superset = NULL;
3368     }
3369
3370   free_mbdata (d);
3371   d->localeinfo.multibyte = false;
3372   d->dfaexec = dfaexec_sb;
3373   d->fast = true;
3374 }
3375
3376 static void
3377 dfassbuild (struct dfa *d)
3378 {
3379   size_t i, j;
3380   charclass ccl;
3381   bool have_achar = false;
3382   bool have_nchar = false;
3383   struct dfa *sup = dfaalloc ();
3384
3385   *sup = *d;
3386   sup->localeinfo.multibyte = false;
3387   sup->dfaexec = dfaexec_sb;
3388   sup->multibyte_prop = NULL;
3389   sup->mbcsets = NULL;
3390   sup->superset = NULL;
3391   sup->states = NULL;
3392   sup->sindex = 0;
3393   sup->follows = NULL;
3394   sup->tralloc = 0;
3395   sup->trans = NULL;
3396   sup->fails = NULL;
3397   sup->success = NULL;
3398   sup->newlines = NULL;
3399
3400   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3401   if (d->cindex)
3402     {
3403       memcpy (sup->charclasses, d->charclasses,
3404               d->cindex * sizeof *sup->charclasses);
3405     }
3406
3407   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3408   sup->talloc = d->tindex * 2;
3409
3410   for (i = j = 0; i < d->tindex; i++)
3411     {
3412       switch (d->tokens[i])
3413         {
3414         case ANYCHAR:
3415         case MBCSET:
3416         case BACKREF:
3417           zeroset (ccl);
3418           notset (ccl);
3419           sup->tokens[j++] = CSET + charclass_index (sup, ccl);
3420           sup->tokens[j++] = STAR;
3421           if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3422               || d->tokens[i + 1] == PLUS)
3423             i++;
3424           have_achar = true;
3425           break;
3426         case BEGWORD:
3427         case ENDWORD:
3428         case LIMWORD:
3429         case NOTLIMWORD:
3430           if (d->localeinfo.multibyte)
3431             {
3432               /* These constraints aren't supported in a multibyte locale.
3433                  Ignore them in the superset DFA.  */
3434               sup->tokens[j++] = EMPTY;
3435               break;
3436             }
3437           /* fallthrough */
3438         default:
3439           sup->tokens[j++] = d->tokens[i];
3440           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3441               || d->tokens[i] >= CSET)
3442             have_nchar = true;
3443           break;
3444         }
3445     }
3446   sup->tindex = j;
3447
3448   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3449     d->superset = sup;
3450   else
3451     {
3452       dfafree (sup);
3453       free (sup);
3454     }
3455 }
3456
3457 /* Parse and analyze a single string of the given length.  */
3458 void
3459 dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
3460 {
3461   dfaparse (s, len, d);
3462   dfassbuild (d);
3463
3464   if (dfa_supported (d))
3465     {
3466       dfaoptimize (d);
3467       dfaanalyze (d, searchflag);
3468     }
3469   else
3470     {
3471       d->dfaexec = dfaexec_noop;
3472     }
3473
3474   if (d->superset)
3475     {
3476       d->fast = true;
3477       dfaanalyze (d->superset, searchflag);
3478     }
3479 }
3480
3481 /* Free the storage held by the components of a dfa.  */
3482 void
3483 dfafree (struct dfa *d)
3484 {
3485   size_t i;
3486
3487   free (d->charclasses);
3488   free (d->tokens);
3489
3490   if (d->localeinfo.multibyte)
3491     free_mbdata (d);
3492
3493   for (i = 0; i < d->sindex; ++i)
3494     {
3495       free (d->states[i].elems.elems);
3496       free (d->states[i].mbps.elems);
3497     }
3498   free (d->states);
3499
3500   if (d->follows)
3501     {
3502       for (i = 0; i < d->tindex; ++i)
3503         free (d->follows[i].elems);
3504       free (d->follows);
3505     }
3506
3507   if (d->trans)
3508     {
3509       for (i = 0; i < d->tralloc; ++i)
3510         {
3511           free (d->trans[i]);
3512           free (d->fails[i]);
3513         }
3514
3515       free (d->trans - 1);
3516       free (d->fails);
3517       free (d->newlines);
3518       free (d->success);
3519     }
3520
3521   if (d->superset)
3522     dfafree (d->superset);
3523 }
3524
3525 /* Having found the postfix representation of the regular expression,
3526    try to find a long sequence of characters that must appear in any line
3527    containing the r.e.
3528    Finding a "longest" sequence is beyond the scope here;
3529    we take an easy way out and hope for the best.
3530    (Take "(ab|a)b"--please.)
3531
3532    We do a bottom-up calculation of sequences of characters that must appear
3533    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3534    representation:
3535         sequences that must appear at the left of the match ("left")
3536         sequences that must appear at the right of the match ("right")
3537         lists of sequences that must appear somewhere in the match ("in")
3538         sequences that must constitute the match ("is")
3539
3540    When we get to the root of the tree, we use one of the longest of its
3541    calculated "in" sequences as our answer.
3542
3543    The sequences calculated for the various types of node (in pseudo ANSI c)
3544    are shown below.  "p" is the operand of unary operators (and the left-hand
3545    operand of binary operators); "q" is the right-hand operand of binary
3546    operators.
3547
3548    "ZERO" means "a zero-length sequence" below.
3549
3550         Type    left            right           is              in
3551         ----    ----            -----           --              --
3552         char c  # c             # c             # c             # c
3553
3554         ANYCHAR ZERO            ZERO            ZERO            ZERO
3555
3556         MBCSET  ZERO            ZERO            ZERO            ZERO
3557
3558         CSET    ZERO            ZERO            ZERO            ZERO
3559
3560         STAR    ZERO            ZERO            ZERO            ZERO
3561
3562         QMARK   ZERO            ZERO            ZERO            ZERO
3563
3564         PLUS    p->left         p->right        ZERO            p->in
3565
3566         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3567                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3568                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3569                                                 ZERO
3570
3571         OR      longest common  longest common  (do p->is and substrings common
3572                 leading         trailing        to q->is have same p->in and
3573                 (sub)sequence   (sub)sequence   q->in length and content) ?
3574                 of p->left      of p->right
3575                 and q->left     and q->right    p->is : NULL
3576
3577    If there's anything else we recognize in the tree, all four sequences get set
3578    to zero-length sequences.  If there's something we don't recognize in the
3579    tree, we just return a zero-length sequence.
3580
3581    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3582    'aaa')?
3583
3584    And ... is it here or someplace that we might ponder "optimizations" such as
3585         egrep 'psi|epsilon'     ->      egrep 'psi'
3586         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3587                                         (Yes, we now find "epsi" as a "string
3588                                         that must occur", but we might also
3589                                         simplify the *entire* r.e. being sought)
3590         grep '[c]'              ->      grep 'c'
3591         grep '(ab|a)b'          ->      grep 'ab'
3592         grep 'ab*'              ->      grep 'a'
3593         grep 'a*b'              ->      grep 'b'
3594
3595    There are several issues:
3596
3597    Is optimization easy (enough)?
3598
3599    Does optimization actually accomplish anything,
3600    or is the automaton you get from "psi|epsilon" (for example)
3601    the same as the one you get from "psi" (for example)?
3602
3603    Are optimizable r.e.'s likely to be used in real-life situations
3604    (something like 'ab*' is probably unlikely; something like is
3605    'psi|epsilon' is likelier)?  */
3606
3607 static char *
3608 icatalloc (char *old, char const *new)
3609 {
3610   char *result;
3611   size_t oldsize;
3612   size_t newsize = strlen (new);
3613   if (newsize == 0)
3614     return old;
3615   oldsize = strlen (old);
3616   result = xrealloc (old, oldsize + newsize + 1);
3617   memcpy (result + oldsize, new, newsize + 1);
3618   return result;
3619 }
3620
3621 static void
3622 freelist (char **cpp)
3623 {
3624   while (*cpp)
3625     free (*cpp++);
3626 }
3627
3628 static char **
3629 enlist (char **cpp, char *new, size_t len)
3630 {
3631   size_t i, j;
3632   new = memcpy (xmalloc (len + 1), new, len);
3633   new[len] = '\0';
3634   /* Is there already something in the list that's new (or longer)?  */
3635   for (i = 0; cpp[i] != NULL; ++i)
3636     if (strstr (cpp[i], new) != NULL)
3637       {
3638         free (new);
3639         return cpp;
3640       }
3641   /* Eliminate any obsoleted strings.  */
3642   j = 0;
3643   while (cpp[j] != NULL)
3644     if (strstr (new, cpp[j]) == NULL)
3645       ++j;
3646     else
3647       {
3648         free (cpp[j]);
3649         if (--i == j)
3650           break;
3651         cpp[j] = cpp[i];
3652         cpp[i] = NULL;
3653       }
3654   /* Add the new string.  */
3655   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3656   cpp[i] = new;
3657   cpp[i + 1] = NULL;
3658   return cpp;
3659 }
3660
3661 /* Given pointers to two strings, return a pointer to an allocated
3662    list of their distinct common substrings.  */
3663 static char **
3664 comsubs (char *left, char const *right)
3665 {
3666   char **cpp = xzalloc (sizeof *cpp);
3667   char *lcp;
3668
3669   for (lcp = left; *lcp != '\0'; ++lcp)
3670     {
3671       size_t len = 0;
3672       char *rcp = strchr (right, *lcp);
3673       while (rcp != NULL)
3674         {
3675           size_t i;
3676           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3677             continue;
3678           if (i > len)
3679             len = i;
3680           rcp = strchr (rcp + 1, *lcp);
3681         }
3682       if (len != 0)
3683         cpp = enlist (cpp, lcp, len);
3684     }
3685   return cpp;
3686 }
3687
3688 static char **
3689 addlists (char **old, char **new)
3690 {
3691   for (; *new; new++)
3692     old = enlist (old, *new, strlen (*new));
3693   return old;
3694 }
3695
3696 /* Given two lists of substrings, return a new list giving substrings
3697    common to both.  */
3698 static char **
3699 inboth (char **left, char **right)
3700 {
3701   char **both = xzalloc (sizeof *both);
3702   size_t lnum, rnum;
3703
3704   for (lnum = 0; left[lnum] != NULL; ++lnum)
3705     {
3706       for (rnum = 0; right[rnum] != NULL; ++rnum)
3707         {
3708           char **temp = comsubs (left[lnum], right[rnum]);
3709           both = addlists (both, temp);
3710           freelist (temp);
3711           free (temp);
3712         }
3713     }
3714   return both;
3715 }
3716
3717 typedef struct must must;
3718
3719 struct must
3720 {
3721   char **in;
3722   char *left;
3723   char *right;
3724   char *is;
3725   bool begline;
3726   bool endline;
3727   must *prev;
3728 };
3729
3730 static must *
3731 allocmust (must *mp, size_t size)
3732 {
3733   must *new_mp = xmalloc (sizeof *new_mp);
3734   new_mp->in = xzalloc (sizeof *new_mp->in);
3735   new_mp->left = xzalloc (size);
3736   new_mp->right = xzalloc (size);
3737   new_mp->is = xzalloc (size);
3738   new_mp->begline = false;
3739   new_mp->endline = false;
3740   new_mp->prev = mp;
3741   return new_mp;
3742 }
3743
3744 static void
3745 resetmust (must *mp)
3746 {
3747   freelist (mp->in);
3748   mp->in[0] = NULL;
3749   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3750   mp->begline = false;
3751   mp->endline = false;
3752 }
3753
3754 static void
3755 freemust (must *mp)
3756 {
3757   freelist (mp->in);
3758   free (mp->in);
3759   free (mp->left);
3760   free (mp->right);
3761   free (mp->is);
3762   free (mp);
3763 }
3764
3765 struct dfamust *
3766 dfamust (struct dfa const *d)
3767 {
3768   must *mp = NULL;
3769   char const *result = "";
3770   size_t i, ri;
3771   bool exact = false;
3772   bool begline = false;
3773   bool endline = false;
3774   size_t rj;
3775   bool need_begline = false;
3776   bool need_endline = false;
3777   bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
3778   struct dfamust *dm;
3779
3780   for (ri = 0; ri < d->tindex; ++ri)
3781     {
3782       token t = d->tokens[ri];
3783       switch (t)
3784         {
3785         case BEGLINE:
3786           mp = allocmust (mp, 2);
3787           mp->begline = true;
3788           need_begline = true;
3789           break;
3790         case ENDLINE:
3791           mp = allocmust (mp, 2);
3792           mp->endline = true;
3793           need_endline = true;
3794           break;
3795         case LPAREN:
3796         case RPAREN:
3797           assert (!"neither LPAREN nor RPAREN may appear here");
3798
3799         case EMPTY:
3800         case BEGWORD:
3801         case ENDWORD:
3802         case LIMWORD:
3803         case NOTLIMWORD:
3804         case BACKREF:
3805         case ANYCHAR:
3806         case MBCSET:
3807           mp = allocmust (mp, 2);
3808           break;
3809
3810         case STAR:
3811         case QMARK:
3812           resetmust (mp);
3813           break;
3814
3815         case OR:
3816           {
3817             char **new;
3818             must *rmp = mp;
3819             must *lmp = mp = mp->prev;
3820             size_t j, ln, rn, n;
3821
3822             /* Guaranteed to be.  Unlikely, but ...  */
3823             if (STREQ (lmp->is, rmp->is))
3824               {
3825                 lmp->begline &= rmp->begline;
3826                 lmp->endline &= rmp->endline;
3827               }
3828             else
3829               {
3830                 lmp->is[0] = '\0';
3831                 lmp->begline = false;
3832                 lmp->endline = false;
3833               }
3834             /* Left side--easy */
3835             i = 0;
3836             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3837               ++i;
3838             lmp->left[i] = '\0';
3839             /* Right side */
3840             ln = strlen (lmp->right);
3841             rn = strlen (rmp->right);
3842             n = ln;
3843             if (n > rn)
3844               n = rn;
3845             for (i = 0; i < n; ++i)
3846               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3847                 break;
3848             for (j = 0; j < i; ++j)
3849               lmp->right[j] = lmp->right[(ln - i) + j];
3850             lmp->right[j] = '\0';
3851             new = inboth (lmp->in, rmp->in);
3852             freelist (lmp->in);
3853             free (lmp->in);
3854             lmp->in = new;
3855             freemust (rmp);
3856           }
3857           break;
3858
3859         case PLUS:
3860           mp->is[0] = '\0';
3861           break;
3862
3863         case END:
3864           assert (!mp->prev);
3865           for (i = 0; mp->in[i] != NULL; ++i)
3866             if (strlen (mp->in[i]) > strlen (result))
3867               result = mp->in[i];
3868           if (STREQ (result, mp->is))
3869             {
3870               if ((!need_begline || mp->begline) && (!need_endline
3871                                                      || mp->endline))
3872                 exact = true;
3873               begline = mp->begline;
3874               endline = mp->endline;
3875             }
3876           goto done;
3877
3878         case CAT:
3879           {
3880             must *rmp = mp;
3881             must *lmp = mp = mp->prev;
3882
3883             /* In.  Everything in left, plus everything in
3884                right, plus concatenation of
3885                left's right and right's left.  */
3886             lmp->in = addlists (lmp->in, rmp->in);
3887             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3888               {
3889                 size_t lrlen = strlen (lmp->right);
3890                 size_t rllen = strlen (rmp->left);
3891                 char *tp = xmalloc (lrlen + rllen);
3892                 memcpy (tp, lmp->right, lrlen);
3893                 memcpy (tp + lrlen, rmp->left, rllen);
3894                 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
3895                 free (tp);
3896               }
3897             /* Left-hand */
3898             if (lmp->is[0] != '\0')
3899               lmp->left = icatalloc (lmp->left, rmp->left);
3900             /* Right-hand */
3901             if (rmp->is[0] == '\0')
3902               lmp->right[0] = '\0';
3903             lmp->right = icatalloc (lmp->right, rmp->right);
3904             /* Guaranteed to be */
3905             if ((lmp->is[0] != '\0' || lmp->begline)
3906                 && (rmp->is[0] != '\0' || rmp->endline))
3907               {
3908                 lmp->is = icatalloc (lmp->is, rmp->is);
3909                 lmp->endline = rmp->endline;
3910               }
3911             else
3912               {
3913                 lmp->is[0] = '\0';
3914                 lmp->begline = false;
3915                 lmp->endline = false;
3916               }
3917             freemust (rmp);
3918           }
3919           break;
3920
3921         case '\0':
3922           /* Not on *my* shift.  */
3923           goto done;
3924
3925         default:
3926           if (CSET <= t)
3927             {
3928               /* If T is a singleton, or if case-folding in a unibyte
3929                  locale and T's members all case-fold to the same char,
3930                  convert T to one of its members.  Otherwise, do
3931                  nothing further with T.  */
3932               charclass *ccl = &d->charclasses[t - CSET];
3933               int j;
3934               for (j = 0; j < NOTCHAR; j++)
3935                 if (tstbit (j, *ccl))
3936                   break;
3937               if (! (j < NOTCHAR))
3938                 {
3939                   mp = allocmust (mp, 2);
3940                   break;
3941                 }
3942               t = j;
3943               while (++j < NOTCHAR)
3944                 if (tstbit (j, *ccl)
3945                     && ! (case_fold_unibyte
3946                           && toupper (j) == toupper (t)))
3947                   break;
3948               if (j < NOTCHAR)
3949                 {
3950                   mp = allocmust (mp, 2);
3951                   break;
3952                 }
3953             }
3954
3955           rj = ri + 2;
3956           if (d->tokens[ri + 1] == CAT)
3957             {
3958               for (; rj < d->tindex - 1; rj += 2)
3959                 {
3960                   if ((rj != ri && (d->tokens[rj] <= 0
3961                                     || NOTCHAR <= d->tokens[rj]))
3962                       || d->tokens[rj + 1] != CAT)
3963                     break;
3964                 }
3965             }
3966           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
3967           mp->is[0] = mp->left[0] = mp->right[0]
3968             = case_fold_unibyte ? toupper (t) : t;
3969
3970           for (i = 1; ri + 2 < rj; i++)
3971             {
3972               ri += 2;
3973               t = d->tokens[ri];
3974               mp->is[i] = mp->left[i] = mp->right[i]
3975                 = case_fold_unibyte ? toupper (t) : t;
3976             }
3977           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
3978           mp->in = enlist (mp->in, mp->is, i);
3979           break;
3980         }
3981     }
3982  done:;
3983
3984   dm = NULL;
3985   if (*result)
3986     {
3987       dm = xmalloc (sizeof *dm);
3988       dm->exact = exact;
3989       dm->begline = begline;
3990       dm->endline = endline;
3991       dm->must = xstrdup (result);
3992     }
3993
3994   while (mp)
3995     {
3996       must *prev = mp->prev;
3997       freemust (mp);
3998       mp = prev;
3999     }
4000
4001   return dm;
4002 }
4003
4004 void
4005 dfamustfree (struct dfamust *dm)
4006 {
4007   free (dm->must);
4008   free (dm);
4009 }
4010
4011 struct dfa *
4012 dfaalloc (void)
4013 {
4014   return xmalloc (sizeof (struct dfa));
4015 }
4016
4017 /* Initialize DFA.  */
4018 void
4019 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4020            reg_syntax_t bits, int dfaopts)
4021 {
4022   int i;
4023   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4024   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4025   dfa->simple_locale = using_simple_locale (linfo->multibyte);
4026   dfa->localeinfo = *linfo;
4027
4028   dfa->fast = !dfa->localeinfo.multibyte;
4029
4030   dfa->canychar = -1;
4031   dfa->lex.cur_mb_len = 1;
4032   dfa->syntax.syntax_bits_set = true;
4033   dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0;
4034   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4035   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4036   dfa->syntax.syntax_bits = bits;
4037
4038   for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
4039     {
4040       unsigned char uc = i;
4041
4042       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4043       switch (dfa->syntax.sbit[uc])
4044         {
4045         case CTX_LETTER:
4046           setbit (uc, dfa->syntax.letters);
4047           break;
4048         case CTX_NEWLINE:
4049           setbit (uc, dfa->syntax.newline);
4050           break;
4051         }
4052
4053       /* POSIX requires that the five bytes in "\n\r./" (including the
4054          terminating NUL) cannot occur inside a multibyte character.  */
4055       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4056                                      ? (uc & 0xc0) != 0x80
4057                                      : strchr ("\n\r./", uc) != NULL);
4058     }
4059 }
4060
4061 /* vim:set shiftwidth=2: */