Update.

author Ulrich Drepper <drepper@redhat.com>

Mon, 24 Nov 2003 19:30:51 +0000 (19:30 +0000)

committer Ulrich Drepper <drepper@redhat.com>

Mon, 24 Nov 2003 19:30:51 +0000 (19:30 +0000)
author Ulrich Drepper <drepper@redhat.com>
Mon, 24 Nov 2003 19:30:51 +0000 (19:30 +0000)
committer Ulrich Drepper <drepper@redhat.com>
Mon, 24 Nov 2003 19:30:51 +0000 (19:30 +0000)
diff --git a/ChangeLog b/ChangeLog

index 63444d48bde8fa5c7fe0c266d32ac15e52cbe573..969692626e80619954c2c810953913ec5bfe4369 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,31 @@
+2003-11-24  Jakub Jelinek  <jakub@redhat.com>
+
+       * posix/regex_internal.h (re_token_t): Add word_char bit.  Add
+       comment.
+       (re_dfa_t): Add sb_char field.
+       (bitset_mask): New function.
+       * posix/regcomp.c (free_dfa_content): Free sb_char.
+       (init_dfa): Don't initialize word_char unnecessarily.
+       Initialize sb_char.
+       (duplicate_node): Don't duplicate !word_char CHARACTERs with
+       NEXT_WORD_CONSTRAINT constraint or word_char CHARACTERs with
+       NEXT_NOTWORD_CONSTRAINT.  Return -1 in *new_idx instead.
+       (duplicate_node_closure): Handle clone_dest == -1 from
+       duplicate_node.
+       (peek_token): Initialize word_char bit.
+       (parse_expression, parse_dup_op): Add comments.
+       (parse_bracket_exp): Don't set bitmask bits for multi-byte char
+       starting bytes here at the beginning.  Mask off the bits right
+       before creating SIMPLE_BRACKET.
+       (build_charclass_op): Likewise.
+       * posix/regexec.c (group_nodes_into_DFAstates) <case OP_PERIOD>: Only
+       set accept bits for single-byte characters.
+       (group_nodes_into_DFAstates): Don't rely on characters 0 .. 127
+       being single byte encoded and the rest multi-byte.
+       * posix/bug-regex19.c (tests): Add new tests.
+       (do_mb_tests): Initialize t to *test.
+       (main): Fail even on do_mb_tests errors.
+
  2003-11-23  Ulrich Drepper  <drepper@redhat.com>
  
         * posix/regexec.c: Correct several memory allocation problems.
diff --git a/posix/bug-regex19.c b/posix/bug-regex19.c

index 01093e60acd247b9ed7020e01865c8f760949fcf..f24e0aa730ca2526cecb4d05608de17657b3e39a 100644 (file)
--- a/posix/bug-regex19.c
+++ b/posix/bug-regex19.c
@@ -102,6 +102,20 @@ static struct test_s
    {ERE, ".\\b.", "=A=", 0, 0},
    {ERE, ".\\b.", "==", 0, -1},
    {ERE, ".\\b.", "ABA", 0, -1},
+  {ERE, "[^k]\\b[^k]", "AA~", 0, 1},
+  {ERE, "[^k]\\b[^k]", "=A=", 0, 0},
+  {ERE, "[^k]\\b[^k]", "Ak~kA~", 0, 4},
+  {ERE, "[^k]\\b[^k]", "==", 0, -1},
+  {ERE, "[^k]\\b[^k]", "ABA", 0, -1},
+  {ERE, "[^k]\\b[^k]", "Ak~", 0, -1},
+  {ERE, "[^k]\\b[^k]", "k=k", 0, -1},
+  {ERE, "[^C]\\b[^C]", "AA~", 0, 1},
+  {ERE, "[^C]\\b[^C]", "=A=", 0, 0},
+  {ERE, "[^C]\\b[^C]", "AC~CA~", 0, 4},
+  {ERE, "[^C]\\b[^C]", "==", 0, -1},
+  {ERE, "[^C]\\b[^C]", "ABA", 0, -1},
+  {ERE, "[^C]\\b[^C]", "AC~", 0, -1},
+  {ERE, "[^C]\\b[^C]", "C=C", 0, -1},
    {ERE, "\\<(A|!|.B)", "A=AC", 0, 0},
    {ERE, "\\<(A|!|.B)", "=AC", 0, 1},
    {ERE, "\\<(A|!|.B)", "!AC", 0, 1},
@@ -140,12 +154,38 @@ static struct test_s
    {ERE, ".\\<.", "AA~", 0, -1},
    {ERE, ".\\<.", "==", 0, -1},
    {ERE, ".\\<.", "ABA", 0, -1},
+  {ERE, "[^k]\\<[^k]", "=k=A=", 0, 2},
+  {ERE, "[^k]\\<[^k]", "kk~", 0, -1},
+  {ERE, "[^k]\\<[^k]", "==", 0, -1},
+  {ERE, "[^k]\\<[^k]", "ABA", 0, -1},
+  {ERE, "[^k]\\<[^k]", "=k=", 0, -1},
+  {ERE, "[^C]\\<[^C]", "=C=A=", 0, 2},
+  {ERE, "[^C]\\<[^C]", "CC~", 0, -1},
+  {ERE, "[^C]\\<[^C]", "==", 0, -1},
+  {ERE, "[^C]\\<[^C]", "ABA", 0, -1},
+  {ERE, "[^C]\\<[^C]", "=C=", 0, -1},
    {ERE, ".\\B.", "ABA", 0, 0},
    {ERE, ".\\B.", "=BDC", 0, 1},
+  {ERE, "[^k]\\B[^k]", "kkkABA", 0, 3},
+  {ERE, "[^k]\\B[^k]", "kBk", 0, -1},
+  {ERE, "[^C]\\B[^C]", "CCCABA", 0, 3},
+  {ERE, "[^C]\\B[^C]", "CBC", 0, -1},
    {ERE, ".(\\b|\\B).", "=~AB", 0, 1},
    {ERE, ".(\\b|\\B).", "A=C", 0, 0},
    {ERE, ".(\\b|\\B).", "ABC", 0, 0},
    {ERE, ".(\\b|\\B).", "=~\\!", 0, -1},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~AB", 0, 1},
+  {ERE, "[^k](\\b|\\B)[^k]", "A=C", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "ABC", 0, 0},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~kBD", 0, 3},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~\\!", 0, -1},
+  {ERE, "[^k](\\b|\\B)[^k]", "=~kB", 0, -1},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~AB", 0, 1},
+  {ERE, "[^C](\\b|\\B)[^C]", "A=C", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "ABC", 0, 0},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~CBD", 0, 3},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~\\!", 0, -1},
+  {ERE, "[^C](\\b|\\B)[^C]", "=~CB", 0, -1},
    {ERE, "\\b([A]|[!]|.B)", "A=AC", 0, 0},
    {ERE, "\\b([A]|[!]|.B)", "=AC", 0, 1},
    {ERE, "\\b([A]|[!]|.B)", "!AC", 0, 1},
@@ -288,6 +328,7 @@ do_mb_tests (const struct test_s *test)
    char string[strlen (test->string) * 4 + 1];
    char fail[8 + sizeof ("UTF-8 ")];
  
+  t = *test;
    t.pattern = pattern;
    t.string = string;
    strcpy (fail, "UTF-8 ");
@@ -367,9 +408,7 @@ main (void)
           ret = 1;
         }
        ret |= do_one_test (&tests[i], "UTF-8 ");
-      // Until the implementation is fixed, ignore the results of the
-      // MB tests.
-      /* ret |= */do_mb_tests (&tests[i]);
+      ret |= do_mb_tests (&tests[i]);
      }
  
    return ret;
diff --git a/posix/regcomp.c b/posix/regcomp.c

index 6fb1844599004d097f7c7e4203796dffe4d95e09..52f1fa23e108f3f3edd4cd19fb4ff6819706c6f3 100644 (file)
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -607,6 +607,9 @@ free_dfa_content (re_dfa_t *dfa)
        }
    re_free (dfa->state_table);
    re_free (dfa->word_char);
+#ifdef RE_ENABLE_I18N
+  re_free (dfa->sb_char);
+#endif  
  #ifdef DEBUG
    re_free (dfa->re_str);
  #endif
@@ -831,7 +834,7 @@ init_dfa (dfa, pat_len)
  
    dfa->subexps_alloc = 1;
    dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
-  dfa->word_char = NULL;
+  /* dfa->word_char = NULL; */
  
    dfa->mb_cur_max = MB_CUR_MAX;
  #ifdef _LIBC
@@ -841,6 +844,25 @@ init_dfa (dfa, pat_len)
    dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
                        != 0);
  #endif
+#ifdef RE_ENABLE_I18N
+  if (dfa->mb_cur_max > 1)
+    {
+      int i, j, ch;
+
+      dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
+      if (BE (dfa->sb_char == NULL, 0))
+       return REG_ESPACE;
+#ifdef _LIBC
+      if (dfa->is_utf8)
+       memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
+      else
+#endif
+       for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+         for (j = 0; j < UINT_BITS; ++j, ++ch)
+           if (btowc (ch) != WEOF)
+             dfa->sb_char[i] |= 1 << j;
+    }
+#endif
  
    if (BE (dfa->nodes == NULL || dfa->state_table == NULL
           || dfa->subexps == NULL, 0))
@@ -1311,6 +1333,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
           if (BE (err != REG_NOERROR, 0))
             return err;
           dfa->nexts[clone_node] = dfa->nexts[org_node];
+         if (clone_dest == -1)
+           break;
           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
           if (BE (ret < 0, 0))
             return REG_ESPACE;
@@ -1348,6 +1372,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
           if (BE (err != REG_NOERROR, 0))
             return err;
+         if (clone_dest == -1)
+           break;
           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
           if (BE (ret < 0, 0))
             return REG_ESPACE;
@@ -1366,13 +1392,16 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
               if (BE (err != REG_NOERROR, 0))
                 return err;
-             ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
-             if (BE (ret < 0, 0))
-               return REG_ESPACE;
-             err = duplicate_node_closure (dfa, org_dest, clone_dest,
-                                           root_node, constraint);
-             if (BE (err != REG_NOERROR, 0))
-               return err;
+             if (clone_dest != -1)
+               {
+                 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
+                 if (BE (ret < 0, 0))
+                   return REG_ESPACE;
+                 err = duplicate_node_closure (dfa, org_dest, clone_dest,
+                                               root_node, constraint);
+                 if (BE (err != REG_NOERROR, 0))
+                   return err;
+               }
             }
           else
             {
@@ -1387,6 +1416,8 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
           if (BE (err != REG_NOERROR, 0))
             return err;
+         if (clone_dest == -1)
+           break;
           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
           if (BE (ret < 0, 0))
             return REG_ESPACE;
@@ -1426,7 +1457,21 @@ duplicate_node (new_idx, dfa, org_idx, constraint)
       int *new_idx, org_idx;
       unsigned int constraint;
  {
-  int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
+  int dup_idx;
+
+  if (dfa->nodes[org_idx].type == CHARACTER
+      && (((constraint & NEXT_WORD_CONSTRAINT)
+          && !dfa->nodes[org_idx].word_char)
+         || ((constraint & NEXT_NOTWORD_CONSTRAINT)
+             && dfa->nodes[org_idx].word_char)))
+    {
+      /* \<!, \>W etc. can never match.  Don't duplicate them, instead
+        tell the caller they shouldn't be added to edests.  */
+      *new_idx = -1;
+      return REG_NOERROR;
+    }
+
+  dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
    if (BE (dup_idx == -1, 0))
      return REG_ESPACE;
    dfa->nodes[dup_idx].constraint = constraint;
@@ -1614,6 +1659,7 @@ peek_token (token, input, syntax)
    c = re_string_peek_byte (input, 0);
    token->opr.c = c;
  
+  token->word_char = 0;
  #ifdef RE_ENABLE_I18N
    token->mb_partial = 0;
    if (input->mb_cur_max > 1 &&
@@ -1636,6 +1682,17 @@ peek_token (token, input, syntax)
        c2 = re_string_peek_byte_case (input, 1);
        token->opr.c = c2;
        token->type = CHARACTER;
+#ifdef RE_ENABLE_I18N
+      if (input->mb_cur_max > 1)
+       {
+         wint_t wc = re_string_wchar_at (input,
+                                         re_string_cur_idx (input) + 1);
+         token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
+       }
+      else
+#endif
+       token->word_char = IS_WORD_CHAR (c2) != 0;
+
        switch (c2)
         {
         case '|':
@@ -1739,6 +1796,16 @@ peek_token (token, input, syntax)
      }
  
    token->type = CHARACTER;
+#ifdef RE_ENABLE_I18N
+  if (input->mb_cur_max > 1)
+    {
+      wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
+      token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
+    }
+  else
+#endif
+    token->word_char = IS_WORD_CHAR (token->opr.c);
+
    switch (c)
      {
      case '\n':
@@ -2140,6 +2207,8 @@ parse_expression (regexp, preg, token, syntax, nest, err)
  
        /* Then we can these characters as normal characters.  */
        token->type = CHARACTER;
+      /* mb_partial and word_char bits should be initialized already
+        by peek_token.  */
        tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
        if (BE (tree == NULL, 0))
         {
@@ -2475,6 +2544,8 @@ parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
    re_string_set_index (regexp, start_idx);
    *token = start_token;
    token->type = CHARACTER;
+  /* mb_partial and word_char bits should be already initialized by
+     peek_token.  */
    return dup_elem;
  }
  
@@ -2952,7 +3023,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
    if (token->type == OP_NON_MATCH_LIST)
      {
  #ifdef RE_ENABLE_I18N
-      int i;
        mbcset->non_match = 1;
  #else /* not RE_ENABLE_I18N */
        non_match = 1;
@@ -2966,12 +3036,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
           *err = REG_BADPAT;
           goto parse_bracket_exp_free_return;
         }
-#ifdef RE_ENABLE_I18N
-      if (dfa->mb_cur_max > 1)
-       for (i = 0; i < SBC_MAX; ++i)
-         if (__btowc (i) == WEOF)
-           bitset_set (sbcset, i);
-#endif /* RE_ENABLE_I18N */
      }
  
    /* We treat the first ']' as a normal character.  */
@@ -3126,6 +3190,11 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
    if (non_match)
  #endif /* not RE_ENABLE_I18N */
      bitset_not (sbcset);
+#ifdef RE_ENABLE_I18N
+  /* Ensure only single byte characters are set.  */
+  if (dfa->mb_cur_max > 1)
+    bitset_mask (sbcset, dfa->sb_char);
+#endif /* RE_ENABLE_I18N */
  
    /* Build a tree for simple bracket.  */
    br_token.type = SIMPLE_BRACKET;
@@ -3493,16 +3562,11 @@ build_charclass_op (dfa, trans, class_name, extra, not, err)
    if (not)
      {
  #ifdef RE_ENABLE_I18N
-      int i;
        /*
        if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
         bitset_set(cset->sbcset, '\0');
        */
        mbcset->non_match = 1;
-      if (dfa->mb_cur_max > 1)
-       for (i = 0; i < SBC_MAX; ++i)
-         if (__btowc (i) == WEOF)
-           bitset_set (sbcset, i);
  #else /* not RE_ENABLE_I18N */
        non_match = 1;
  #endif /* not RE_ENABLE_I18N */
@@ -3536,6 +3600,12 @@ build_charclass_op (dfa, trans, class_name, extra, not, err)
  #endif /* not RE_ENABLE_I18N */
      bitset_not (sbcset);
  
+#ifdef RE_ENABLE_I18N
+  /* Ensure only single byte characters are set.  */
+  if (dfa->mb_cur_max > 1)
+    bitset_mask (sbcset, dfa->sb_char);
+#endif
+
    /* Build a tree for simple bracket.  */
    br_token.type = SIMPLE_BRACKET;
    br_token.opr.sbcset = sbcset;
diff --git a/posix/regex_internal.h b/posix/regex_internal.h

index 5111f6d793ca738e082a27ba4ff5ab3c2e0918ac..f8e99ee06aff21e4e6751fb9be35f49c090ca00a 100644 (file)
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -133,6 +133,7 @@ typedef unsigned int *re_bitset_ptr_t;
  static inline void bitset_not (bitset set);
  static inline void bitset_merge (bitset dest, const bitset src);
  static inline void bitset_not_merge (bitset dest, const bitset src);
+static inline void bitset_mask (bitset dest, const bitset src);
  
  #define PREV_WORD_CONSTRAINT 0x0001
  #define PREV_NOTWORD_CONSTRAINT 0x0002
@@ -281,8 +282,11 @@ typedef struct
    unsigned int constraint : 10;        /* context constraint */
    unsigned int duplicated : 1;
  #ifdef RE_ENABLE_I18N
+  /* These 2 bits can be moved into the union if needed (e.g. if running out
+     of bits; move opr.c to opr.c.c and move the flags to opr.c.flags).  */
    unsigned int mb_partial : 1;
  #endif
+  unsigned int word_char : 1;
  } re_token_t;
  
  #define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
@@ -601,6 +605,7 @@ struct re_dfa_t
    re_dfastate_t *init_state_begbuf;
    bin_tree_t *str_tree;
    bin_tree_storage_t *str_tree_storage;
+  re_bitset_ptr_t sb_char;
    int str_tree_storage_idx;
  
    /* number of subexpressions `re_nsub' is in regex_t.  */
@@ -711,6 +716,16 @@ bitset_not_merge (dest, src)
      dest[i] |= ~src[i];
  }
  
+static inline void
+bitset_mask (dest, src)
+     bitset dest;
+     const bitset src;
+{
+  int bitset_i;
+  for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
+    dest[bitset_i] &= src[bitset_i];
+}
+
  #if defined RE_ENABLE_I18N && !defined RE_NO_INTERNAL_PROTOTYPES
  /* Inline functions for re_string.  */
  static inline int
diff --git a/posix/regexec.c b/posix/regexec.c

index 0b524856ca760e1b703d5eaffd529dd791c8cef6..58ac9c82c4f8a7df205c06aa9493bcf997ae6a8c 100644 (file)
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -3341,7 +3341,12 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
         }
        else if (type == OP_PERIOD)
         {
-         bitset_set_all (accepts);
+#ifdef RE_ENABLE_I18N
+         if (dfa->mb_cur_max > 1)
+           bitset_merge (accepts, dfa->sb_char);
+         else
+#endif   
+           bitset_set_all (accepts);
           if (!(preg->syntax & RE_DOT_NEWLINE))
             bitset_clear (accepts, '\n');
           if (preg->syntax & RE_DOT_NOT_NULL)
@@ -3362,8 +3367,6 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
          match it the context.  */
        if (constraint)
         {
-         int word_char_max;
-
           if (constraint & NEXT_NEWLINE_CONSTRAINT)
             {
               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
@@ -3379,16 +3382,28 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
               continue;
             }
  
-         /* This assumes ASCII compatible locale.  We cannot say
-            anything about the non-ascii chars.  */
-         word_char_max
-           = dfa->mb_cur_max > 1 ? BITSET_UINTS / 2 : BITSET_UINTS;
           if (constraint & NEXT_WORD_CONSTRAINT)
-           for (j = 0; j < word_char_max; ++j)
-             accepts[j] &= dfa->word_char[j];
+           {
+#ifdef RE_ENABLE_I18N
+             if (dfa->mb_cur_max > 1)
+               for (j = 0; j < BITSET_UINTS; ++j)
+                 accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]);
+             else
+#endif
+               for (j = 0; j < BITSET_UINTS; ++j)
+                 accepts[j] &= dfa->word_char[j];
+           }
           if (constraint & NEXT_NOTWORD_CONSTRAINT)
-           for (j = 0; j < word_char_max; ++j)
-             accepts[j] &= ~dfa->word_char[j];
+           {
+#ifdef RE_ENABLE_I18N
+             if (dfa->mb_cur_max > 1)
+               for (j = 0; j < BITSET_UINTS; ++j)
+                 accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]);
+             else
+#endif
+               for (j = 0; j < BITSET_UINTS; ++j)
+                 accepts[j] &= ~dfa->word_char[j];
+           }
         }
  
        /* Then divide `accepts' into DFA states, or create a new
author	Ulrich Drepper <drepper@redhat.com>
	Mon, 24 Nov 2003 19:30:51 +0000 (19:30 +0000)
committer	Ulrich Drepper <drepper@redhat.com>
	Mon, 24 Nov 2003 19:30:51 +0000 (19:30 +0000)
ChangeLog		patch \| blob \| history
posix/bug-regex19.c		patch \| blob \| history
posix/regcomp.c		patch \| blob \| history
posix/regex_internal.h		patch \| blob \| history
posix/regexec.c		patch \| blob \| history