regex: Allow any single char to be SIMPLE

author Karl Williamson <public@khwilliamson.com>

Sat, 6 Oct 2012 20:57:38 +0000 (14:57 -0600)

committer Karl Williamson <public@khwilliamson.com>

Tue, 9 Oct 2012 17:16:05 +0000 (11:16 -0600)
author Karl Williamson <public@khwilliamson.com>
Sat, 6 Oct 2012 20:57:38 +0000 (14:57 -0600)
committer Karl Williamson <public@khwilliamson.com>
Tue, 9 Oct 2012 17:16:05 +0000 (11:16 -0600)
diff --git a/regcomp.c b/regcomp.c

index 7d9e3c4..128bbbb 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -229,8 +229,7 @@ typedef struct RExC_state_t {
  #define        HASWIDTH        0x01    /* Known to match non-null strings. */
  
  /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
- * character, and if utf8, must be invariant.  Note that this is not the same
- * thing as REGNODE_SIMPLE */
+ * character.  Note that this is not the same thing as REGNODE_SIMPLE */
  #define        SIMPLE          0x02
  #define        SPSTART         0x04    /* Starts with * or +. */
  #define TRYAGAIN       0x08    /* Weeded out a declaration. */
@@ -9917,13 +9916,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
       * it (by setting <*flagp>, and potentially populating it with a single
       * character.
       *
-     * If <len> is non-zero, this function assumes that the node has already
-     * been populated, and just does the sizing.  In this case <code_point>
-     * should be the final code point that has already been placed into the
-     * node.  This value will be ignored except that under some circumstances
-     * <*flagp> is set based on it.
+     * If <len> (the length in bytes) is non-zero, this function assumes that
+     * the node has already been populated, and just does the sizing.  In this
+     * case <code_point> should be the final code point that has already been
+     * placed into the node.  This value will be ignored except that under some
+     * circumstances <*flagp> is set based on it.
       *
-     * If <len is zero, the function assumes that the node is to contain only
+     * If <len> is zero, the function assumes that the node is to contain only
       * the single character given by <code_point> and calculates what <len>
       * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
       * additionally will populate the node's STRING with <code_point>, if <len>
@@ -9974,8 +9973,12 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
      }
  
      *flagp |= HASWIDTH;
-    if (len == 1 && (code_point != LATIN_SMALL_LETTER_SHARP_S
-                     || ! FOLD || ! DEPENDS_SEMANTICS))
+
+    /* A single character node is SIMPLE, except for the special-cased SHARP S
+     * under /di. */
+    if ((len == 1 || (UTF && len == UNISKIP(code_point)))
+        && (code_point != LATIN_SMALL_LETTER_SHARP_S
+            || ! FOLD || ! DEPENDS_SEMANTICS))
      {
          *flagp |= SIMPLE;
      }
diff --git a/regexec.c b/regexec.c

index f2833ac..05a96ac 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -5507,21 +5507,21 @@ NULL
         } \
      }
  
-       case STAR:              /*  /A*B/ where A is width 1 */
+        case STAR:             /*  /A*B/ where A is width 1 char */
             ST.paren = 0;
             ST.min = 0;
             ST.max = REG_INFTY;
             scan = NEXTOPER(scan);
             goto repeat;
  
-       case PLUS:              /*  /A+B/ where A is width 1 */
+        case PLUS:             /*  /A+B/ where A is width 1 char */
             ST.paren = 0;
             ST.min = 1;
             ST.max = REG_INFTY;
             scan = NEXTOPER(scan);
             goto repeat;
  
-       case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
+       case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
             ST.paren = scan->flags;     /* Which paren to set */
             ST.lastparen      = rex->lastparen;
             ST.lastcloseparen = rex->lastcloseparen;
@@ -5537,7 +5537,7 @@ NULL
              scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
             goto repeat;
  
-       case CURLY:             /*  /A{m,n}B/ where A is width 1 */
+       case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
             ST.paren = 0;
             ST.min = ARG1(scan);  /* min to match */
             ST.max = ARG2(scan);  /* max to match */
@@ -6344,25 +6344,30 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
         scan = loceol;
         break;
      case EXACT:
-       /* To get here, EXACTish nodes must have *byte* length == 1.  That
-        * means they match only characters in the string that can be expressed
-        * as a single byte.  For non-utf8 strings, that means a simple match.
-        * For utf8 strings, the character matched must be an invariant, or
-        * downgradable to a single byte.  The pattern's utf8ness is
-        * irrelevant, as since it's a single byte, it either isn't utf8, or if
-        * it is, it's an invariant */
-
         c = (U8)*STRING(p);
-       assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
  
         if (! utf8_target || UNI_IS_INVARIANT(c)) {
             while (scan < loceol && UCHARAT(scan) == c) {
                 scan++;
             }
         }
+       else if (UTF_PATTERN) {
+            STRLEN scan_char_len;
+
+           loceol = PL_regeol;
+
+           while (hardcount < max
+                   && scan + (scan_char_len = UTF8SKIP(scan)) < loceol
+                   && scan_char_len <= STR_LEN(p)
+                   && memEQ(scan, STRING(p), scan_char_len))
+            {
+               scan += scan_char_len;
+               hardcount++;
+           }
+        }
         else {
  
-           /* Here, the string is utf8, and the pattern char is different
+           /* Here, the string is utf8, the pattern isn't, but <c> is different
              * in utf8 than not, so can't compare them directly.  Outside the
              * loop, find the two utf8 bytes that represent c, and then
              * look for those in sequence in the utf8 string */
@@ -6398,17 +6403,19 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case EXACTFU:
         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
  
-       /* The comments for the EXACT case above apply as well to these fold
-        * ones */
-
      do_exactf:
         c = (U8)*STRING(p);
  
-       if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
+       if (utf8_target
+            || OP(p) == EXACTFU_SS
+            || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c)))
+        {
+            /* Use full Unicode fold matching */
             char *tmpeol = loceol;
+            STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
             while (hardcount < max
                     && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
-                                  STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
+                       STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags))
             {
                 scan = tmpeol;
                 tmpeol = loceol;
author	Karl Williamson <public@khwilliamson.com>
	Sat, 6 Oct 2012 20:57:38 +0000 (14:57 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Tue, 9 Oct 2012 17:16:05 +0000 (11:16 -0600)
regcomp.c		patch | blob | history
regexec.c		patch | blob | history