regcomp.c: Use POSIXA, NPOSIXA

author Karl Williamson <public@khwilliamson.com>

Fri, 20 Jul 2012 16:23:14 +0000 (10:23 -0600)

committer Karl Williamson <public@khwilliamson.com>

Wed, 25 Jul 2012 03:13:49 +0000 (21:13 -0600)
author Karl Williamson <public@khwilliamson.com>
Fri, 20 Jul 2012 16:23:14 +0000 (10:23 -0600)
committer Karl Williamson <public@khwilliamson.com>
Wed, 25 Jul 2012 03:13:49 +0000 (21:13 -0600)
diff --git a/regcomp.c b/regcomp.c

index 5a87e9c..8f48844 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -11124,6 +11124,10 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
   * changed since initialization, then there is a run-time definition. */
  #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
  
+/* Convert a regcomp.h named class to its handy.h class number.  A class and
+ * its complement (the next odd value) form a pair sharing one number. */
+#define namedclass_to_classnum(class)  ((class) / 2)
+
  /*
     parse a class specification and produce either an ANYOF node that
     matches the pattern or perhaps will be optimized into an EXACTish node
@@ -11865,6 +11869,7 @@ parseit:
       * Check if this is the case for this class */
      if (element_count == 1) {
          U8 op = END;
+        U8 arg = 0;
  
          if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
                                                [:digit:] or \p{foo} */
@@ -11942,7 +11947,26 @@ parseit:
                      op = (invert) ? NVERTWS : VERTWS;
                      break;
  
+                case ANYOF_MAX:
+                    break;
  
+                default:
+                    /* A generic posix class.  All the /a ones can be handled
+                     * by the POSIXA opcode.  And all are closed under folding
+                     * in the ASCII range, so FOLD doesn't matter */
+                    if (AT_LEAST_ASCII_RESTRICTED
+                        || (! LOC && namedclass == ANYOF_ASCII))
+                    {
+                        /* The odd numbered ones are the complements of the
+                         * next-lower even number one */
+                        if (namedclass % 2 == 1) {
+                            invert = ! invert;
+                            namedclass--;
+                        }
+                        arg = namedclass_to_classnum(namedclass);
+                        op = (invert) ? NPOSIXA : POSIXA;
+                    }
+                    break;
              }
          }
          else if (value == prevvalue) {
@@ -11994,7 +12018,12 @@ parseit:
  
              ret = reg_node(pRExC_state, op);
  
-            if (PL_regkind[op] == EXACT) {
+            if (PL_regkind[op] == POSIXD) {
+                if (! SIZE_ONLY) {
+                    FLAGS(ret) = arg;
+                }
+            }
+            else if (PL_regkind[op] == EXACT) {
                  alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
              }
  
@@ -13543,6 +13572,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
  
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
      }
+    else if (k == POSIXD) {
+        U8 index = FLAGS(o) * 2;    /* class number -> slot in anyofs[] */
+        if (index >= (sizeof(anyofs) / sizeof(anyofs[0]))) { /* past the end */
+            Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+        }
+        else {
+            sv_catpv(sv, anyofs[index]);
+        }
+    }
      else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
  #else
diff --git a/regexec.c b/regexec.c

index af64a69..dca278b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1814,6 +1814,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 !is_HORIZWS_latin1(s)
             );      
             break;
+       case POSIXA:
+           /* Don't need to worry about utf8, as it can match only a single
+            * byte invariant character.  The flag in this node type is the
+            * class number to pass to _generic_isCC() to build a mask for
+            * searching in PL_charclass[] */
+           REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+           break;
+       case NPOSIXA:
+           REXEC_FBC_CSCAN(
+               !_generic_isCC_A(*s, FLAGS(c)),
+               !_generic_isCC_A(*s, FLAGS(c))
+           );
+           break;
+
         case AHOCORASICKC:
         case AHOCORASICK: 
             {
@@ -3881,6 +3895,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 DIGITA, NDIGITA, isDIGIT_A,
                 digit, "0");
  
+        case POSIXA:
+            if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
+                sayNO;
+            }
+            /* Matched a utf8-invariant, so don't have to worry about utf8 */
+            nextchr = UCHARAT(++locinput);
+            break;
+        case NPOSIXA:
+            if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
+                sayNO;
+            }
+            if (utf8_target) {
+                locinput += PL_utf8skip[nextchr];
+                nextchr = UCHARAT(locinput);
+            }
+            else {
+                nextchr = UCHARAT(++locinput);
+            }
+            break;
+
         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
                        a Unicode extended Grapheme Cluster */
             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
@@ -6298,6 +6332,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             scan++;
         }
         break;
+
+    case POSIXA:
+       while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+           scan++;
+       }
+       break;
+    case NPOSIXA:
+       if (utf8_target) {
+           while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+               scan += UTF8SKIP(scan);
+           }
+       }
+       else {
+           while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+               scan++;
+           }
+       }
+       break;
      case NALNUMA:
         if (utf8_target) {
             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
diff --git a/t/re/re_tests b/t/re/re_tests

index 46332b4..3d28155 100644 (file)
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1692,4 +1692,8 @@ ab[c\\\](??{"x"})]{3}d    ab\\](d y       -       -
  [^\n]+ \nb     y       $&      b
  [^\n]+ a\n     y       $&      a
  
+# /a has no effect on properties
+(?a:\p{Any})   \x{100} y       $&      \x{100}
+(?aa:\p{Any})  \x{100} y       $&      \x{100}
+
  # vim: softtabstop=0 noexpandtab
author	Karl Williamson <public@khwilliamson.com>
	Fri, 20 Jul 2012 16:23:14 +0000 (10:23 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Wed, 25 Jul 2012 03:13:49 +0000 (21:13 -0600)
regcomp.c		patch \| blob \| history
regexec.c		patch \| blob \| history
t/re/re_tests		patch \| blob \| history