Polymorphic regexps.

author Jarkko Hietaniemi <jhi@iki.fi>

Sun, 17 Dec 2000 05:31:37 +0000 (05:31 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Sun, 17 Dec 2000 05:31:37 +0000 (05:31 +0000)
author Jarkko Hietaniemi <jhi@iki.fi>
Sun, 17 Dec 2000 05:31:37 +0000 (05:31 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Sun, 17 Dec 2000 05:31:37 +0000 (05:31 +0000)
diff --git a/embed.h b/embed.h

index 64c1eaf..3b54154 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -543,6 +543,7 @@
  #define ref                    Perl_ref
  #define refkids                        Perl_refkids
  #define regdump                        Perl_regdump
+#define regclass_swash         Perl_regclass_swash
  #define pregexec               Perl_pregexec
  #define pregfree               Perl_pregfree
  #define pregcomp               Perl_pregcomp
@@ -995,7 +996,6 @@
  #define regbranch              S_regbranch
  #define reguni                 S_reguni
  #define regclass               S_regclass
-#define regclassutf8           S_regclassutf8
  #define regcurly               S_regcurly
  #define reg_node               S_reg_node
  #define regpiece               S_regpiece
@@ -1025,7 +1025,6 @@
  #define regrepeat_hard         S_regrepeat_hard
  #define regtry                 S_regtry
  #define reginclass             S_reginclass
-#define reginclassutf8         S_reginclassutf8
  #define regcppush              S_regcppush
  #define regcppop               S_regcppop
  #define regcp_set_to           S_regcp_set_to
@@ -2015,6 +2014,7 @@
  #define ref(a,b)               Perl_ref(aTHX_ a,b)
  #define refkids(a,b)           Perl_refkids(aTHX_ a,b)
  #define regdump(a)             Perl_regdump(aTHX_ a)
+#define regclass_swash(a,b,c)  Perl_regclass_swash(aTHX_ a,b,c)
  #define pregexec(a,b,c,d,e,f,g)        Perl_pregexec(aTHX_ a,b,c,d,e,f,g)
  #define pregfree(a)            Perl_pregfree(aTHX_ a)
  #define pregcomp(a,b,c)                Perl_pregcomp(aTHX_ a,b,c)
@@ -2459,7 +2459,6 @@
  #define regbranch(a,b,c)       S_regbranch(aTHX_ a,b,c)
  #define reguni(a,b,c,d)                S_reguni(aTHX_ a,b,c,d)
  #define regclass(a)            S_regclass(aTHX_ a)
-#define regclassutf8(a)                S_regclassutf8(aTHX_ a)
  #define regcurly(a)            S_regcurly(aTHX_ a)
  #define reg_node(a,b)          S_reg_node(aTHX_ a,b)
  #define regpiece(a,b)          S_regpiece(aTHX_ a,b)
@@ -2487,8 +2486,7 @@
  #define regrepeat(a,b)         S_regrepeat(aTHX_ a,b)
  #define regrepeat_hard(a,b,c)  S_regrepeat_hard(aTHX_ a,b,c)
  #define regtry(a,b)            S_regtry(aTHX_ a,b)
-#define reginclass(a,b)                S_reginclass(aTHX_ a,b)
-#define reginclassutf8(a,b)    S_reginclassutf8(aTHX_ a,b)
+#define reginclass(a,b,c)      S_reginclass(aTHX_ a,b,c)
  #define regcppush(a)           S_regcppush(aTHX_ a)
  #define regcppop()             S_regcppop(aTHX)
  #define regcp_set_to(a)                S_regcp_set_to(aTHX_ a)
@@ -3950,6 +3948,8 @@
  #define refkids                        Perl_refkids
  #define Perl_regdump           CPerlObj::Perl_regdump
  #define regdump                        Perl_regdump
+#define Perl_regclass_swash    CPerlObj::Perl_regclass_swash
+#define regclass_swash         Perl_regclass_swash
  #define Perl_pregexec          CPerlObj::Perl_pregexec
  #define pregexec               Perl_pregexec
  #define Perl_pregfree          CPerlObj::Perl_pregfree
@@ -4787,8 +4787,6 @@
  #define reguni                 S_reguni
  #define S_regclass             CPerlObj::S_regclass
  #define regclass               S_regclass
-#define S_regclassutf8         CPerlObj::S_regclassutf8
-#define regclassutf8           S_regclassutf8
  #define S_regcurly             CPerlObj::S_regcurly
  #define regcurly               S_regcurly
  #define S_reg_node             CPerlObj::S_reg_node
@@ -4845,8 +4843,6 @@
  #define regtry                 S_regtry
  #define S_reginclass           CPerlObj::S_reginclass
  #define reginclass             S_reginclass
-#define S_reginclassutf8       CPerlObj::S_reginclassutf8
-#define reginclassutf8         S_reginclassutf8
  #define S_regcppush            CPerlObj::S_regcppush
  #define regcppush              S_regcppush
  #define S_regcppop             CPerlObj::S_regcppop
diff --git a/embed.pl b/embed.pl

index 9e2bd9c..32f3ddc 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -1873,6 +1873,7 @@ Ap        |void   |push_scope
  p      |OP*    |ref            |OP* o|I32 type
  p      |OP*    |refkids        |OP* o|I32 type
  Ap     |void   |regdump        |regexp* r
+Ap     |SV*    |regclass_swash |struct regnode *n|bool doinit|SV **initsvp
  Ap     |I32    |pregexec       |regexp* prog|char* stringarg \
                                 |char* strend|char* strbeg|I32 minend \
                                 |SV* screamer|U32 nosave
@@ -2366,7 +2367,6 @@ s |regnode*|regatom       |struct RExC_state_t*|I32 *
  s      |regnode*|regbranch     |struct RExC_state_t*|I32 *|I32
  s      |void   |reguni         |struct RExC_state_t*|UV|char *|STRLEN*
  s      |regnode*|regclass      |struct RExC_state_t*
-s      |regnode*|regclassutf8  |struct RExC_state_t*
  s      |I32    |regcurly       |char *
  s      |regnode*|reg_node      |struct RExC_state_t*|U8
  s      |regnode*|regpiece      |struct RExC_state_t*|I32 *
@@ -2401,8 +2401,7 @@ s |I32    |regmatch       |regnode *prog
  s      |I32    |regrepeat      |regnode *p|I32 max
  s      |I32    |regrepeat_hard |regnode *p|I32 max|I32 *lp
  s      |I32    |regtry         |regexp *prog|char *startpos
-s      |bool   |reginclass     |regnode *p|I32 c
-s      |bool   |reginclassutf8 |regnode *f|U8* p
+s      |bool   |reginclass     |regnode *n|U8 *p|bool do_utf8sv_is_utf8
  s      |CHECKPOINT|regcppush   |I32 parenfloor
  s      |char*|regcppop
  s      |char*|regcp_set_to     |I32 ss
diff --git a/mg.c b/mg.c

index f97c6ce..a61d167 100644 (file)
--- a/mg.c
+++ b/mg.c
@@ -391,7 +391,7 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg)
      case '5': case '6': case '7': case '8': case '9': case '&':
         if (PL_curpm && (rx = PL_curpm->op_pmregexp)) {
  
-           paren = atoi(mg->mg_ptr);
+           paren = atoi(mg->mg_ptr); /* $& is in [0] */
           getparen:
             if (paren <= rx->nparens &&
                 (s1 = rx->startp[paren]) != -1 &&
@@ -399,17 +399,15 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg)
             {
                 i = t1 - s1;
               getlen:
-               if (i > 0 && (PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) {
-                   char *s = rx->subbeg + s1;
+               if (i > 0 && DO_UTF8(PL_reg_sv)) {
+                   char *s    = rx->subbeg + s1;
                     char *send = rx->subbeg + t1;
-                   i = 0;
-                   while (s < send) {
-                       s += UTF8SKIP(s);
-                       i++;
-                   }
+
+                   i = Perl_utf8_length((U8*)s, (U8*)send);
                 }
-               if (i >= 0)
-                   return i;
+               if (i < 0)
+                   Perl_croak(aTHX_ "panic: magic_len: %d", i);
+               return i;
             }
         }
         return 0;
@@ -604,7 +602,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
              * Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj));
              * XXX Does the new way break anything?
              */
-           paren = atoi(mg->mg_ptr);
+           paren = atoi(mg->mg_ptr); /* $& is in [0] */
           getparen:
             if (paren <= rx->nparens &&
                 (s1 = rx->startp[paren]) != -1 &&
@@ -623,7 +621,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
                         PL_tainted = FALSE;
                     }
                     sv_setpvn(sv, s, i);
-                   if ((PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+                   if (DO_UTF8(PL_reg_sv))
                         SvUTF8_on(sv);
                     else
                         SvUTF8_off(sv);
diff --git a/objXSUB.h b/objXSUB.h

index 43537d3..60c6e90 100644 (file)
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -1263,6 +1263,10 @@
  #define Perl_regdump           pPerl->Perl_regdump
  #undef  regdump
  #define regdump                        Perl_regdump
+#undef  Perl_regclass_swash
+#define Perl_regclass_swash    pPerl->Perl_regclass_swash
+#undef  regclass_swash
+#define regclass_swash         Perl_regclass_swash
  #undef  Perl_pregexec
  #define Perl_pregexec          pPerl->Perl_pregexec
  #undef  pregexec
diff --git a/pp_ctl.c b/pp_ctl.c

index d079e4a..aff5815 100644 (file)
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -157,7 +157,7 @@ PP(pp_substcont)
      register char *m = cx->sb_m;
      char *orig = cx->sb_orig;
      register REGEXP *rx = cx->sb_rx;
-
+    
      rxres_restore(&cx->sb_rxres, rx);
  
      if (cx->sb_iters++) {
@@ -176,8 +176,8 @@ PP(pp_substcont)
                                       : (REXEC_COPY_STR|REXEC_IGNOREPOS|REXEC_NOT_FIRST))))
         {
             SV *targ = cx->sb_targ;
-           sv_catpvn(dstr, s, cx->sb_strend - s);
  
+           sv_catpvn(dstr, s, cx->sb_strend - s);
             cx->sb_rxtainted |= RX_MATCH_TAINTED(rx);
  
             (void)SvOOK_off(targ);
@@ -189,9 +189,11 @@ PP(pp_substcont)
             sv_free(dstr);
  
             TAINT_IF(cx->sb_rxtainted & 1);
+           if (pm->op_pmdynflags & PMdf_UTF8)
+               SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */
             PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1)));
  
-           (void)SvPOK_only(targ);
+           (void)SvPOK_only_UTF8(targ);
             TAINT_IF(cx->sb_rxtainted);
             SvSETMAGIC(targ);
             SvTAINT(targ);
@@ -209,7 +211,8 @@ PP(pp_substcont)
         cx->sb_strend = s + (cx->sb_strend - m);
      }
      cx->sb_m = m = rx->startp[0] + orig;
-    sv_catpvn(dstr, s, m-s);
+    if (m > s)
+       sv_catpvn(dstr, s, m-s);
      cx->sb_s = rx->endp[0] + orig;
      { /* Update the pos() information. */
         SV *sv = cx->sb_targ;
diff --git a/pp_hot.c b/pp_hot.c

index 6a5b96f..2904d9f 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1179,6 +1179,7 @@ PP(pp_match)
         TARG = DEFSV;
         EXTEND(SP,1);
      }
+    PL_reg_sv = TARG;
      PUTBACK;                           /* EVAL blocks need stack_sp. */
      s = SvPV(TARG, len);
      strend = s + len;
@@ -1268,27 +1269,25 @@ play_it_again:
         RX_MATCH_TAINTED_on(rx);
      TAINT_IF(RX_MATCH_TAINTED(rx));
      if (gimme == G_ARRAY) {
-       I32 iters, i, len;
+       I32 nparens, i, len;
  
-       iters = rx->nparens;
-       if (global && !iters)
+       nparens = rx->nparens;
+       if (global && !nparens)
             i = 1;
         else
             i = 0;
         SPAGAIN;                        /* EVAL blocks could move the stack. */
-       EXTEND(SP, iters + i);
-       EXTEND_MORTAL(iters + i);
-       for (i = !i; i <= iters; i++) {
+       EXTEND(SP, nparens + i);
+       EXTEND_MORTAL(nparens + i);
+       for (i = !i; i <= nparens; i++) {
             PUSHs(sv_newmortal());
             /*SUPPRESS 560*/
             if ((rx->startp[i] != -1) && rx->endp[i] != -1 ) {
                 len = rx->endp[i] - rx->startp[i];
                 s = rx->startp[i] + truebase;
                 sv_setpvn(*SP, s, len);
-               if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) {
+               if (DO_UTF8(TARG))
                     SvUTF8_on(*SP);
-                   sv_utf8_downgrade(*SP, TRUE);
-               }
             }
         }
         if (global) {
@@ -1298,7 +1297,7 @@ play_it_again:
             r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST;
             goto play_it_again;
         }
-       else if (!iters)
+       else if (!nparens)
             XPUSHs(&PL_sv_yes);
         LEAVE_SCOPE(oldsave);
         RETURN;
@@ -1831,6 +1830,7 @@ PP(pp_subst)
         TARG = DEFSV;
         EXTEND(SP,1);
      }
+    PL_reg_sv = TARG;
      if (SvFAKE(TARG) && SvREADONLY(TARG))
         sv_force_normal(TARG);
      if (SvREADONLY(TARG)
@@ -1847,7 +1847,7 @@ PP(pp_subst)
      if (PL_tainted)
         rxtainted |= 2;
      TAINT_NOT;
-
+    
    force_it:
      if (!pm || !s)
         DIE(aTHX_ "panic: do_subst");
@@ -2004,6 +2004,8 @@ PP(pp_subst)
         rxtainted |= RX_MATCH_TAINTED(rx);
         dstr = NEWSV(25, len);
         sv_setpvn(dstr, m, s-m);
+       if (DO_UTF8(TARG))
+           SvUTF8_on(dstr);
         PL_curpm = pm;
         if (!c) {
             register PERL_CONTEXT *cx;
@@ -2030,7 +2032,8 @@ PP(pp_subst)
                 sv_catpvn(dstr, c, clen);
             if (once)
                 break;
-       } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m, TARG, NULL, r_flags));
+       } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m,
+                            TARG, NULL, r_flags));
         sv_catpvn(dstr, s, strend - s);
  
         (void)SvOOK_off(TARG);
diff --git a/proto.h b/proto.h

index 4fc260e..1bcb5cd 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -616,6 +616,7 @@ PERL_CALLCONV void  Perl_push_scope(pTHX);
  PERL_CALLCONV OP*      Perl_ref(pTHX_ OP* o, I32 type);
  PERL_CALLCONV OP*      Perl_refkids(pTHX_ OP* o, I32 type);
  PERL_CALLCONV void     Perl_regdump(pTHX_ regexp* r);
+PERL_CALLCONV SV*      Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **initsvp);
  PERL_CALLCONV I32      Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave);
  PERL_CALLCONV void     Perl_pregfree(pTHX_ struct regexp* r);
  PERL_CALLCONV regexp*  Perl_pregcomp(pTHX_ char* exp, char* xend, PMOP* pm);
@@ -1111,7 +1112,6 @@ STATIC regnode*   S_regatom(pTHX_ struct RExC_state_t*, I32 *);
  STATIC regnode*        S_regbranch(pTHX_ struct RExC_state_t*, I32 *, I32);
  STATIC void    S_reguni(pTHX_ struct RExC_state_t*, UV, char *, STRLEN*);
  STATIC regnode*        S_regclass(pTHX_ struct RExC_state_t*);
-STATIC regnode*        S_regclassutf8(pTHX_ struct RExC_state_t*);
  STATIC I32     S_regcurly(pTHX_ char *);
  STATIC regnode*        S_reg_node(pTHX_ struct RExC_state_t*, U8);
  STATIC regnode*        S_regpiece(pTHX_ struct RExC_state_t*, I32 *);
@@ -1141,8 +1141,7 @@ STATIC I32        S_regmatch(pTHX_ regnode *prog);
  STATIC I32     S_regrepeat(pTHX_ regnode *p, I32 max);
  STATIC I32     S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp);
  STATIC I32     S_regtry(pTHX_ regexp *prog, char *startpos);
-STATIC bool    S_reginclass(pTHX_ regnode *p, I32 c);
-STATIC bool    S_reginclassutf8(pTHX_ regnode *f, U8* p);
+STATIC bool    S_reginclass(pTHX_ regnode *n, U8 *p, bool do_utf8sv_is_utf8);
  STATIC CHECKPOINT      S_regcppush(pTHX_ I32 parenfloor);
  STATIC char*   S_regcppop(pTHX);
  STATIC char*   S_regcp_set_to(pTHX_ I32 ss);
diff --git a/regcomp.c b/regcomp.c

index aae2ced..69a9f91 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -118,7 +118,7 @@ typedef struct RExC_state_t {
      char       *end;                   /* End of input for compile */
      char       *parse;                 /* Input-scan pointer. */
      I32                whilem_seen;            /* number of WHILEM in this expr */
-    regnode    *emit;                  /* Code-emit pointer; &regdummy = don't */
+    regnode    *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
      I32                naughty;                /* How bad is this pattern? */
      I32                sawback;                /* Did we see \1, ...? */
      U32                seen;
@@ -234,8 +234,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  #define LOC (RExC_flags16 & PMf_LOCALE)
  #define FOLD (RExC_flags16 & PMf_FOLD)
  
-#define OOB_CHAR8              1234
-#define OOB_UTF8               123456
+#define OOB_UNICODE            12345678
  #define OOB_NAMEDCLASS         -1
  
  #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
@@ -1196,7 +1195,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                 break;
             }
         }
-       else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) {
+       else if (strchr((char*)PL_simple,OP(scan))) {
             int value;
  
             if (flags & SCF_DO_SUBSTR) {
@@ -1210,20 +1209,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                 /* Some of the logic below assumes that switching
                    locale on will only add false positives. */
                 switch (PL_regkind[(U8)OP(scan)]) {
-               case ANYUTF8:
                 case SANY:
-               case SANYUTF8:
-               case ALNUMUTF8:
-               case ANYOFUTF8:
-               case ALNUMLUTF8:
-               case NALNUMUTF8:
-               case NALNUMLUTF8:
-               case SPACEUTF8:
-               case NSPACEUTF8:
-               case SPACELUTF8:
-               case NSPACELUTF8:
-               case DIGITUTF8:
-               case NDIGITUTF8:
                 default:
                   do_default:
                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
@@ -1750,7 +1736,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
             /* turn .* into ^.* with an implied $*=1 */
             int type = OP(NEXTOPER(first));
  
-           if (type == REG_ANY || type == ANYUTF8)
+           if (type == REG_ANY)
                 type = ROPT_ANCH_MBOL;
             else
                 type = ROPT_ANCH_SBOL;
@@ -1850,8 +1836,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
             longest_fixed_length = 0;
         }
         if (r->regstclass 
-           && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8
-               || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY))
+           && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
             r->regstclass = NULL;
         if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
             && !(data.start_class->flags & ANYOF_EOS)
@@ -1866,6 +1851,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
                        struct regnode_charclass_class);
             r->regstclass = (regnode*)RExC_rx->data->data[n];
             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
+           PL_regdata = r->data; /* for regprop() */
             DEBUG_r((sv = sv_newmortal(),
                      regprop(sv, (regnode*)data.start_class),
                      PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
@@ -1933,7 +1919,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
         r->reganch |= ROPT_EVAL_SEEN;
      Newz(1002, r->startp, RExC_npar, I32);
      Newz(1002, r->endp, RExC_npar, I32);
-    PL_regdata = r->data; /* for regprop() ANYOFUTF8 */
+    PL_regdata = r->data; /* for regprop() */
      DEBUG_r(regdump(r));
      return(r);
  }
@@ -2556,26 +2542,17 @@ tryagain:
         break;
      case '.':
         nextchar(pRExC_state);
-       if (UTF) {
-           if (RExC_flags16 & PMf_SINGLELINE)
-               ret = reg_node(pRExC_state, SANYUTF8);
-           else
-               ret = reg_node(pRExC_state, ANYUTF8);
-           *flagp |= HASWIDTH;
-       }
-       else {
-           if (RExC_flags16 & PMf_SINGLELINE)
-               ret = reg_node(pRExC_state, SANY);
-           else
-               ret = reg_node(pRExC_state, REG_ANY);
-           *flagp |= HASWIDTH|SIMPLE;
-       }
+       if (RExC_flags16 & PMf_SINGLELINE)
+           ret = reg_node(pRExC_state, SANY);
+       else
+           ret = reg_node(pRExC_state, REG_ANY);
+       *flagp |= HASWIDTH|SIMPLE;
         RExC_naughty++;
         break;
      case '[':
      {
         char *oregcomp_parse = ++RExC_parse;
-       ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state));
+       ret = regclass(pRExC_state);
         if (*RExC_parse != ']') {
             RExC_parse = oregcomp_parse;
             vFAIL("Unmatched [");
@@ -2659,20 +2636,14 @@ tryagain:
                 is_utf8_mark((U8*)"~");         /* preload table */
             break;
         case 'w':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
-                   : (LOC ? ALNUML     : ALNUM));
+           ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
             *flagp |= HASWIDTH|SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_alnum)
                 is_utf8_alnum((U8*)"a");        /* preload table */
             break;
         case 'W':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
-                   : (LOC ? NALNUML     : NALNUM));
+           ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
             *flagp |= HASWIDTH|SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_alnum)
@@ -2681,10 +2652,7 @@ tryagain:
         case 'b':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
-                   : (LOC ? BOUNDL     : BOUND));
+           ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
             *flagp |= SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_alnum)
@@ -2693,44 +2661,35 @@ tryagain:
         case 'B':
             RExC_seen_zerolen++;
             RExC_seen |= REG_SEEN_LOOKBEHIND;
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
-                   : (LOC ? NBOUNDL     : NBOUND));
+           ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
             *flagp |= SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_alnum)
                 is_utf8_alnum((U8*)"a");        /* preload table */
             break;
         case 's':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? SPACELUTF8 : SPACEUTF8)
-                   : (LOC ? SPACEL     : SPACE));
+           ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
             *flagp |= HASWIDTH|SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_space)
                 is_utf8_space((U8*)" ");        /* preload table */
             break;
         case 'S':
-           ret = reg_node(pRExC_state, 
-               UTF
-                   ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
-                   : (LOC ? NSPACEL     : NSPACE));
+           ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
             *flagp |= HASWIDTH|SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_space)
                 is_utf8_space((U8*)" ");        /* preload table */
             break;
         case 'd':
-           ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT);
+           ret = reg_node(pRExC_state, DIGIT);
             *flagp |= HASWIDTH|SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_digit)
                 is_utf8_digit((U8*)"1");        /* preload table */
             break;
         case 'D':
-           ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT);
+           ret = reg_node(pRExC_state, NDIGIT);
             *flagp |= HASWIDTH|SIMPLE;
             nextchar(pRExC_state);
             if (UTF && !PL_utf8_digit)
@@ -2754,7 +2713,7 @@ tryagain:
                     RExC_end = RExC_parse + 2;
                 RExC_parse--;
  
-               ret = regclassutf8(pRExC_state);
+               ret = regclass(pRExC_state);
  
                 RExC_end = oldregxend;
                 RExC_parse--;
@@ -3194,58 +3153,108 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
  STATIC regnode *
  S_regclass(pTHX_ RExC_state_t *pRExC_state)
  {
-    register U32 value;
-    register I32 lastvalue = OOB_CHAR8;
-    register I32 range = 0;
+    register UV value;
+    register IV lastvalue = OOB_UNICODE;
+    register IV range = 0;
      register regnode *ret;
      STRLEN numlen;
-    I32 namedclass;
+    IV namedclass;
      char *rangebegin;
      bool need_class = 0;
+    SV *listsv;
+    register char *e;
+    UV n;
+
+    ret = reganode(pRExC_state, ANYOF, 0);
+
+    if (!SIZE_ONLY)
+       ANYOF_FLAGS(ret) = 0;
+
+    if (*RExC_parse == '^') {  /* Complement of range. */
+       RExC_naughty++;
+       RExC_parse++;
+       if (!SIZE_ONLY)
+           ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+    }
  
-    ret = reg_node(pRExC_state, ANYOF);
      if (SIZE_ONLY)
         RExC_size += ANYOF_SKIP;
      else {
-       ret->flags = 0;
-       ANYOF_BITMAP_ZERO(ret);
         RExC_emit += ANYOF_SKIP;
         if (FOLD)
             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
         if (LOC)
             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
-    }
-    if (*RExC_parse == '^') {  /* Complement of range. */
-       RExC_naughty++;
-       RExC_parse++;
-       if (!SIZE_ONLY)
-           ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+       ANYOF_BITMAP_ZERO(ret);
+       listsv = newSVpvn("# comment\n", 10);
      }
  
      if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
         checkposixcc(pRExC_state);
  
      if (*RExC_parse == ']' || *RExC_parse == '-')
-       goto skipcond;          /* allow 1st char to be ] or - */
+       goto charclassloop;             /* allow 1st char to be ] or - */
+
      while (RExC_parse < RExC_end && *RExC_parse != ']') {
-       skipcond:
-       namedclass = OOB_NAMEDCLASS;
+
+    charclassloop:
+
+       namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
+
         if (!range)
             rangebegin = RExC_parse;
-       value = UCHARAT(RExC_parse++);
+       if (UTF) {
+           value = utf8_to_uv((U8*)RExC_parse,
+                              RExC_end - RExC_parse,
+                              &numlen, 0);
+           RExC_parse += numlen;
+       }
+       else
+           value = UCHARAT(RExC_parse++);
         if (value == '[')
             namedclass = regpposixcc(pRExC_state, value);
         else if (value == '\\') {
-           value = UCHARAT(RExC_parse++);
+           if (UTF) {
+               value = utf8_to_uv((U8*)RExC_parse,
+                                  RExC_end - RExC_parse,
+                                  &numlen, 0);
+               RExC_parse += numlen;
+           }
+           else
+               value = UCHARAT(RExC_parse++);
             /* Some compilers cannot handle switching on 64-bit integer
-            * values, therefore the 'value' cannot be an UV. --jhi */
-           switch (value) {
+            * values, therefore value cannot be an UV.  Yes, this will
+            * be a problem later if we want switch on Unicode.  --jhi */
+           switch ((I32)value) {
             case 'w':   namedclass = ANYOF_ALNUM;       break;
             case 'W':   namedclass = ANYOF_NALNUM;      break;
             case 's':   namedclass = ANYOF_SPACE;       break;
             case 'S':   namedclass = ANYOF_NSPACE;      break;
             case 'd':   namedclass = ANYOF_DIGIT;       break;
             case 'D':   namedclass = ANYOF_NDIGIT;      break;
+           case 'p':
+           case 'P':
+               if (*RExC_parse == '{') {
+                   e = strchr(RExC_parse++, '}');
+                    if (!e)
+                        vFAIL("Missing right brace on \\p{}");
+                   n = e - RExC_parse;
+               }
+               else {
+                   e = RExC_parse;
+                   n = 1;
+               }
+               if (!SIZE_ONLY) {
+                   if (value == 'p')
+                       Perl_sv_catpvf(aTHX_ listsv,
+                                      "+utf8::%.*s\n", (int)n, RExC_parse);
+                   else
+                       Perl_sv_catpvf(aTHX_ listsv,
+                                      "!utf8::%.*s\n", (int)n, RExC_parse);
+               }
+               RExC_parse = e + 1;
+               ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+               continue;
             case 'n':   value = '\n';                   break;
             case 'r':   value = '\r';                   break;
             case 't':   value = '\t';                   break;
@@ -3259,9 +3268,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
             case 'a':   value = '\057';                 break;
  #endif
             case 'x':
-               numlen = 0;             /* disallow underscores */
-               value = (UV)scan_hex(RExC_parse, 2, &numlen);
-               RExC_parse += numlen;
+               if (*RExC_parse == '{') {
+                   e = strchr(RExC_parse++, '}');
+                    if (!e) 
+                        vFAIL("Missing right brace on \\x{}");
+                   numlen = 1;         /* allow underscores */
+                   value = (UV)scan_hex(RExC_parse,
+                                        e - RExC_parse,
+                                        &numlen);
+                   RExC_parse = e + 1;
+               }
+               else {
+                   numlen = 0;         /* disallow underscores */
+                   value = (UV)scan_hex(RExC_parse, 2, &numlen);
+                   RExC_parse += numlen;
+               }
                 break;
             case 'c':
                 value = UCHARAT(RExC_parse++);
@@ -3275,16 +3296,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                 break;
             default:
                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-
-                   vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value);
+                   vWARN2(RExC_parse,
+                          "Unrecognized escape \\%c in character class passed through",
+                          (int)value);
                 break;
             }
-       }
-       if (namedclass > OOB_NAMEDCLASS) {
-           if (!need_class && !SIZE_ONLY)
+       } /* end of \blah */
+
+       if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
+           if (!SIZE_ONLY && !need_class)
                 ANYOF_CLASS_ZERO(ret);
+
             need_class = 1;
-           if (range) { /* a-\d, a-[:digit:] */
+
+           /* a bad range like a-\d, a-[:digit:] ? */
+           if (range) {
                 if (!SIZE_ONLY) {
                     if (ckWARN(WARN_REGEXP))
                         vWARN4(RExC_parse,
@@ -3292,11 +3319,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                RExC_parse - rangebegin,
                                RExC_parse - rangebegin,
                                rangebegin);
-                   ANYOF_BITMAP_SET(ret, lastvalue);
-                   ANYOF_BITMAP_SET(ret, '-');
+                   if (lastvalue < 256) {
+                       ANYOF_BITMAP_SET(ret, lastvalue);
+                       ANYOF_BITMAP_SET(ret, '-');
+                   }
+                   else {
+                       ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+                       Perl_sv_catpvf(aTHX_ listsv,
+                                      /* 0x002D is Unicode for '-' */
+                                      "%04"UVxf"\n002D\n", (UV)lastvalue);
+                   }
                 }
-               range = 0; /* this is not a true range */
+
+               range = 0; /* this was not a true range */
             }
+
             if (!SIZE_ONLY) {
                 switch (namedclass) {
                 case ANYOF_ALNUM:
@@ -3307,6 +3344,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isALNUM(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
                     break;
                 case ANYOF_NALNUM:
                     if (LOC)
@@ -3316,42 +3354,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isALNUM(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
                     break;
-               case ANYOF_SPACE:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_SPACE);
-                   else {
-                       for (value = 0; value < 256; value++)
-                           if (isSPACE(value))
-                               ANYOF_BITMAP_SET(ret, value);
-                   }
-                   break;
-               case ANYOF_NSPACE:
+               case ANYOF_ALNUMC:
                     if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+                       ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
                     else {
                         for (value = 0; value < 256; value++)
-                           if (!isSPACE(value))
+                           if (isALNUMC(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
-                   break;
-               case ANYOF_DIGIT:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
-                   else {
-                       for (value = '0'; value <= '9'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
-                   }
-                   break;
-               case ANYOF_NDIGIT:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
-                   else {
-                       for (value = 0; value < '0'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
-                       for (value = '9' + 1; value < 256; value++)
-                           ANYOF_BITMAP_SET(ret, value);
-                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
                     break;
                 case ANYOF_NALNUMC:
                     if (LOC)
@@ -3361,15 +3374,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isALNUMC(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
-                   break;
-               case ANYOF_ALNUMC:
-                   if (LOC)
-                       ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
-                   else {
-                       for (value = 0; value < 256; value++)
-                           if (isALNUMC(value))
-                               ANYOF_BITMAP_SET(ret, value);
-                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
                     break;
                 case ANYOF_ALPHA:
                     if (LOC)
@@ -3379,6 +3384,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isALPHA(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
                     break;
                 case ANYOF_NALPHA:
                     if (LOC)
@@ -3388,6 +3394,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isALPHA(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
                     break;
                 case ANYOF_ASCII:
                     if (LOC)
@@ -3402,6 +3409,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                 ANYOF_BITMAP_SET(ret, value);
  #endif /* EBCDIC */
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
                     break;
                 case ANYOF_NASCII:
                     if (LOC)
@@ -3416,6 +3424,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                                 ANYOF_BITMAP_SET(ret, value);
  #endif /* EBCDIC */
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
                     break;
                 case ANYOF_BLANK:
                     if (LOC)
@@ -3425,6 +3434,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isBLANK(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
                     break;
                 case ANYOF_NBLANK:
                     if (LOC)
@@ -3434,6 +3444,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isBLANK(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
                     break;
                 case ANYOF_CNTRL:
                     if (LOC)
@@ -3443,7 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isCNTRL(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
-                   lastvalue = OOB_CHAR8;
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
                     break;
                 case ANYOF_NCNTRL:
                     if (LOC)
@@ -3453,6 +3464,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isCNTRL(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
+                   break;
+               case ANYOF_DIGIT:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+                   else {
+                       /* consecutive digits assumed */
+                       for (value = '0'; value <= '9'; value++)
+                           ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
+                   break;
+               case ANYOF_NDIGIT:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+                   else {
+                       /* consecutive digits assumed */
+                       for (value = 0; value < '0'; value++)
+                           ANYOF_BITMAP_SET(ret, value);
+                       for (value = '9' + 1; value < 256; value++)
+                           ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
                     break;
                 case ANYOF_GRAPH:
                     if (LOC)
@@ -3462,6 +3496,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isGRAPH(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
                     break;
                 case ANYOF_NGRAPH:
                     if (LOC)
@@ -3471,6 +3506,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isGRAPH(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
                     break;
                 case ANYOF_LOWER:
                     if (LOC)
@@ -3480,6 +3516,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isLOWER(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
                     break;
                 case ANYOF_NLOWER:
                     if (LOC)
@@ -3489,6 +3526,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isLOWER(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
                     break;
                 case ANYOF_PRINT:
                     if (LOC)
@@ -3498,6 +3536,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isPRINT(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
                     break;
                 case ANYOF_NPRINT:
                     if (LOC)
@@ -3507,6 +3546,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isPRINT(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
                     break;
                 case ANYOF_PSXSPC:
                     if (LOC)
@@ -3516,6 +3556,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isPSXSPC(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
                     break;
                 case ANYOF_NPSXSPC:
                     if (LOC)
@@ -3525,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isPSXSPC(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
                     break;
                 case ANYOF_PUNCT:
                     if (LOC)
@@ -3534,6 +3576,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isPUNCT(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
                     break;
                 case ANYOF_NPUNCT:
                     if (LOC)
@@ -3543,6 +3586,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isPUNCT(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
+                   break;
+               case ANYOF_SPACE:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_SPACE);
+                   else {
+                       for (value = 0; value < 256; value++)
+                           if (isSPACE(value))
+                               ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
+                   break;
+               case ANYOF_NSPACE:
+                   if (LOC)
+                       ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+                   else {
+                       for (value = 0; value < 256; value++)
+                           if (!isSPACE(value))
+                               ANYOF_BITMAP_SET(ret, value);
+                   }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
                     break;
                 case ANYOF_UPPER:
                     if (LOC)
@@ -3552,6 +3616,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isUPPER(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
                     break;
                 case ANYOF_NUPPER:
                     if (LOC)
@@ -3561,6 +3626,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isUPPER(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
                     break;
                 case ANYOF_XDIGIT:
                     if (LOC)
@@ -3570,6 +3636,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (isXDIGIT(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
                     break;
                 case ANYOF_NXDIGIT:
                     if (LOC)
@@ -3579,6 +3646,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                             if (!isXDIGIT(value))
                                 ANYOF_BITMAP_SET(ret, value);
                     }
+                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
                     break;
                 default:
                     vFAIL("Invalid [::] class");
@@ -3588,7 +3656,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
                 continue;
             }
-       }
+       } /* end of namedclass \blah */
+
         if (range) {
             if (lastvalue > value) /* b-a */ {
                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
@@ -3596,14 +3665,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                               RExC_parse - rangebegin,
                               rangebegin);
             }
-           range = 0;
+           range = 0; /* not a true range */
         }
         else {
-           lastvalue = value;
+           lastvalue = value; /* save the beginning of the range */
             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
                 RExC_parse[1] != ']') {
                 RExC_parse++;
-               if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
+
+               /* a bad range like \w-, [:word:]- ? */
+               if (namedclass > OOB_NAMEDCLASS) {
                     if (ckWARN(WARN_REGEXP))
                         vWARN4(RExC_parse,
                                "False [] range \"%*.*s\"",
@@ -3613,325 +3684,89 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                     if (!SIZE_ONLY)
                         ANYOF_BITMAP_SET(ret, '-');
                 } else
-                   range = 1;
-               continue;       /* do it next time */
+                   range = 1;  /* yeah, it's a range! */
+               continue;       /* but do it the next time */
             }
         }
+
         /* now is the next time */
         if (!SIZE_ONLY) {
+           if (lastvalue < 256 && value < 256) {
  #ifndef ASCIIish /* EBCDIC, for example. */
-           if ((isLOWER(lastvalue) && isLOWER(value)) ||
-               (isUPPER(lastvalue) && isUPPER(value)))
-           {
-               I32 i;
-               if (isLOWER(lastvalue)) {
-                   for (i = lastvalue; i <= value; i++)
-                       if (isLOWER(i))
-                           ANYOF_BITMAP_SET(ret, i);
-               } else {
-                   for (i = lastvalue; i <= value; i++)
-                       if (isUPPER(i))
-                           ANYOF_BITMAP_SET(ret, i);
+               if ((isLOWER(lastvalue) && isLOWER(value)) ||
+                   (isUPPER(lastvalue) && isUPPER(value)))
+               {
+                   IV i;
+                   if (isLOWER(lastvalue)) {
+                       for (i = lastvalue; i <= value; i++)
+                           if (isLOWER(i))
+                               ANYOF_BITMAP_SET(ret, i);
+                   } else {
+                       for (i = lastvalue; i <= value; i++)
+                           if (isUPPER(i))
+                               ANYOF_BITMAP_SET(ret, i);
+                   }
                 }
-           }
-           else
+               else
  #endif
-               for ( ; lastvalue <= value; lastvalue++)
-                   ANYOF_BITMAP_SET(ret, lastvalue);
+                   for ( ; lastvalue <= value; lastvalue++)
+                       ANYOF_BITMAP_SET(ret, lastvalue);
+           } else {
+               ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+               if (lastvalue < value)
+                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+                                  (UV)lastvalue, (UV)value);
+               else
+                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                  (UV)value);
+           }
          }
-       range = 0;
+
+       range = 0; /* this range (if it was one) is done now */
      }
+
      if (need_class) {
         if (SIZE_ONLY)
             RExC_size += ANYOF_CLASS_ADD_SKIP;
         else
             RExC_emit += ANYOF_CLASS_ADD_SKIP;
      }
+
      /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
      if (!SIZE_ONLY &&
-       (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+       (ANYOF_FLAGS(ret) &
+        /* If the only flag is folding (plus possibly inversion). */
+        (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
         for (value = 0; value < 256; ++value) {
             if (ANYOF_BITMAP_TEST(ret, value)) {
-               I32 cf = PL_fold[value];
-               ANYOF_BITMAP_SET(ret, cf);
+               IV fold = PL_fold[value];
+
+               if (fold != value)
+                   ANYOF_BITMAP_SET(ret, fold);
             }
         }
         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
      }
+
      /* optimize inverted simple patterns (e.g. [^a-z]) */
-    if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+    if (!SIZE_ONLY &&
+       /* If the only flag is inversion. */
+       (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
         ANYOF_FLAGS(ret) = 0;
      }
-    return ret;
-}
  
-STATIC regnode *
-S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
-{
-    register char *e;
-    register U32 value;
-    register U32 lastvalue = OOB_UTF8;
-    register I32 range = 0;
-    register regnode *ret;
-    STRLEN numlen;
-    I32 n;
-    SV *listsv;
-    U8 flags = 0;
-    I32 namedclass;
-    char *rangebegin;
-
-    if (*RExC_parse == '^') {  /* Complement of range. */
-       RExC_naughty++;
-       RExC_parse++;
-       if (!SIZE_ONLY)
-           flags |= ANYOF_INVERT;
-    }
-    if (!SIZE_ONLY) {
-       if (FOLD)
-           flags |= ANYOF_FOLD;
-       if (LOC)
-           flags |= ANYOF_LOCALE;
-       listsv = newSVpvn("# comment\n", 10);
-    }
-
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
-       checkposixcc(pRExC_state);
-
-    if (*RExC_parse == ']' || *RExC_parse == '-')
-       goto skipcond;          /* allow 1st char to be ] or - */
-
-    while (RExC_parse < RExC_end && *RExC_parse != ']') {
-       skipcond:
-       namedclass = OOB_NAMEDCLASS;
-       if (!range)
-           rangebegin = RExC_parse;
-       value = utf8_to_uv((U8*)RExC_parse,
-                              RExC_end - RExC_parse,
-                              &numlen, 0);
-       RExC_parse += numlen;
-       if (value == '[')
-           namedclass = regpposixcc(pRExC_state, value);
-       else if (value == '\\') {
-           value = (U32)utf8_to_uv((U8*)RExC_parse,
-                                       RExC_end - RExC_parse,
-                                       &numlen, 0);
-           RExC_parse += numlen;
-           /* Some compilers cannot handle switching on 64-bit integer
-            * values, therefore value cannot be an UV.  Yes, this will
-            * be a problem later if we want switch on Unicode.  --jhi */
-           switch (value) {
-           case 'w':           namedclass = ANYOF_ALNUM;               break;
-           case 'W':           namedclass = ANYOF_NALNUM;              break;
-           case 's':           namedclass = ANYOF_SPACE;               break;
-           case 'S':           namedclass = ANYOF_NSPACE;              break;
-           case 'd':           namedclass = ANYOF_DIGIT;               break;
-           case 'D':           namedclass = ANYOF_NDIGIT;              break;
-           case 'p':
-           case 'P':
-               if (*RExC_parse == '{') {
-                   e = strchr(RExC_parse++, '}');
-                    if (!e)
-                        vFAIL("Missing right brace on \\p{}");
-                   n = e - RExC_parse;
-               }
-               else {
-                   e = RExC_parse;
-                   n = 1;
-               }
-               if (!SIZE_ONLY) {
-                   if (value == 'p')
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      "+utf8::%.*s\n", (int)n, RExC_parse);
-                   else
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      "!utf8::%.*s\n", (int)n, RExC_parse);
-               }
-               RExC_parse = e + 1;
-               lastvalue = OOB_UTF8;
-               continue;
-           case 'n':           value = '\n';           break;
-           case 'r':           value = '\r';           break;
-           case 't':           value = '\t';           break;
-           case 'f':           value = '\f';           break;
-           case 'b':           value = '\b';           break;
-#ifdef ASCIIish
-           case 'e':           value = '\033';         break;
-           case 'a':           value = '\007';         break;
-#else
-           case 'e':           value = '\047';         break;
-           case 'a':           value = '\057';         break;
-#endif
-           case 'x':
-               if (*RExC_parse == '{') {
-                   e = strchr(RExC_parse++, '}');
-                    if (!e) 
-                        vFAIL("Missing right brace on \\x{}");
-                   numlen = 1;         /* allow underscores */
-                   value = (UV)scan_hex(RExC_parse,
-                                    e - RExC_parse,
-                                    &numlen);
-                   RExC_parse = e + 1;
-               }
-               else {
-                   numlen = 0;         /* disallow underscores */
-                   value = (UV)scan_hex(RExC_parse, 2, &numlen);
-                   RExC_parse += numlen;
-               }
-               break;
-           case 'c':
-               value = UCHARAT(RExC_parse++);
-               value = toCTRL(value);
-               break;
-           case '0': case '1': case '2': case '3': case '4':
-           case '5': case '6': case '7': case '8': case '9':
-               numlen = 0;             /* disallow underscores */
-               value = (UV)scan_oct(--RExC_parse, 3, &numlen);
-               RExC_parse += numlen;
-               break;
-           default:
-               if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-                   vWARN2(RExC_parse,
-                          "Unrecognized escape \\%c in character class passed through",
-                          (int)value);
-               break;
-           }
-       }
-       if (namedclass > OOB_NAMEDCLASS) {
-           if (range) { /* a-\d, a-[:digit:] */
-               if (!SIZE_ONLY) {
-                   if (ckWARN(WARN_REGEXP))
-                       vWARN4(RExC_parse,
-                              "False [] range \"%*.*s\"",
-                              RExC_parse - rangebegin,
-                              RExC_parse - rangebegin,
-                              rangebegin);
-                   Perl_sv_catpvf(aTHX_ listsv,
-                                  /* 0x002D is Unicode for '-' */
-                                  "%04"UVxf"\n002D\n", (UV)lastvalue);
-               }
-               range = 0;
-           }
-           if (!SIZE_ONLY) {
-               switch (namedclass) {
-               case ANYOF_ALNUM:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    break;
-               case ANYOF_NALNUM:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");    break;
-               case ANYOF_ALNUMC:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");   break;
-               case ANYOF_NALNUMC:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");   break;
-               case ANYOF_ALPHA:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");   break;
-               case ANYOF_NALPHA:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");   break;
-               case ANYOF_ASCII:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");   break;
-               case ANYOF_NASCII:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");   break;
-               case ANYOF_CNTRL:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");   break;
-               case ANYOF_NCNTRL:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");   break;
-               case ANYOF_GRAPH:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");   break;
-               case ANYOF_NGRAPH:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");   break;
-               case ANYOF_DIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");   break;
-               case ANYOF_NDIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");   break;
-               case ANYOF_LOWER:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");   break;
-               case ANYOF_NLOWER:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");   break;
-               case ANYOF_PRINT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");   break;
-               case ANYOF_NPRINT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");   break;
-               case ANYOF_PUNCT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");   break;
-               case ANYOF_NPUNCT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");   break;
-               case ANYOF_SPACE:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break;
-               case ANYOF_NSPACE:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break;
-               case ANYOF_BLANK:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");   break;
-               case ANYOF_NBLANK:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");   break;
-               case ANYOF_PSXSPC:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");   break;
-               case ANYOF_NPSXSPC:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");   break;
-               case ANYOF_UPPER:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");   break;
-               case ANYOF_NUPPER:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");   break;
-               case ANYOF_XDIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");  break;
-               case ANYOF_NXDIGIT:
-                   Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");  break;
-               }
-               continue;
-           }
-       }
-        if (range) {
-           if (lastvalue > value) { /* b-a */
-               Simple_vFAIL4("Invalid [] range \"%*.*s\"",
-                             RExC_parse - rangebegin,
-                             RExC_parse - rangebegin,
-                             rangebegin);
-           }
-           range = 0;
-       }
-       else {
-           lastvalue = value;
-           if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
-               RExC_parse[1] != ']') {
-               RExC_parse++;
-               if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
-                   if (ckWARN(WARN_REGEXP))
-                       vWARN4(RExC_parse,
-                              "False [] range \"%*.*s\"",
-                              RExC_parse - rangebegin,
-                              RExC_parse - rangebegin,
-                              rangebegin);
-                   if (!SIZE_ONLY)
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      /* 0x002D is Unicode for '-' */
-                                      "002D\n");
-               } else
-                   range = 1;
-               continue;       /* do it next time */
-           }
-       }
-       /* now is the next time */
-       if (!SIZE_ONLY)
-           Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-                          (UV)lastvalue, (UV)value);
-       range = 0;
-    }
-
-    ret = reganode(pRExC_state, ANYOFUTF8, 0);
-
-    if (!SIZE_ONLY) {
-       SV *rv = swash_init("utf8", "", listsv, 1, 0);
-#ifdef DEBUGGING
+    if (!SIZE_ONLY) { 
         AV *av = newAV();
-       av_push(av, rv);
-       av_push(av, listsv);
-       rv = newRV_inc((SV*)av);
-#else
-       SvREFCNT_dec(listsv);
-#endif
+       SV *rv;
+
+       av_store(av, 0, listsv);
+       av_store(av, 1, NULL);
+       rv = newRV_noinc((SV*)av);
         n = add_data(pRExC_state, 1, "s");
         RExC_rx->data->data[n] = (void*)rv;
-       ARG1_SET(ret, flags);
-       ARG2_SET(ret, n);
+       ARG_SET(ret, n);
      }
  
      return ret;
@@ -4269,7 +4104,7 @@ Perl_regdump(pTHX_ regexp *r)
  STATIC void
  S_put_byte(pTHX_ SV *sv, int c)
  {
-    if (isCNTRL(c) || c == 127 || c == 255)
+    if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
      else if (c == '-' || c == ']' || c == '\\' || c == '^')
         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
@@ -4311,8 +4146,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
      else if (k == ANYOF) {
         int i, rangestart = -1;
-       bool anyofutf8 = OP(o) == ANYOFUTF8;
-       U8 flags = anyofutf8 ? ARG1(o) : o->flags;
+       U8 flags = ANYOF_FLAGS(o);
         const char * const anyofs[] = { /* Should be syncronized with
                                          * ANYOF_ #xdefines in regcomp.h */
             "\\w",
@@ -4354,78 +4188,93 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
         if (flags & ANYOF_INVERT)
             sv_catpv(sv, "^");
-       if (OP(o) == ANYOF) {
-           for (i = 0; i <= 256; i++) {
-               if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
-                   if (rangestart == -1)
-                       rangestart = i;
-               } else if (rangestart != -1) {
-                   if (i <= rangestart + 3)
-                       for (; rangestart < i; rangestart++)
-                           put_byte(sv, rangestart);
-                   else {
+       for (i = 0; i <= 256; i++) {
+           if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
+               if (rangestart == -1)
+                   rangestart = i;
+           } else if (rangestart != -1) {
+               if (i <= rangestart + 3)
+                   for (; rangestart < i; rangestart++)
                         put_byte(sv, rangestart);
-                       sv_catpv(sv, "-");
-                       put_byte(sv, i - 1);
-                   }
-                   rangestart = -1;
+               else {
+                   put_byte(sv, rangestart);
+                   sv_catpv(sv, "-");
+                   put_byte(sv, i - 1);
                 }
+               rangestart = -1;
             }
-           if (o->flags & ANYOF_CLASS)
-               for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
-                   if (ANYOF_CLASS_TEST(o,i))
-                       sv_catpv(sv, anyofs[i]);
         }
-       else {
-           SV *rv = (SV*)PL_regdata->data[ARG2(o)];
-           AV *av = (AV*)SvRV((SV*)rv);
-           SV *sw = *av_fetch(av, 0, FALSE);
-           SV *lv = *av_fetch(av, 1, FALSE);
-           UV i;
-           U8 s[UTF8_MAXLEN+1];
-           for (i = 0; i <= 256; i++) { /* just the first 256 */
-               U8 *e = uv_to_utf8(s, i);
-               if (i < 256 && swash_fetch(sw, s)) {
-                   if (rangestart == -1)
-                       rangestart = i;
-               } else if (rangestart != -1) {
-                   U8 *p;
-
-                   if (i <= rangestart + 3)
-                       for (; rangestart < i; rangestart++) {
-                           for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
-                               put_byte(sv, *p);
+
+       if (o->flags & ANYOF_CLASS)
+           for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
+               if (ANYOF_CLASS_TEST(o,i))
+                   sv_catpv(sv, anyofs[i]);
+
+       if (flags & ANYOF_UNICODE)
+           sv_catpv(sv, "{unicode}");
+
+       {
+           SV *lv;
+           SV *sw = regclass_swash(o, FALSE, &lv);
+           
+           if (lv) {
+               if (sw) {
+                   UV i;
+                   U8 s[UTF8_MAXLEN+1];
+                   
+                   for (i = 0; i <= 256; i++) { /* just the first 256 */
+                       U8 *e = uv_to_utf8(s, i);
+                       
+                       if (i < 256 && swash_fetch(sw, s)) {
+                           if (rangestart == -1)
+                               rangestart = i;
+                       } else if (rangestart != -1) {
+                           U8 *p;
+                           
+                           if (i <= rangestart + 3)
+                               for (; rangestart < i; rangestart++) {
+                                   for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+                                       put_byte(sv, *p);
+                               }
+                           else {
+                               for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+                                   put_byte(sv, *p);
+                               sv_catpv(sv, "-");
+                                   for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
+                                       put_byte(sv, *p);
+                               }
+                               rangestart = -1;
+                           }
                         }
-                   else {
-                       for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
-                           put_byte(sv, *p);
-                       sv_catpv(sv, "-");
-                       for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
-                           put_byte(sv, *p);
-                   }
-                   rangestart = -1;
+                       
+                   sv_catpv(sv, "..."); /* et cetera */
                 }
-           }
-           sv_catpv(sv, "...");
-           {
-               char *s = savepv(SvPVX(lv));
-
-               while(*s && *s != '\n') s++;
-               if (*s == '\n') {
-                   char *t = ++s;
  
-                   while (*s) {
-                       if (*s == '\n')
-                           *s = ' ';
-                       s++;
+               {
+                   char *s = savepv(SvPVX(lv));
+                   char *origs = s;
+                   
+                   while(*s && *s != '\n') s++;
+                   
+                   if (*s == '\n') {
+                       char *t = ++s;
+                       
+                       while (*s) {
+                           if (*s == '\n')
+                               *s = ' ';
+                           s++;
+                       }
+                       if (s[-1] == ' ')
+                           s[-1] = 0;
+                       
+                       sv_catpv(sv, t);
                     }
-                   if (s[-1] == ' ')
-                       s[-1] = 0;
-
-                   sv_catpv(sv, t);
+                   
+                   Safefree(origs);
                 }
             }
         }
+
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
      }
      else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
@@ -4486,16 +4335,6 @@ Perl_pregfree(pTHX_ struct regexp *r)
         while (--n >= 0) {
             switch (r->data->what[n]) {
             case 's':
-#ifdef DEBUGGING
-               {
-                   SV *rv = (SV*)r->data->data[n];
-                   AV *av = (AV*)SvRV((SV*)rv);
-                   SV *sw = *av_fetch(av, 0, FALSE);
-                   SV *lv = *av_fetch(av, 1, FALSE);
-                   SvREFCNT_dec(sw);
-                   SvREFCNT_dec(lv);
-               }
-#endif
                 SvREFCNT_dec((SV*)r->data->data[n]);
                 break;
             case 'f':
@@ -4657,4 +4496,3 @@ clear_re(pTHXo_ void *r)
  {
      ReREFCNT_dec((regexp *)r);
  }
-
diff --git a/regcomp.h b/regcomp.h

index 284cf2f..c8094e1 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -88,12 +88,13 @@ struct regnode_2 {
  };
  
  #define ANYOF_BITMAP_SIZE      32      /* 256 b/(8 b/B) */
-#define ANYOF_CLASSBITMAP_SIZE  4
+#define ANYOF_CLASSBITMAP_SIZE  4      /* up to 32 (8*4) named classes */
  
  struct regnode_charclass {
      U8 flags;
      U8  type;
      U16 next_off;
+    U32 arg1;
      char bitmap[ANYOF_BITMAP_SIZE];
  };
  
@@ -101,6 +102,7 @@ struct regnode_charclass_class {
      U8 flags;
      U8  type;
      U16 next_off;
+    U32 arg1;
      char bitmap[ANYOF_BITMAP_SIZE];
      char classflags[ANYOF_CLASSBITMAP_SIZE];
  };
@@ -180,13 +182,21 @@ struct regnode_charclass_class {
  
  /* Flags for node->flags of ANYOF */
  
-#define ANYOF_CLASS    0x08
-#define ANYOF_INVERT   0x04
-#define ANYOF_FOLD     0x02
-#define ANYOF_LOCALE   0x01
+#define ANYOF_CLASS            0x08
+#define ANYOF_INVERT           0x04
+#define ANYOF_FOLD             0x02
+#define ANYOF_LOCALE           0x01
  
  /* Used for regstclass only */
-#define ANYOF_EOS      0x10            /* Can match an empty string too */
+#define ANYOF_EOS              0x10            /* Can match an empty string too */
+
+/* There is a character or a range past 0xff */
+#define ANYOF_UNICODE          0x20
+
+/* Are there any runtime flags on in this node? */
+#define ANYOF_RUNTIME(s)       (ANYOF_FLAGS(s) & 0x0f)
+
+#define ANYOF_FLAGS_ALL                0xff
  
  /* Character classes for node->classflags of ANYOF */
  /* Should be synchronized with a table in regprop() */
@@ -220,7 +230,7 @@ struct regnode_charclass_class {
  #define ANYOF_NXDIGIT  25
  #define ANYOF_PSXSPC   26      /* POSIX space: \s plus the vertical tab */
  #define ANYOF_NPSXSPC  27
-#define ANYOF_BLANK    28      /* GNU extension: space and tab */
+#define ANYOF_BLANK    28      /* GNU extension: space and tab: non-vertical space */
  #define ANYOF_NBLANK   29
  
  #define ANYOF_MAX      32
@@ -238,7 +248,6 @@ struct regnode_charclass_class {
  #define ANYOF_CLASS_SIZE       (sizeof(struct regnode_charclass_class))
  
  #define ANYOF_FLAGS(p)         ((p)->flags)
-#define ANYOF_FLAGS_ALL                0xff
  
  #define ANYOF_BIT(c)           (1 << ((c) & 7))
  
@@ -300,12 +309,14 @@ EXTCONST U8 PL_varies[] = {
  EXTCONST U8 PL_simple[];
  #else
  EXTCONST U8 PL_simple[] = {
-    REG_ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8,
-    ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8,
-    NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8,
-    SPACE, SPACEUTF8, SPACEL, SPACELUTF8,
-    NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8,
-    DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0
+    REG_ANY,   SANY,
+    ANYOF,
+    ALNUM,     ALNUML,
+    NALNUM,    NALNUML,
+    SPACE,     SPACEL,
+    NSPACE,    NSPACEL,
+    DIGIT,     NDIGIT,
+    0
  };
  #endif
  
diff --git a/regcomp.sym b/regcomp.sym

index bb5f8f8..59284f4 100644 (file)
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -16,46 +16,27 @@ EOL         EOL,    no      Match "" at end of line.
  MEOL           EOL,    no      Same, assuming multiline.
  SEOL           EOL,    no      Same, assuming singleline.
  BOUND          BOUND,  no      Match "" at any word boundary
-BOUNDUTF8      BOUND,  no      Match "" at any word boundary
  BOUNDL         BOUND,  no      Match "" at any word boundary
-BOUNDLUTF8     BOUND,  no      Match "" at any word boundary
  NBOUND         NBOUND, no      Match "" at any word non-boundary
-NBOUNDUTF8     NBOUND, no      Match "" at any word non-boundary
  NBOUNDL                NBOUND, no      Match "" at any word non-boundary
-NBOUNDLUTF8    NBOUND, no      Match "" at any word non-boundary
  GPOS           GPOS,   no      Matches where last m//g left off.
  
  # [Special] alternatives
  REG_ANY                REG_ANY,    no  Match any one character (except newline).
-ANYUTF8                REG_ANY,    no  Match any one Unicode character (except newline).
  SANY           REG_ANY,    no  Match any one character.
-SANYUTF8       REG_ANY,    no  Match any one Unicode character.
  ANYOF          ANYOF,  sv      Match character in (or not in) this class.
-ANYOFUTF8      ANYOF,  sv 1    Match character in (or not in) this class.
  ALNUM          ALNUM,  no      Match any alphanumeric character
-ALNUMUTF8      ALNUM,  no      Match any alphanumeric character in utf8
  ALNUML         ALNUM,  no      Match any alphanumeric char in locale
-ALNUMLUTF8     ALNUM,  no      Match any alphanumeric char in locale+utf8
  NALNUM         NALNUM, no      Match any non-alphanumeric character
-NALNUMUTF8     NALNUM, no      Match any non-alphanumeric character in utf8
  NALNUML                NALNUM, no      Match any non-alphanumeric char in locale
-NALNUMLUTF8    NALNUM, no      Match any non-alphanumeric char in locale+utf8
  SPACE          SPACE,  no      Match any whitespace character
-SPACEUTF8      SPACE,  no      Match any whitespace character in utf8
  SPACEL         SPACE,  no      Match any whitespace char in locale
-SPACELUTF8     SPACE,  no      Match any whitespace char in locale+utf8
  NSPACE         NSPACE, no      Match any non-whitespace character
-NSPACEUTF8     NSPACE, no      Match any non-whitespace character in utf8
  NSPACEL                NSPACE, no      Match any non-whitespace char in locale
-NSPACELUTF8    NSPACE, no      Match any non-whitespace char in locale+utf8
  DIGIT          DIGIT,  no      Match any numeric character
-DIGITUTF8      DIGIT,  no      Match any numeric character in utf8
  DIGITL         DIGIT,  no      Match any numeric character in locale
-DIGITLUTF8     DIGIT,  no      Match any numeric character in locale+utf8
  NDIGIT         NDIGIT, no      Match any non-numeric character
-NDIGITUTF8     NDIGIT, no      Match any non-numeric character in utf8
  NDIGITL                NDIGIT, no      Match any non-numeric character in locale
-NDIGITLUTF8    NDIGIT, no      Match any non-numeric character in locale+utf8
  CLUMP          CLUMP,  no      Match any combining character sequence
  
  # BRANCH       The set of branches constituting a single choice are hooked
diff --git a/regexec.c b/regexec.c

index 5e821ba..ac91bea 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -105,13 +105,6 @@
   * Forwards.
   */
  
-#define REGINCLASS(p,c)  (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
-#ifdef DEBUGGING
-#   define REGINCLASSUTF8(f,p)  (ARG1(f) ? reginclassutf8(f,p) : swash_fetch(*av_fetch((AV*)SvRV((SV*)PL_regdata->data[ARG2(f)]),0,FALSE),p))
-#else
-#   define REGINCLASSUTF8(f,p)  (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
-#endif
-
  #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
  #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
  
@@ -738,7 +731,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
  
         t = s;
         if (prog->reganch & ROPT_UTF8) {        
-           PL_regdata = prog->data;    /* Used by REGINCLASS UTF logic */
+           PL_regdata = prog->data;
             PL_bostr = startpos;
         }
          s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
@@ -840,25 +833,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
         unsigned int c2;
         char *e;
         register I32 tmp = 1;   /* Scratch variable? */
+       register bool do_utf8 = DO_UTF8(PL_reg_sv);
  
         /* We know what class it must start with. */
         switch (OP(c)) {
-       case ANYOFUTF8:
-           while (s < strend) {
-               if (REGINCLASSUTF8(c, (U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
-                   else
-                       tmp = doevery;
-               }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
-           }
-           break;
         case ANYOF:
             while (s < strend) {
-               if (REGINCLASS(c, *(U8*)s)) {
+               if (reginclass(c, (U8*)s, do_utf8)) {
                     if (tmp && (norun || regtry(prog, s)))
                         goto got_it;
                     else
@@ -866,7 +847,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                 }
                 else
                     tmp = 1;
-               s++;
+               s += do_utf8 ? UTF8SKIP(s) : 1;
             }
             break;
         case EXACTF:
@@ -912,42 +893,40 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
             PL_reg_flags |= RF_tainted;
             /* FALL THROUGH */
         case BOUND:
-           tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
-           tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
-                   tmp = !tmp;
-                   if ((norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               if (s == startpos)
+                   tmp = '\n';
+               else {
+                   U8 *r = reghop((U8*)s, -1);
+                   
+                   tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+               }
+               tmp = ((OP(c) == BOUND ?
+                       isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp == !(OP(c) == BOUND ?
+                                swash_fetch(PL_utf8_alnum, (U8*)s) :
+                                isALNUM_LC_utf8((U8*)s)))
+                   {
+                       tmp = !tmp;
+                       if ((norun || regtry(prog, s)))
+                           goto got_it;
+                   }
+                   s += UTF8SKIP(s);
                 }
-               s++;
             }
-           if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
-               goto got_it;
-           break;
-       case BOUNDLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case BOUNDUTF8:
-           if (s == startpos)
-               tmp = '\n';
             else {
-               U8 *r = reghop((U8*)s, -1);
-
-               tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
-           }
-           tmp = ((OP(c) == BOUNDUTF8 ?
-                   isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == BOUNDUTF8 ?
-                            swash_fetch(PL_utf8_alnum, (U8*)s) :
-                            isALNUM_LC_utf8((U8*)s)))
-               {
-                   tmp = !tmp;
-                   if ((norun || regtry(prog, s)))
-                       goto got_it;
+               tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+               tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp ==
+                       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+                       tmp = !tmp;
+                       if ((norun || regtry(prog, s)))
+                           goto got_it;
+                   }
+                   s++;
                 }
-               s += UTF8SKIP(s);
             }
             if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
                 goto got_it;
@@ -956,365 +935,382 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
             PL_reg_flags |= RF_tainted;
             /* FALL THROUGH */
         case NBOUND:
-           tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
-           tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
-                   tmp = !tmp;
-               else if ((norun || regtry(prog, s)))
-                   goto got_it;
-               s++;
+           if (do_utf8) {
+               if (s == startpos)
+                   tmp = '\n';
+               else {
+                   U8 *r = reghop((U8*)s, -1);
+                   
+                   tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+               }
+               tmp = ((OP(c) == NBOUND ?
+                       isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp == !(OP(c) == NBOUND ?
+                                swash_fetch(PL_utf8_alnum, (U8*)s) :
+                                isALNUM_LC_utf8((U8*)s)))
+                       tmp = !tmp;
+                   else if ((norun || regtry(prog, s)))
+                       goto got_it;
+                   s += UTF8SKIP(s);
+               }
             }
-           if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
-               goto got_it;
-           break;
-       case NBOUNDLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NBOUNDUTF8:
-           if (s == startpos)
-               tmp = '\n';
             else {
-               U8 *r = reghop((U8*)s, -1);
-
-               tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
-           }
-           tmp = ((OP(c) == NBOUNDUTF8 ?
-                   isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
-           while (s < strend) {
-               if (tmp == !(OP(c) == NBOUNDUTF8 ?
-                            swash_fetch(PL_utf8_alnum, (U8*)s) :
-                            isALNUM_LC_utf8((U8*)s)))
-                   tmp = !tmp;
-               else if ((norun || regtry(prog, s)))
-                   goto got_it;
-               s += UTF8SKIP(s);
+               tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+               tmp = ((OP(c) == NBOUND ?
+                       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+               while (s < strend) {
+                   if (tmp ==
+                       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+                       tmp = !tmp;
+                   else if ((norun || regtry(prog, s)))
+                       goto got_it;
+                   s++;
+               }
             }
             if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
                 goto got_it;
             break;
         case ALNUM:
-           while (s < strend) {
-               if (isALNUM(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case ALNUMUTF8:
-           while (s < strend) {
-               if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isALNUM(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case ALNUML:
             PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isALNUM_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (isALNUM_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case ALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isALNUM_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isALNUM_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case NALNUM:
-           while (s < strend) {
-               if (!isALNUM(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case NALNUMUTF8:
-           while (s < strend) {
-               if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isALNUM(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case NALNUML:
             PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isALNUM_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!isALNUM_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case NALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isALNUM_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isALNUM_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case SPACE:
-           while (s < strend) {
-               if (isSPACE(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case SPACEUTF8:
-           while (s < strend) {
-               if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isSPACE(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case SPACEL:
             PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isSPACE_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case SPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isSPACE_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case NSPACE:
-           while (s < strend) {
-               if (!isSPACE(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case NSPACEUTF8:
-           while (s < strend) {
-               if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isSPACE(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case NSPACEL:
             PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isSPACE_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case NSPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isSPACE_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case DIGIT:
-           while (s < strend) {
-               if (isDIGIT(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (swash_fetch(PL_utf8_digit,(U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case DIGITUTF8:
-           while (s < strend) {
-               if (swash_fetch(PL_utf8_digit,(U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isDIGIT(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case DIGITL:
             PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isDIGIT_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (isDIGIT_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case DIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (isDIGIT_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (isDIGIT_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case NDIGIT:
-           while (s < strend) {
-               if (!isDIGIT(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case NDIGITUTF8:
-           while (s < strend) {
-               if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isDIGIT(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         case NDIGITL:
             PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isDIGIT_LC(*s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           if (do_utf8) {
+               while (s < strend) {
+                   if (!isDIGIT_LC_utf8((U8*)s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s += UTF8SKIP(s);
                 }
-               else
-                   tmp = 1;
-               s++;
             }
-           break;
-       case NDIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           while (s < strend) {
-               if (!isDIGIT_LC_utf8((U8*)s)) {
-                   if (tmp && (norun || regtry(prog, s)))
-                       goto got_it;
+           else {
+               while (s < strend) {
+                   if (!isDIGIT_LC(*s)) {
+                       if (tmp && (norun || regtry(prog, s)))
+                           goto got_it;
+                       else
+                           tmp = doevery;
+                   }
                     else
-                       tmp = doevery;
+                       tmp = 1;
+                   s++;
                 }
-               else
-                   tmp = 1;
-               s += UTF8SKIP(s);
             }
             break;
         default:
@@ -1606,6 +1602,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
         if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
             /* don't bother with what can't match */
             strend = HOPc(strend, -(minlen - 1));
+       DEBUG_r({
+           SV *prop = sv_newmortal();
+           regprop(prop, c);
+           PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
+       });
         if (find_byclass(prog, c, s, strend, startpos, 0))
             goto got_it;
         DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
@@ -1619,7 +1620,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                 last = screaminstr(sv, prog->float_substr, s - strbeg,
                                    end_shift, &scream_pos, 1); /* last one */
                 if (!last)
-                   last = scream_olds; /* Only one occurence. */
+                   last = scream_olds; /* Only one occurrence. */
             }
             else {
                 STRLEN len;
@@ -1891,6 +1892,7 @@ S_regmatch(pTHX_ regnode *prog)
      int minmod = 0, sw = 0, logical = 0;
      I32 unwind = 0;
      I32 firstcp = PL_savestack_ix;
+    register bool do_utf8 = DO_UTF8(PL_reg_sv);
  
  #ifdef DEBUGGING
      PL_regindent++;
@@ -2009,8 +2011,8 @@ S_regmatch(pTHX_ regnode *prog)
             if (PL_regeol != locinput)
                 sayNO;
             break;
-       case SANYUTF8:
-           if (nextchr & 0x80) {
+       case SANY:
+           if (DO_UTF8(PL_reg_sv)) {
                 locinput += PL_utf8skip[nextchr];
                 if (locinput > PL_regeol)
                     sayNO;
@@ -2021,13 +2023,8 @@ S_regmatch(pTHX_ regnode *prog)
                 sayNO;
             nextchr = UCHARAT(++locinput);
             break;
-       case SANY:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case ANYUTF8:
-           if (nextchr & 0x80) {
+       case REG_ANY:
+           if (DO_UTF8(PL_reg_sv)) {
                 locinput += PL_utf8skip[nextchr];
                 if (locinput > PL_regeol)
                     sayNO;
@@ -2038,11 +2035,6 @@ S_regmatch(pTHX_ regnode *prog)
                 sayNO;
             nextchr = UCHARAT(++locinput);
             break;
-       case REG_ANY:
-           if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
         case EXACT:
             s = STRING(scan);
             ln = STR_LEN(scan);
@@ -2099,22 +2091,24 @@ S_regmatch(pTHX_ regnode *prog)
             locinput += ln;
             nextchr = UCHARAT(locinput);
             break;
-       case ANYOFUTF8:
-           if (!REGINCLASSUTF8(scan, (U8*)locinput))
-               sayNO;
-           if (locinput >= PL_regeol)
-               sayNO;
-           locinput += PL_utf8skip[nextchr];
-           nextchr = UCHARAT(locinput);
-           break;
         case ANYOF:
-           if (nextchr < 0)
+           if (do_utf8) {
+               if (!reginclass(scan, (U8*)locinput, do_utf8))
+                   sayNO;
+               if (locinput >= PL_regeol)
+                   sayNO;
+               locinput += PL_utf8skip[nextchr];
                 nextchr = UCHARAT(locinput);
-           if (!REGINCLASS(scan, nextchr))
-               sayNO;
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           nextchr = UCHARAT(++locinput);
+           }
+           else {
+               if (nextchr < 0)
+                   nextchr = UCHARAT(locinput);
+               if (!reginclass(scan, (U8*)locinput, do_utf8))
+                   sayNO;
+               if (!nextchr && locinput >= PL_regeol)
+                   sayNO;
+               nextchr = UCHARAT(++locinput);
+           }
             break;
         case ALNUML:
             PL_reg_flags |= RF_tainted;
@@ -2122,19 +2116,8 @@ S_regmatch(pTHX_ regnode *prog)
         case ALNUM:
             if (!nextchr)
                 sayNO;
-           if (!(OP(scan) == ALNUM
-                 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case ALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case ALNUMUTF8:
-           if (!nextchr)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (!(OP(scan) == ALNUMUTF8
+           if (do_utf8) {
+               if (!(OP(scan) == ALNUM
                       ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
                       : isALNUM_LC_utf8((U8*)locinput)))
                 {
@@ -2144,7 +2127,7 @@ S_regmatch(pTHX_ regnode *prog)
                 nextchr = UCHARAT(locinput);
                 break;
             }
-           if (!(OP(scan) == ALNUMUTF8
+           if (!(OP(scan) == ALNUM
                   ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
                 sayNO;
             nextchr = UCHARAT(++locinput);
@@ -2155,19 +2138,8 @@ S_regmatch(pTHX_ regnode *prog)
         case NALNUM:
             if (!nextchr && locinput >= PL_regeol)
                 sayNO;
-           if (OP(scan) == NALNUM
-               ? isALNUM(nextchr) : isALNUM_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NALNUMLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NALNUMUTF8:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (OP(scan) == NALNUMUTF8
+           if (do_utf8) {
+               if (OP(scan) == NALNUM
                     ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
                     : isALNUM_LC_utf8((U8*)locinput))
                 {
@@ -2177,7 +2149,7 @@ S_regmatch(pTHX_ regnode *prog)
                 nextchr = UCHARAT(locinput);
                 break;
             }
-           if (OP(scan) == NALNUMUTF8
+           if (OP(scan) == NALNUM
                 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
                 sayNO;
             nextchr = UCHARAT(++locinput);
@@ -2189,42 +2161,38 @@ S_regmatch(pTHX_ regnode *prog)
         case BOUND:
         case NBOUND:
             /* was last char in word? */
-           ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
-           if (OP(scan) == BOUND || OP(scan) == NBOUND) {
-               ln = isALNUM(ln);
-               n = isALNUM(nextchr);
-           }
-           else {
-               ln = isALNUM_LC(ln);
-               n = isALNUM_LC(nextchr);
-           }
-           if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
-               sayNO;
-           break;
-       case BOUNDLUTF8:
-       case NBOUNDLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case BOUNDUTF8:
-       case NBOUNDUTF8:
-           /* was last char in word? */
-           if (locinput == PL_regbol)
-               ln = PL_regprev;
-           else {
-               U8 *r = reghop((U8*)locinput, -1);
-
-               ln = utf8_to_uv(r, s - (char*)r, 0, 0);
-           }
-           if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
-               ln = isALNUM_uni(ln);
-               n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
+           if (do_utf8) {
+               if (locinput == PL_regbol)
+                   ln = PL_regprev;
+               else {
+                   U8 *r = reghop((U8*)locinput, -1);
+                   
+                   ln = utf8_to_uv(r, s - (char*)r, 0, 0);
+               }
+               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+                   ln = isALNUM_uni(ln);
+                   n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
+               }
+               else {
+                   ln = isALNUM_LC_uni(ln);
+                   n = isALNUM_LC_utf8((U8*)locinput);
+               }
             }
             else {
-               ln = isALNUM_LC_uni(ln);
-               n = isALNUM_LC_utf8((U8*)locinput);
+               ln = (locinput != PL_regbol) ?
+                   UCHARAT(locinput - 1) : PL_regprev;
+               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+                   ln = isALNUM(ln);
+                   n = isALNUM(nextchr);
+               }
+               else {
+                   ln = isALNUM_LC(ln);
+                   n = isALNUM_LC(nextchr);
+               }
             }
-           if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
-               sayNO;
+           if (((!ln) == (!n)) == (OP(scan) == BOUND ||
+                                   OP(scan) == BOUNDL))
+                   sayNO;
             break;
         case SPACEL:
             PL_reg_flags |= RF_tainted;
@@ -2232,32 +2200,29 @@ S_regmatch(pTHX_ regnode *prog)
         case SPACE:
             if (!nextchr)
                 sayNO;
-           if (!(OP(scan) == SPACE
-                 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case SPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case SPACEUTF8:
-           if (!nextchr)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (!(OP(scan) == SPACEUTF8
-                     ? swash_fetch(PL_utf8_space, (U8*)locinput)
-                     : isSPACE_LC_utf8((U8*)locinput)))
-               {
-                   sayNO;
+           if (DO_UTF8(PL_reg_sv)) {
+               if (nextchr & 0x80) {
+                   if (!(OP(scan) == SPACE
+                         ? swash_fetch(PL_utf8_space, (U8*)locinput)
+                         : isSPACE_LC_utf8((U8*)locinput)))
+                   {
+                       sayNO;
+                   }
+                   locinput += PL_utf8skip[nextchr];
+                   nextchr = UCHARAT(locinput);
+                   break;
                 }
-               locinput += PL_utf8skip[nextchr];
-               nextchr = UCHARAT(locinput);
-               break;
+               if (!(OP(scan) == SPACE
+                     ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
+                   sayNO;
+               nextchr = UCHARAT(++locinput);
+           }
+           else {
+               if (!(OP(scan) == SPACE
+                     ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
+                   sayNO;
+               nextchr = UCHARAT(++locinput);
             }
-           if (!(OP(scan) == SPACEUTF8
-                 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
             break;
         case NSPACEL:
             PL_reg_flags |= RF_tainted;
@@ -2265,19 +2230,8 @@ S_regmatch(pTHX_ regnode *prog)
         case NSPACE:
             if (!nextchr && locinput >= PL_regeol)
                 sayNO;
-           if (OP(scan) == NSPACE
-               ? isSPACE(nextchr) : isSPACE_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NSPACELUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NSPACEUTF8:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (OP(scan) == NSPACEUTF8
+           if (DO_UTF8(PL_reg_sv)) {
+               if (OP(scan) == NSPACE
                     ? swash_fetch(PL_utf8_space, (U8*)locinput)
                     : isSPACE_LC_utf8((U8*)locinput))
                 {
@@ -2287,7 +2241,7 @@ S_regmatch(pTHX_ regnode *prog)
                 nextchr = UCHARAT(locinput);
                 break;
             }
-           if (OP(scan) == NSPACEUTF8
+           if (OP(scan) == NSPACE
                 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
                 sayNO;
             nextchr = UCHARAT(++locinput);
@@ -2298,19 +2252,8 @@ S_regmatch(pTHX_ regnode *prog)
         case DIGIT:
             if (!nextchr)
                 sayNO;
-           if (!(OP(scan) == DIGIT
-                 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case DIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case DIGITUTF8:
-           if (!nextchr)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (!(OP(scan) == DIGITUTF8
+           if (DO_UTF8(PL_reg_sv)) {
+               if (!(OP(scan) == DIGIT
                       ? swash_fetch(PL_utf8_digit, (U8*)locinput)
                       : isDIGIT_LC_utf8((U8*)locinput)))
                 {
@@ -2320,7 +2263,7 @@ S_regmatch(pTHX_ regnode *prog)
                 nextchr = UCHARAT(locinput);
                 break;
             }
-           if (!(OP(scan) == DIGITUTF8
+           if (!(OP(scan) == DIGIT
                   ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
                 sayNO;
             nextchr = UCHARAT(++locinput);
@@ -2331,19 +2274,8 @@ S_regmatch(pTHX_ regnode *prog)
         case NDIGIT:
             if (!nextchr && locinput >= PL_regeol)
                 sayNO;
-           if (OP(scan) == NDIGIT
-               ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
-               sayNO;
-           nextchr = UCHARAT(++locinput);
-           break;
-       case NDIGITLUTF8:
-           PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case NDIGITUTF8:
-           if (!nextchr && locinput >= PL_regeol)
-               sayNO;
-           if (nextchr & 0x80) {
-               if (OP(scan) == NDIGITUTF8
+           if (DO_UTF8(PL_reg_sv)) {
+               if (OP(scan) == NDIGIT
                     ? swash_fetch(PL_utf8_digit, (U8*)locinput)
                     : isDIGIT_LC_utf8((U8*)locinput))
                 {
@@ -2353,7 +2285,7 @@ S_regmatch(pTHX_ regnode *prog)
                 nextchr = UCHARAT(locinput);
                 break;
             }
-           if (OP(scan) == NDIGITUTF8
+           if (OP(scan) == NDIGIT
                 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
                 sayNO;
             nextchr = UCHARAT(++locinput);
@@ -3461,30 +3393,33 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
      register I32 c;
      register char *loceol = PL_regeol;
      register I32 hardcount = 0;
+    register bool do_utf8 = DO_UTF8(PL_reg_sv);
  
      scan = PL_reginput;
      if (max != REG_INFTY && max < loceol - scan)
        loceol = scan + max;
      switch (OP(p)) {
      case REG_ANY:
-       while (scan < loceol && *scan != '\n')
-           scan++;
-       break;
-    case SANY:
-       scan = loceol;
-       break;
-    case ANYUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && *scan != '\n') {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && *scan != '\n') {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && *scan != '\n')
+               scan++;
         }
         break;
-    case SANYUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+    case SANY:
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           scan = loceol;
         }
         break;
      case EXACT:                /* length of string is 1 */
@@ -3505,135 +3440,144 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
             scan++;
         break;
-    case ANYOFUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
-       }
-       break;
      case ANYOF:
-       while (scan < loceol && REGINCLASS(p, *scan))
-           scan++;
+       if (do_utf8) {
+           loceol = PL_regeol;
+           while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
+               scan++;
+       }
         break;
      case ALNUM:
-       while (scan < loceol && isALNUM(*scan))
-           scan++;
-       break;
-    case ALNUMUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isALNUM(*scan))
+               scan++;
         }
         break;
      case ALNUML:
         PL_reg_flags |= RF_tainted;
-       while (scan < loceol && isALNUM_LC(*scan))
-           scan++;
-       break;
-    case ALNUMLUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isALNUM_LC(*scan))
+               scan++;
         }
         break;
-       break;
      case NALNUM:
-       while (scan < loceol && !isALNUM(*scan))
-           scan++;
-       break;
-    case NALNUMUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isALNUM(*scan))
+               scan++;
         }
         break;
      case NALNUML:
         PL_reg_flags |= RF_tainted;
-       while (scan < loceol && !isALNUM_LC(*scan))
-           scan++;
-       break;
-    case NALNUMLUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isALNUM_LC(*scan))
+               scan++;
         }
         break;
      case SPACE:
-       while (scan < loceol && isSPACE(*scan))
-           scan++;
-       break;
-    case SPACEUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isSPACE(*scan))
+               scan++;
         }
         break;
      case SPACEL:
         PL_reg_flags |= RF_tainted;
-       while (scan < loceol && isSPACE_LC(*scan))
-           scan++;
-       break;
-    case SPACELUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isSPACE_LC(*scan))
+               scan++;
         }
         break;
      case NSPACE:
-       while (scan < loceol && !isSPACE(*scan))
-           scan++;
-       break;
-    case NSPACEUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isSPACE(*scan))
+               scan++;
+           break;
         }
-       break;
      case NSPACEL:
         PL_reg_flags |= RF_tainted;
-       while (scan < loceol && !isSPACE_LC(*scan))
-           scan++;
-       break;
-    case NSPACELUTF8:
-       PL_reg_flags |= RF_tainted;
-       loceol = PL_regeol;
-       while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol &&
+                  !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isSPACE_LC(*scan))
+               scan++;
         }
         break;
      case DIGIT:
-       while (scan < loceol && isDIGIT(*scan))
-           scan++;
-       break;
-    case DIGITUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isDIGIT(*scan))
+               scan++;
         }
         break;
-       break;
      case NDIGIT:
-       while (scan < loceol && !isDIGIT(*scan))
-           scan++;
-       break;
-    case NDIGITUTF8:
-       loceol = PL_regeol;
-       while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
-           scan += UTF8SKIP(scan);
-           hardcount++;
+       if (DO_UTF8(PL_reg_sv)) {
+           loceol = PL_regeol;
+           while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isDIGIT(*scan))
+               scan++;
         }
         break;
      default:           /* Called on something of 0 width. */
@@ -3712,102 +3656,139 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
  }
  
  /*
+- regclass_swash - prepare the utf8 swash
+*/
+
+SV *
+Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
+{
+    SV *sw = NULL;
+    SV *si = NULL;
+
+    if (PL_regdata && PL_regdata->count) {
+       U32 n = ARG(node);
+
+       if (PL_regdata->what[n] == 's') {
+           SV *rv = (SV*)PL_regdata->data[n];
+           AV *av = (AV*)SvRV((SV*)rv);
+           SV **a;
+           
+           si = *av_fetch(av, 0, FALSE);
+           a  =  av_fetch(av, 1, FALSE);
+           
+           if (a)
+               sw = *a;
+           else if (si && doinit) {
+               sw = swash_init("utf8", "", si, 1, 0);
+               (void)av_store(av, 1, sw);
+           }
+       }
+    }
+       
+    if (initsvp)
+       *initsvp = si;
+
+    return sw;
+}
+
+/*
   - reginclass - determine if a character falls into a character class
   */
  
  STATIC bool
-S_reginclass(pTHX_ register regnode *p, register I32 c)
+S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
  {
-    char flags = ANYOF_FLAGS(p);
+    char flags = ANYOF_FLAGS(n);
      bool match = FALSE;
  
-    c &= 0xFF;
-    if (ANYOF_BITMAP_TEST(p, c))
-       match = TRUE;
-    else if (flags & ANYOF_FOLD) {
-       I32 cf;
-       if (flags & ANYOF_LOCALE) {
-           PL_reg_flags |= RF_tainted;
-           cf = PL_fold_locale[c];
+    if (do_utf8 || (flags & ANYOF_UNICODE)) {
+       if (do_utf8 && !ANYOF_RUNTIME(n)) {
+           STRLEN len;
+           UV c = utf8_to_uv_simple(p, &len);
+
+           if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
+               match = TRUE;
         }
-       else
-           cf = PL_fold[c];
-       if (ANYOF_BITMAP_TEST(p, cf))
-           match = TRUE;
-    }
  
-    if (!match && (flags & ANYOF_CLASS)) {
-       PL_reg_flags |= RF_tainted;
-       if (
-           (ANYOF_CLASS_TEST(p, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
-           (ANYOF_CLASS_TEST(p, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_ASCII)   &&  isASCII(c))     ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NASCII)  && !isASCII(c))     ||
-           (ANYOF_CLASS_TEST(p, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
-           (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
-           (ANYOF_CLASS_TEST(p, ANYOF_BLANK)   &&  isBLANK(c))     ||
-           (ANYOF_CLASS_TEST(p, ANYOF_NBLANK)  && !isBLANK(c))
-           ) /* How's that for a conditional? */
-       {
-           match = TRUE;
+       if (!match) {
+           SV *sw = regclass_swash(n, TRUE, 0);
+       
+           if (sw) {
+               if (swash_fetch(sw, p))
+                   match = TRUE;
+               else if (flags & ANYOF_FOLD) {
+                   U8 tmpbuf[UTF8_MAXLEN+1];
+                   
+                   if (flags & ANYOF_LOCALE) {
+                       PL_reg_flags |= RF_tainted;
+                       uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
+                   }
+               else
+                   uv_to_utf8(tmpbuf, toLOWER_utf8(p));
+                   if (swash_fetch(sw, tmpbuf))
+                       match = TRUE;
+               }
+           }
         }
      }
+    else {
+       U8 c = *p;
  
-    return (flags & ANYOF_INVERT) ? !match : match;
-}
-
-STATIC bool
-S_reginclassutf8(pTHX_ regnode *f, U8 *p)
-{                                           
-    char flags = ARG1(f);
-    bool match = FALSE;
-#ifdef DEBUGGING
-    SV *rv = (SV*)PL_regdata->data[ARG2(f)];
-    AV *av = (AV*)SvRV((SV*)rv);
-    SV *sw = *av_fetch(av, 0, FALSE);
-    SV *lv = *av_fetch(av, 1, FALSE);
-#else
-    SV *sw = (SV*)PL_regdata->data[ARG2(f)];
-#endif
+       if (ANYOF_BITMAP_TEST(n, c))
+           match = TRUE;
+       else if (flags & ANYOF_FOLD) {
+           I32 f;
  
-    if (swash_fetch(sw, p))
-       match = TRUE;
-    else if (flags & ANYOF_FOLD) {
-       U8 tmpbuf[UTF8_MAXLEN+1];
-       if (flags & ANYOF_LOCALE) {
+           if (flags & ANYOF_LOCALE) {
+               PL_reg_flags |= RF_tainted;
+               f = PL_fold_locale[c];
+           }
+           else
+               f = PL_fold[c];
+           if (f != c && ANYOF_BITMAP_TEST(n, f))
+               match = TRUE;
+       }
+       
+       if (!match && (flags & ANYOF_CLASS)) {
             PL_reg_flags |= RF_tainted;
-           uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
+           if (
+               (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
+               (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
+               (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
+               (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
+               (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
+               (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
+               ) /* How's that for a conditional? */
+           {
+               match = TRUE;
+           }
         }
-       else
-           uv_to_utf8(tmpbuf, toLOWER_utf8(p));
-       if (swash_fetch(sw, tmpbuf))
-           match = TRUE;
      }
  
-    /* UTF8 combined with ANYOF_CLASS is ill-defined. */
-
      return (flags & ANYOF_INVERT) ? !match : match;
  }
  
@@ -3815,17 +3796,20 @@ STATIC U8 *
  S_reghop(pTHX_ U8 *s, I32 off)
  {                               
      if (off >= 0) {
-       while (off-- && s < (U8*)PL_regeol)
+       while (off-- && s < (U8*)PL_regeol) {
+           /* XXX could check well-formedness here */
             s += UTF8SKIP(s);
+       }
      }
      else {
         while (off++) {
             if (s > (U8*)PL_bostr) {
                 s--;
-               if (*s & 0x80) {
-                   while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
+               if (UTF8_IS_CONTINUED(*s)) {
+                   while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
                         s--;
-               }               /* XXX could check well-formedness here */
+               }
+               /* XXX could check well-formedness here */
             }
         }
      }
@@ -3836,8 +3820,10 @@ STATIC U8 *
  S_reghopmaybe(pTHX_ U8* s, I32 off)
  {
      if (off >= 0) {
-       while (off-- && s < (U8*)PL_regeol)
+       while (off-- && s < (U8*)PL_regeol) {
+           /* XXX could check well-formedness here */
             s += UTF8SKIP(s);
+       }
         if (off >= 0)
             return 0;
      }
@@ -3845,10 +3831,11 @@ S_reghopmaybe(pTHX_ U8* s, I32 off)
         while (off++) {
             if (s > (U8*)PL_bostr) {
                 s--;
-               if (*s & 0x80) {
-                   while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
+               if (UTF8_IS_CONTINUED(*s)) {
+                   while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
                         s--;
-               }               /* XXX could check well-formedness here */
+               }
+               /* XXX could check well-formedness here */
             }
             else
                 break;
diff --git a/regnodes.h b/regnodes.h

index 89c78e6..00dc0ec 100644 (file)
--- a/regnodes.h
+++ b/regnodes.h
@@ -13,76 +13,57 @@
  #define        MEOL    7       /*  0x7 Same, assuming multiline. */
  #define        SEOL    8       /*  0x8 Same, assuming singleline. */
  #define        BOUND   9       /*  0x9 Match "" at any word boundary */
-#define        BOUNDUTF8       10      /*  0xa Match "" at any word boundary */
-#define        BOUNDL  11      /*  0xb Match "" at any word boundary */
-#define        BOUNDLUTF8      12      /*  0xc Match "" at any word boundary */
-#define        NBOUND  13      /*  0xd Match "" at any word non-boundary */
-#define        NBOUNDUTF8      14      /*  0xe Match "" at any word non-boundary */
-#define        NBOUNDL 15      /*  0xf Match "" at any word non-boundary */
-#define        NBOUNDLUTF8     16      /* 0x10 Match "" at any word non-boundary */
-#define        GPOS    17      /* 0x11 Matches where last m//g left off. */
-#define        REG_ANY 18      /* 0x12 Match any one character (except newline). */
-#define        ANYUTF8 19      /* 0x13 Match any one Unicode character (except newline). */
-#define        SANY    20      /* 0x14 Match any one character. */
-#define        SANYUTF8        21      /* 0x15 Match any one Unicode character. */
-#define        ANYOF   22      /* 0x16 Match character in (or not in) this class. */
-#define        ANYOFUTF8       23      /* 0x17 Match character in (or not in) this class. */
-#define        ALNUM   24      /* 0x18 Match any alphanumeric character */
-#define        ALNUMUTF8       25      /* 0x19 Match any alphanumeric character in utf8 */
-#define        ALNUML  26      /* 0x1a Match any alphanumeric char in locale */
-#define        ALNUMLUTF8      27      /* 0x1b Match any alphanumeric char in locale+utf8 */
-#define        NALNUM  28      /* 0x1c Match any non-alphanumeric character */
-#define        NALNUMUTF8      29      /* 0x1d Match any non-alphanumeric character in utf8 */
-#define        NALNUML 30      /* 0x1e Match any non-alphanumeric char in locale */
-#define        NALNUMLUTF8     31      /* 0x1f Match any non-alphanumeric char in locale+utf8 */
-#define        SPACE   32      /* 0x20 Match any whitespace character */
-#define        SPACEUTF8       33      /* 0x21 Match any whitespace character in utf8 */
-#define        SPACEL  34      /* 0x22 Match any whitespace char in locale */
-#define        SPACELUTF8      35      /* 0x23 Match any whitespace char in locale+utf8 */
-#define        NSPACE  36      /* 0x24 Match any non-whitespace character */
-#define        NSPACEUTF8      37      /* 0x25 Match any non-whitespace character in utf8 */
-#define        NSPACEL 38      /* 0x26 Match any non-whitespace char in locale */
-#define        NSPACELUTF8     39      /* 0x27 Match any non-whitespace char in locale+utf8 */
-#define        DIGIT   40      /* 0x28 Match any numeric character */
-#define        DIGITUTF8       41      /* 0x29 Match any numeric character in utf8 */
-#define        DIGITL  42      /* 0x2a Match any numeric character in locale */
-#define        DIGITLUTF8      43      /* 0x2b Match any numeric character in locale+utf8 */
-#define        NDIGIT  44      /* 0x2c Match any non-numeric character */
-#define        NDIGITUTF8      45      /* 0x2d Match any non-numeric character in utf8 */
-#define        NDIGITL 46      /* 0x2e Match any non-numeric character in locale */
-#define        NDIGITLUTF8     47      /* 0x2f Match any non-numeric character in locale+utf8 */
-#define        CLUMP   48      /* 0x30 Match any combining character sequence */
-#define        BRANCH  49      /* 0x31 Match this alternative, or the next... */
-#define        BACK    50      /* 0x32 Match "", "next" ptr points backward. */
-#define        EXACT   51      /* 0x33 Match this string (preceded by length). */
-#define        EXACTF  52      /* 0x34 Match this string, folded (prec. by length). */
-#define        EXACTFL 53      /* 0x35 Match this string, folded in locale (w/len). */
-#define        NOTHING 54      /* 0x36 Match empty string. */
-#define        TAIL    55      /* 0x37 Match empty string. Can jump here from outside. */
-#define        STAR    56      /* 0x38 Match this (simple) thing 0 or more times. */
-#define        PLUS    57      /* 0x39 Match this (simple) thing 1 or more times. */
-#define        CURLY   58      /* 0x3a Match this simple thing {n,m} times. */
-#define        CURLYN  59      /* 0x3b Match next-after-this simple thing  */
-#define        CURLYM  60      /* 0x3c Match this medium-complex thing {n,m} times. */
-#define        CURLYX  61      /* 0x3d Match this complex thing {n,m} times. */
-#define        WHILEM  62      /* 0x3e Do curly processing and see if rest matches. */
-#define        OPEN    63      /* 0x3f Mark this point in input as start of #n. */
-#define        CLOSE   64      /* 0x40 Analogous to OPEN. */
-#define        REF     65      /* 0x41 Match some already matched string */
-#define        REFF    66      /* 0x42 Match already matched string, folded */
-#define        REFFL   67      /* 0x43 Match already matched string, folded in loc. */
-#define        IFMATCH 68      /* 0x44 Succeeds if the following matches. */
-#define        UNLESSM 69      /* 0x45 Fails if the following matches. */
-#define        SUSPEND 70      /* 0x46 "Independent" sub-RE. */
-#define        IFTHEN  71      /* 0x47 Switch, should be preceeded by switcher . */
-#define        GROUPP  72      /* 0x48 Whether the group matched. */
-#define        LONGJMP 73      /* 0x49 Jump far away. */
-#define        BRANCHJ 74      /* 0x4a BRANCH with long offset. */
-#define        EVAL    75      /* 0x4b Execute some Perl code. */
-#define        MINMOD  76      /* 0x4c Next operator is not greedy. */
-#define        LOGICAL 77      /* 0x4d Next opcode should set the flag only. */
-#define        RENUM   78      /* 0x4e Group with independently numbered parens. */
-#define        OPTIMIZED       79      /* 0x4f Placeholder for dump. */
+#define        BOUNDL  10      /*  0xa Match "" at any word boundary */
+#define        NBOUND  11      /*  0xb Match "" at any word non-boundary */
+#define        NBOUNDL 12      /*  0xc Match "" at any word non-boundary */
+#define        GPOS    13      /*  0xd Matches where last m//g left off. */
+#define        REG_ANY 14      /*  0xe Match any one character (except newline). */
+#define        SANY    15      /*  0xf Match any one character. */
+#define        ANYOF   16      /* 0x10 Match character in (or not in) this class. */
+#define        ALNUM   17      /* 0x11 Match any alphanumeric character */
+#define        ALNUML  18      /* 0x12 Match any alphanumeric char in locale */
+#define        NALNUM  19      /* 0x13 Match any non-alphanumeric character */
+#define        NALNUML 20      /* 0x14 Match any non-alphanumeric char in locale */
+#define        SPACE   21      /* 0x15 Match any whitespace character */
+#define        SPACEL  22      /* 0x16 Match any whitespace char in locale */
+#define        NSPACE  23      /* 0x17 Match any non-whitespace character */
+#define        NSPACEL 24      /* 0x18 Match any non-whitespace char in locale */
+#define        DIGIT   25      /* 0x19 Match any numeric character */
+#define        DIGITL  26      /* 0x1a Match any numeric character in locale */
+#define        NDIGIT  27      /* 0x1b Match any non-numeric character */
+#define        NDIGITL 28      /* 0x1c Match any non-numeric character in locale */
+#define        CLUMP   29      /* 0x1d Match any combining character sequence */
+#define        BRANCH  30      /* 0x1e Match this alternative, or the next... */
+#define        BACK    31      /* 0x1f Match "", "next" ptr points backward. */
+#define        EXACT   32      /* 0x20 Match this string (preceded by length). */
+#define        EXACTF  33      /* 0x21 Match this string, folded (prec. by length). */
+#define        EXACTFL 34      /* 0x22 Match this string, folded in locale (w/len). */
+#define        NOTHING 35      /* 0x23 Match empty string. */
+#define        TAIL    36      /* 0x24 Match empty string. Can jump here from outside. */
+#define        STAR    37      /* 0x25 Match this (simple) thing 0 or more times. */
+#define        PLUS    38      /* 0x26 Match this (simple) thing 1 or more times. */
+#define        CURLY   39      /* 0x27 Match this simple thing {n,m} times. */
+#define        CURLYN  40      /* 0x28 Match next-after-this simple thing  */
+#define        CURLYM  41      /* 0x29 Match this medium-complex thing {n,m} times. */
+#define        CURLYX  42      /* 0x2a Match this complex thing {n,m} times. */
+#define        WHILEM  43      /* 0x2b Do curly processing and see if rest matches. */
+#define        OPEN    44      /* 0x2c Mark this point in input as start of #n. */
+#define        CLOSE   45      /* 0x2d Analogous to OPEN. */
+#define        REF     46      /* 0x2e Match some already matched string */
+#define        REFF    47      /* 0x2f Match already matched string, folded */
+#define        REFFL   48      /* 0x30 Match already matched string, folded in loc. */
+#define        IFMATCH 49      /* 0x31 Succeeds if the following matches. */
+#define        UNLESSM 50      /* 0x32 Fails if the following matches. */
+#define        SUSPEND 51      /* 0x33 "Independent" sub-RE. */
+#define        IFTHEN  52      /* 0x34 Switch, should be preceeded by switcher . */
+#define        GROUPP  53      /* 0x35 Whether the group matched. */
+#define        LONGJMP 54      /* 0x36 Jump far away. */
+#define        BRANCHJ 55      /* 0x37 BRANCH with long offset. */
+#define        EVAL    56      /* 0x38 Execute some Perl code. */
+#define        MINMOD  57      /* 0x39 Next operator is not greedy. */
+#define        LOGICAL 58      /* 0x3a Next opcode should set the flag only. */
+#define        RENUM   59      /* 0x3b Group with independently numbered parens. */
+#define        OPTIMIZED       60      /* 0x3c Placeholder for dump. */
  
  #ifndef DOINIT
  EXTCONST U8 PL_regkind[];
@@ -98,44 +79,25 @@ EXTCONST U8 PL_regkind[] = {
         EOL,            /* MEOL */
         EOL,            /* SEOL */
         BOUND,          /* BOUND */
-       BOUND,          /* BOUNDUTF8 */
         BOUND,          /* BOUNDL */
-       BOUND,          /* BOUNDLUTF8 */
         NBOUND,         /* NBOUND */
-       NBOUND,         /* NBOUNDUTF8 */
         NBOUND,         /* NBOUNDL */
-       NBOUND,         /* NBOUNDLUTF8 */
         GPOS,           /* GPOS */
         REG_ANY,                /* REG_ANY */
-       REG_ANY,                /* ANYUTF8 */
         REG_ANY,                /* SANY */
-       REG_ANY,                /* SANYUTF8 */
         ANYOF,          /* ANYOF */
-       ANYOF,          /* ANYOFUTF8 */
         ALNUM,          /* ALNUM */
-       ALNUM,          /* ALNUMUTF8 */
         ALNUM,          /* ALNUML */
-       ALNUM,          /* ALNUMLUTF8 */
         NALNUM,         /* NALNUM */
-       NALNUM,         /* NALNUMUTF8 */
         NALNUM,         /* NALNUML */
-       NALNUM,         /* NALNUMLUTF8 */
         SPACE,          /* SPACE */
-       SPACE,          /* SPACEUTF8 */
         SPACE,          /* SPACEL */
-       SPACE,          /* SPACELUTF8 */
         NSPACE,         /* NSPACE */
-       NSPACE,         /* NSPACEUTF8 */
         NSPACE,         /* NSPACEL */
-       NSPACE,         /* NSPACELUTF8 */
         DIGIT,          /* DIGIT */
-       DIGIT,          /* DIGITUTF8 */
         DIGIT,          /* DIGITL */
-       DIGIT,          /* DIGITLUTF8 */
         NDIGIT,         /* NDIGIT */
-       NDIGIT,         /* NDIGITUTF8 */
         NDIGIT,         /* NDIGITL */
-       NDIGIT,         /* NDIGITLUTF8 */
         CLUMP,          /* CLUMP */
         BRANCH,         /* BRANCH */
         BACK,           /* BACK */
@@ -184,44 +146,25 @@ static const U8 regarglen[] = {
         0,              /* MEOL */
         0,              /* SEOL */
         0,              /* BOUND */
-       0,              /* BOUNDUTF8 */
         0,              /* BOUNDL */
-       0,              /* BOUNDLUTF8 */
         0,              /* NBOUND */
-       0,              /* NBOUNDUTF8 */
         0,              /* NBOUNDL */
-       0,              /* NBOUNDLUTF8 */
         0,              /* GPOS */
         0,              /* REG_ANY */
-       0,              /* ANYUTF8 */
         0,              /* SANY */
-       0,              /* SANYUTF8 */
         0,              /* ANYOF */
-       EXTRA_SIZE(struct regnode_1),           /* ANYOFUTF8 */
         0,              /* ALNUM */
-       0,              /* ALNUMUTF8 */
         0,              /* ALNUML */
-       0,              /* ALNUMLUTF8 */
         0,              /* NALNUM */
-       0,              /* NALNUMUTF8 */
         0,              /* NALNUML */
-       0,              /* NALNUMLUTF8 */
         0,              /* SPACE */
-       0,              /* SPACEUTF8 */
         0,              /* SPACEL */
-       0,              /* SPACELUTF8 */
         0,              /* NSPACE */
-       0,              /* NSPACEUTF8 */
         0,              /* NSPACEL */
-       0,              /* NSPACELUTF8 */
         0,              /* DIGIT */
-       0,              /* DIGITUTF8 */
         0,              /* DIGITL */
-       0,              /* DIGITLUTF8 */
         0,              /* NDIGIT */
-       0,              /* NDIGITUTF8 */
         0,              /* NDIGITL */
-       0,              /* NDIGITLUTF8 */
         0,              /* CLUMP */
         0,              /* BRANCH */
         0,              /* BACK */
@@ -267,44 +210,25 @@ static const char reg_off_by_arg[] = {
         0,              /* MEOL */
         0,              /* SEOL */
         0,              /* BOUND */
-       0,              /* BOUNDUTF8 */
         0,              /* BOUNDL */
-       0,              /* BOUNDLUTF8 */
         0,              /* NBOUND */
-       0,              /* NBOUNDUTF8 */
         0,              /* NBOUNDL */
-       0,              /* NBOUNDLUTF8 */
         0,              /* GPOS */
         0,              /* REG_ANY */
-       0,              /* ANYUTF8 */
         0,              /* SANY */
-       0,              /* SANYUTF8 */
         0,              /* ANYOF */
-       0,              /* ANYOFUTF8 */
         0,              /* ALNUM */
-       0,              /* ALNUMUTF8 */
         0,              /* ALNUML */
-       0,              /* ALNUMLUTF8 */
         0,              /* NALNUM */
-       0,              /* NALNUMUTF8 */
         0,              /* NALNUML */
-       0,              /* NALNUMLUTF8 */
         0,              /* SPACE */
-       0,              /* SPACEUTF8 */
         0,              /* SPACEL */
-       0,              /* SPACELUTF8 */
         0,              /* NSPACE */
-       0,              /* NSPACEUTF8 */
         0,              /* NSPACEL */
-       0,              /* NSPACELUTF8 */
         0,              /* DIGIT */
-       0,              /* DIGITUTF8 */
         0,              /* DIGITL */
-       0,              /* DIGITLUTF8 */
         0,              /* NDIGIT */
-       0,              /* NDIGITUTF8 */
         0,              /* NDIGITL */
-       0,              /* NDIGITLUTF8 */
         0,              /* CLUMP */
         0,              /* BRANCH */
         0,              /* BACK */
@@ -351,79 +275,60 @@ static const char * const reg_name[] = {
         "MEOL",         /*  0x7 */
         "SEOL",         /*  0x8 */
         "BOUND",                /*  0x9 */
-       "BOUNDUTF8",            /*  0xa */
-       "BOUNDL",               /*  0xb */
-       "BOUNDLUTF8",           /*  0xc */
-       "NBOUND",               /*  0xd */
-       "NBOUNDUTF8",           /*  0xe */
-       "NBOUNDL",              /*  0xf */
-       "NBOUNDLUTF8",          /* 0x10 */
-       "GPOS",         /* 0x11 */
-       "REG_ANY",              /* 0x12 */
-       "ANYUTF8",              /* 0x13 */
-       "SANY",         /* 0x14 */
-       "SANYUTF8",             /* 0x15 */
-       "ANYOF",                /* 0x16 */
-       "ANYOFUTF8",            /* 0x17 */
-       "ALNUM",                /* 0x18 */
-       "ALNUMUTF8",            /* 0x19 */
-       "ALNUML",               /* 0x1a */
-       "ALNUMLUTF8",           /* 0x1b */
-       "NALNUM",               /* 0x1c */
-       "NALNUMUTF8",           /* 0x1d */
-       "NALNUML",              /* 0x1e */
-       "NALNUMLUTF8",          /* 0x1f */
-       "SPACE",                /* 0x20 */
-       "SPACEUTF8",            /* 0x21 */
-       "SPACEL",               /* 0x22 */
-       "SPACELUTF8",           /* 0x23 */
-       "NSPACE",               /* 0x24 */
-       "NSPACEUTF8",           /* 0x25 */
-       "NSPACEL",              /* 0x26 */
-       "NSPACELUTF8",          /* 0x27 */
-       "DIGIT",                /* 0x28 */
-       "DIGITUTF8",            /* 0x29 */
-       "DIGITL",               /* 0x2a */
-       "DIGITLUTF8",           /* 0x2b */
-       "NDIGIT",               /* 0x2c */
-       "NDIGITUTF8",           /* 0x2d */
-       "NDIGITL",              /* 0x2e */
-       "NDIGITLUTF8",          /* 0x2f */
-       "CLUMP",                /* 0x30 */
-       "BRANCH",               /* 0x31 */
-       "BACK",         /* 0x32 */
-       "EXACT",                /* 0x33 */
-       "EXACTF",               /* 0x34 */
-       "EXACTFL",              /* 0x35 */
-       "NOTHING",              /* 0x36 */
-       "TAIL",         /* 0x37 */
-       "STAR",         /* 0x38 */
-       "PLUS",         /* 0x39 */
-       "CURLY",                /* 0x3a */
-       "CURLYN",               /* 0x3b */
-       "CURLYM",               /* 0x3c */
-       "CURLYX",               /* 0x3d */
-       "WHILEM",               /* 0x3e */
-       "OPEN",         /* 0x3f */
-       "CLOSE",                /* 0x40 */
-       "REF",          /* 0x41 */
-       "REFF",         /* 0x42 */
-       "REFFL",                /* 0x43 */
-       "IFMATCH",              /* 0x44 */
-       "UNLESSM",              /* 0x45 */
-       "SUSPEND",              /* 0x46 */
-       "IFTHEN",               /* 0x47 */
-       "GROUPP",               /* 0x48 */
-       "LONGJMP",              /* 0x49 */
-       "BRANCHJ",              /* 0x4a */
-       "EVAL",         /* 0x4b */
-       "MINMOD",               /* 0x4c */
-       "LOGICAL",              /* 0x4d */
-       "RENUM",                /* 0x4e */
-       "OPTIMIZED",            /* 0x4f */
+       "BOUNDL",               /*  0xa */
+       "NBOUND",               /*  0xb */
+       "NBOUNDL",              /*  0xc */
+       "GPOS",         /*  0xd */
+       "REG_ANY",              /*  0xe */
+       "SANY",         /*  0xf */
+       "ANYOF",                /* 0x10 */
+       "ALNUM",                /* 0x11 */
+       "ALNUML",               /* 0x12 */
+       "NALNUM",               /* 0x13 */
+       "NALNUML",              /* 0x14 */
+       "SPACE",                /* 0x15 */
+       "SPACEL",               /* 0x16 */
+       "NSPACE",               /* 0x17 */
+       "NSPACEL",              /* 0x18 */
+       "DIGIT",                /* 0x19 */
+       "DIGITL",               /* 0x1a */
+       "NDIGIT",               /* 0x1b */
+       "NDIGITL",              /* 0x1c */
+       "CLUMP",                /* 0x1d */
+       "BRANCH",               /* 0x1e */
+       "BACK",         /* 0x1f */
+       "EXACT",                /* 0x20 */
+       "EXACTF",               /* 0x21 */
+       "EXACTFL",              /* 0x22 */
+       "NOTHING",              /* 0x23 */
+       "TAIL",         /* 0x24 */
+       "STAR",         /* 0x25 */
+       "PLUS",         /* 0x26 */
+       "CURLY",                /* 0x27 */
+       "CURLYN",               /* 0x28 */
+       "CURLYM",               /* 0x29 */
+       "CURLYX",               /* 0x2a */
+       "WHILEM",               /* 0x2b */
+       "OPEN",         /* 0x2c */
+       "CLOSE",                /* 0x2d */
+       "REF",          /* 0x2e */
+       "REFF",         /* 0x2f */
+       "REFFL",                /* 0x30 */
+       "IFMATCH",              /* 0x31 */
+       "UNLESSM",              /* 0x32 */
+       "SUSPEND",              /* 0x33 */
+       "IFTHEN",               /* 0x34 */
+       "GROUPP",               /* 0x35 */
+       "LONGJMP",              /* 0x36 */
+       "BRANCHJ",              /* 0x37 */
+       "EVAL",         /* 0x38 */
+       "MINMOD",               /* 0x39 */
+       "LOGICAL",              /* 0x3a */
+       "RENUM",                /* 0x3b */
+       "OPTIMIZED",            /* 0x3c */
  };
  
-static const int reg_num = 80;
+static const int reg_num = 61;
  
  #endif /* DEBUGGING */
  #endif /* REG_COMP_C */
diff --git a/sv.c b/sv.c

index 1dafbf6..1fbf83f 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -4522,11 +4522,9 @@ Perl_sv_len_utf8(pTHX_ register SV *sv)
      if (!sv)
         return 0;
  
-#ifdef NOTYET
      if (SvGMAGICAL(sv))
         return mg_length(sv);
      else
-#endif
      {
         STRLEN len;
         U8 *s = (U8*)SvPV(sv, len);
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t

index ac42b85..cd9d56a 100644 (file)
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -5,6 +5,8 @@ BEGIN {
      @INC = '../lib';
  }
  
+no utf8; # this test contains raw 8-bit data on purpose; don't switch to \x{}
+
  print "1..78\n";
  
  my $test = 1;
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t

index 6986720..89416dc 100755 (executable)
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
      }
  }
  
-print "1..90\n";
+print "1..104\n";
  
  my $test = 1;
  
@@ -42,6 +42,7 @@ sub nok_bytes {
  
  {
      use utf8;
+
      $_ = ">\x{263A}<"; 
      s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg; 
      ok $_, '>&#9786;<';
@@ -106,212 +107,191 @@ sub nok_bytes {
  }
  
  {
-    use utf8;
-
-    $_ = "\x{263A}>\x{263A}\x{263A}"; 
-
-    ok length, 4;
-    $test++;                           # 13
-
-    ok length((m/>(.)/)[0]), 1;
-    $test++;                           # 14
-
-    ok length($&), 2;
-    $test++;                           # 15
+    # no use utf8 needed
+    $_ = "\x{263A}\x{263A}x\x{263A}y\x{263A}";
+    
+    ok length($_), 6;                  # 13
+    $test++;
  
-    ok length($'), 1;
-    $test++;                           # 16
+    ($a) = m/x(.)/;
  
-    ok length($`), 1;
-    $test++;                           # 17
+    ok length($a), 1;                  # 14
+    $test++;
  
-    ok length($1), 1;
-    $test++;                           # 18
+    ok length($`), 2;                  # 15
+    $test++;
+    ok length($&), 2;                  # 16
+    $test++;
+    ok length($'), 2;                  # 17
+    $test++;
  
-    ok length($tmp=$&), 2;
-    $test++;                           # 19
+    ok length($1), 1;                  # 18
+    $test++;
  
-    ok length($tmp=$'), 1;
-    $test++;                           # 20
+    ok length($b=$`), 2;               # 19
+    $test++;
  
-    ok length($tmp=$`), 1;
-    $test++;                           # 21
+    ok length($b=$&), 2;               # 20
+    $test++;
  
-    ok length($tmp=$1), 1;
-    $test++;                           # 22
+    ok length($b=$'), 2;               # 21
+    $test++;
  
-    {
-       use bytes;
+    ok length($b=$1), 1;               # 22
+    $test++;
  
-       my $tmp = $&;
-       ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
-       $test++;                                # 23
+    ok $a, "\x{263A}";                 # 23
+    $test++;
  
-       $tmp = $';
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 24
+    ok $`, "\x{263A}\x{263A}";         # 24
+    $test++;
  
-       $tmp = $`;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 25
+    ok $&, "x\x{263A}";                        # 25
+    $test++;
  
-       $tmp = $1;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 26
-    }
+    ok $', "y\x{263A}";                        # 26
+    $test++;
  
-    ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272);
-    $test++;                           # 27
+    ok $1, "\x{263A}";                 # 27
+    $test++;
  
-    ok_bytes $', pack("C*", 0342, 0230, 0272);
-    $test++;                           # 28
+    ok_bytes $a, "\342\230\272";       # 28
+    $test++;
  
-    ok_bytes $`, pack("C*", 0342, 0230, 0272);
-    $test++;                           # 29
+    ok_bytes $1, "\342\230\272";       # 29
+    $test++;
  
-    ok_bytes $1, pack("C*", 0342, 0230, 0272);
-    $test++;                           # 30
+    ok_bytes $&, "x\342\230\272";      # 30
+    $test++;
  
      {
-       use bytes;
-       no utf8;
-
-       ok length, 10;
-       $test++;                                # 31
+       use utf8; # required
+       $_ = chr(0x263A) . chr(0x263A) . 'x' . chr(0x263A) . 'y' . chr(0x263A);
+    }
  
-       ok length((m/>(.)/)[0]), 1;
-       $test++;                                # 32
+    ok length($_), 6;                  # 31
+    $test++;
  
-       ok length($&), 2;
-       $test++;                                # 33
+    ($a) = m/x(.)/;
  
-       ok length($'), 5;
-       $test++;                                # 34
+    ok length($a), 1;                  # 32
+    $test++;
  
-       ok length($`), 3;
-       $test++;                                # 35
+    ok length($`), 2;                  # 33
+    $test++;
  
-       ok length($1), 1;
-       $test++;                                # 36
+    ok length($&), 2;                  # 34
+    $test++;
  
-       ok $&, pack("C*", ord(">"), 0342);
-       $test++;                                # 37
+    ok length($'), 2;                  # 35
+    $test++;
  
-       ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
-       $test++;                                # 38
+    ok length($1), 1;                  # 36
+    $test++;
  
-       ok $`, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 39
+    ok length($b=$`), 2;               # 37
+    $test++;
  
-       ok $1, pack("C*", 0342);
-       $test++;                                # 40
-    }
+    ok length($b=$&), 2;               # 38
+    $test++;
  
-    {
-       no utf8;
-       $_="\342\230\272>\342\230\272\342\230\272";
-    }
+    ok length($b=$'), 2;               # 39
+    $test++;
  
-    ok length, 10;
-    $test++;                           # 41
+    ok length($b=$1), 1;               # 40
+    $test++;
  
-    ok length((m/>(.)/)[0]), 1;
-    $test++;                           # 42
+    ok $a, "\x{263A}";                 # 41
+    $test++;
  
-    ok length($&), 2;
-    $test++;                           # 43
+    ok $`, "\x{263A}\x{263A}";         # 42
+    $test++;
  
-    ok length($'), 1;
-    $test++;                           # 44
+    ok $&, "x\x{263A}";                        # 43
+    $test++;
  
-    ok length($`), 1;
-    $test++;                           # 45
+    ok $', "y\x{263A}";                        # 44
+    $test++;
  
-    ok length($1), 1;
-    $test++;                           # 46
+    ok $1, "\x{263A}";                 # 45
+    $test++;
  
-    ok length($tmp=$&), 2;
-    $test++;                           # 47
+    ok_bytes $a, "\342\230\272";       # 46
+    $test++;
  
-    ok length($tmp=$'), 1;
-    $test++;                           # 48
+    ok_bytes $1, "\342\230\272";       # 47
+    $test++;
  
-    ok length($tmp=$`), 1;
-    $test++;                           # 49
+    ok_bytes $&, "x\342\230\272";      # 48
+    $test++;
  
-    ok length($tmp=$1), 1;
-    $test++;                           # 50
+    $_ = "\342\230\272\342\230\272x\342\230\272y\342\230\272";
  
-    {
-       use bytes;
+    ok length($_), 14;                 # 49
+    $test++;
  
-        my $tmp = $&;
-       ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
-       $test++;                                # 51
+    ($a) = m/x(.)/;
  
-        $tmp = $';
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 52
+    ok length($a), 1;                  # 50
+    $test++;
  
-        $tmp = $`;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 53
+    ok length($`), 6;                  # 51
+    $test++;
  
-        $tmp = $1;
-       ok $tmp, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 54
-    }
+    ok length($&), 2;                  # 52
+    $test++;
  
-    {
-       use bytes;
-       no utf8;
+    ok length($'), 6;                  # 53
+    $test++;
  
-       ok length, 10;
-       $test++;                                # 55
+    ok length($1), 1;                  # 54
+    $test++;
  
-       ok length((m/>(.)/)[0]), 1;
-       $test++;                                # 56
+    ok length($b=$`), 6;               # 55
+    $test++;
  
-       ok length($&), 2;
-       $test++;                                # 57
+    ok length($b=$&), 2;               # 56
+    $test++;
  
-       ok length($'), 5;
-       $test++;                                # 58
+    ok length($b=$'), 6;               # 57
+    $test++;
  
-       ok length($`), 3;
-       $test++;                                # 59
+    ok length($b=$1), 1;               # 58
+    $test++;
  
-       ok length($1), 1;
-       $test++;                                # 60
+    ok $a, "\342";                     # 59
+    $test++;
  
-       ok $&, pack("C*", ord(">"), 0342);
-       $test++;                                # 61
+    ok $`, "\342\230\272\342\230\272"; # 60
+    $test++;
  
-       ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
-       $test++;                                # 62
+    ok $&, "x\342";                    # 61
+    $test++;
  
-       ok $`, pack("C*", 0342, 0230, 0272);
-       $test++;                                # 63
+    ok $', "\230\272y\342\230\272";    # 62
+    $test++;
  
-       ok $1, pack("C*", 0342);
-       $test++;                                # 64
-    }
+    ok $1, "\342";                     # 63
+    $test++;
+}
  
+{
+    use utf8;
      ok "\x{ab}" =~ /^\x{ab}$/, 1;
-    $test++;                                   # 65
+    $test++;                           # 64
  }
  
  {
      use utf8;
      ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2);
-    $test++;                # 66
+    $test++;                # 65
  }
  
  {
      use utf8;
      my @a = map ord, split(//, join("", map chr, (1234, 123, 2345)));
      ok "@a", "1234 123 2345";
-    $test++;                # 67
+    $test++;                # 66
  }
  
  {
@@ -319,7 +299,7 @@ sub nok_bytes {
      my $x = chr(123);
      my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345)));
      ok "@a", "1234 2345";
-    $test++;                # 68
+    $test++;                # 67
  }
  
  {
@@ -331,10 +311,10 @@ sub nok_bytes {
      { use utf8;  $b = "\xe4"     } # \xXX must not produce UTF-8
  
      print "not " if $a eq $b;
-    print "ok $test\n"; $test++;
+    print "ok $test\n"; $test++;       # 68
  
      { use utf8; print "not " if $a eq $b; }
-    print "ok $test\n"; $test++;
+    print "ok $test\n"; $test++;       # 69
  }
  
  {
@@ -344,7 +324,7 @@ sub nok_bytes {
      for (@x) {
         s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
         my($latin) = /^(.+)(?:\s+\d)/;
-       print $latin eq "stra\337e" ? "ok $test\n" :
+       print $latin eq "stra\337e" ? "ok $test\n" :    # 70, 71
             "#latin[$latin]\nnot ok $test\n";
         $test++;
         $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
@@ -369,7 +349,7 @@ sub nok_bytes {
      }
  
      print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B";
-    print "ok $test\n";
+    print "ok $test\n";                        # 72
      $test++;
  }
  
@@ -384,27 +364,27 @@ sub nok_bytes {
      print "not "
         unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a;
      print "ok $test\n";
-    $test++;
+    $test++;                           # 73
  
      my ($a, $b) = split(/\x{100}/, $s);
      print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20";
      print "ok $test\n";
-    $test++;
+    $test++;                           # 74
  
      my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s);
      print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20";
      print "ok $test\n";
-    $test++;
+    $test++;                           # 75
  
      my ($a, $b) = split(/\x40\x{80}/, $s);
      print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20";
      print "ok $test\n";
-    $test++;
+    $test++;                           # 76
  
      my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s);
      print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20";
      print "ok $test\n";
-    $test++;
+    $test++;                           # 77
  }
  
  {
@@ -414,14 +394,14 @@ sub nok_bytes {
  
      my $smiley = "\x{263a}";
  
-    for my $s ("\x{263a}",                     #  1
-              $smiley,                        #  2
+    for my $s ("\x{263a}",                     # 78
+              $smiley,                        # 79
                 
-              "" . $smiley,                   #  3
-              "" . "\x{263a}",                #  4
+              "" . $smiley,                   # 80
+              "" . "\x{263a}",                # 81
  
-              $smiley    . "",                #  5
-              "\x{263a}" . "",                #  6
+              $smiley    . "",                # 82
+              "\x{263a}" . "",                # 83
                ) {
         my $length_chars = length($s);
         my $length_bytes;
@@ -437,14 +417,14 @@ sub nok_bytes {
         $test++;
      }
  
-    for my $s ("\x{263a}" . "\x{263a}",        #  7
-              $smiley    . $smiley,           #  8
+    for my $s ("\x{263a}" . "\x{263a}",        # 84
+              $smiley    . $smiley,           # 85
  
-              "\x{263a}\x{263a}",             #  9
-              "$smiley$smiley",               # 10
+              "\x{263a}\x{263a}",             # 86
+              "$smiley$smiley",               # 87
                
-              "\x{263a}" x 2,                 # 11
-              $smiley    x 2,                 # 12
+              "\x{263a}" x 2,                 # 88
+              $smiley    x 2,                 # 89
                ) {
         my $length_chars = length($s);
         my $length_bytes;
@@ -460,3 +440,106 @@ sub nok_bytes {
         $test++;
      }
  }
+
+{
+    use utf8;
+
+    print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 90
+
+    print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 91
+
+    print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 92
+
+    print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 93
+
+    print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 94
+
+    print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 95
+
+    print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
+    print "ok $test\n";
+    $test++;                                   # 96
+
+    print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
+    print "ok $test\n";
+    $test++;                                   # 97
+}
+
+{
+    # the first half of 20001028.003
+
+    my $X = chr(1448);
+    my ($Y) = $X =~ /(.*)/;
+    print "not " unless length $Y == 1;
+    print "ok $test\n";
+    $test++;                                   # 98
+}
+
+{
+    # 20001108.001
+
+    use utf8;
+    my $X = "Szab\x{f3},Bal\x{e1}zs";
+    my $Y = $X;
+    $Y =~ s/(B)/$1/ for 0..3;
+    print "not " unless $Y eq $X;
+    print "ok $test\n";
+    $test++;                                   # 99
+}
+
+{
+    # 20001114.001     
+
+    use utf8;
+    use charnames ':full';
+    my $text = "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}";
+    print "not " unless ord($text) == 0xc4;
+    print "ok $test\n";
+    $test++;                                    # 100
+}
+
+{
+    # 20001205.014
+
+    use utf8;
+
+    my $a = "ABC\x{263A}";
+
+    my @b = split( //, $a );
+
+    print "not " unless @b == 4;
+    print "ok $test\n";
+    $test++;                                    # 101
+
+    print "not " unless length($b[3]) == 1;
+    print "ok $test\n";
+    $test++;                                    # 102
+
+    $a =~ s/^A/Z/;
+    print "not " unless length($a) == 4;
+    print "ok $test\n";
+    $test++;                                    # 103
+}
+
+{
+    # the second half of 20001028.003
+
+    use utf8;
+    $X =~ s/^/chr(1488)/e;
+    print "not " unless length $X == 1;
+    print "ok $test\n";
+    $test++;                                   # 104
+}
+
author	Jarkko Hietaniemi <jhi@iki.fi>
	Sun, 17 Dec 2000 05:31:37 +0000 (05:31 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Sun, 17 Dec 2000 05:31:37 +0000 (05:31 +0000)
embed.h		patch \| blob \| history
embed.pl		patch \| blob \| history
mg.c		patch \| blob \| history
objXSUB.h		patch \| blob \| history
pp_ctl.c		patch \| blob \| history
pp_hot.c		patch \| blob \| history
proto.h		patch \| blob \| history
regcomp.c		patch \| blob \| history
regcomp.h		patch \| blob \| history
regcomp.sym		patch \| blob \| history
regexec.c		patch \| blob \| history
regnodes.h		patch \| blob \| history
sv.c		patch \| blob \| history
t/op/utf8decode.t		patch \| blob \| history
t/pragma/utf8.t		patch \| blob \| history