From 640f820da331e7bc688f3f8820b2760fa5a09de6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Thu, 9 Aug 2007 07:49:16 +0000 Subject: [PATCH] =?utf8?q?Optimize=20split=20//=20From:=20"=C3=86var=20Arn?= =?utf8?q?fj=C3=B6r=C3=B0=20Bjarmason"=20=20Message-ID:?= =?utf8?q?=20<51dd1af80708090049p2cf4810ep5a437ad53f64fa78@mail.gmail.com>?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit p4raw-id: //depot/perl@31693 --- pod/perlreapi.pod | 10 ++++++++++ pp.c | 37 +++++++++++++++++++++++++++++++++++++ regcomp.c | 6 +++++- regexp.h | 1 + 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod index 1e35869..5425740 100644 --- a/pod/perlreapi.pod +++ b/pod/perlreapi.pod @@ -188,6 +188,16 @@ whether RXf_PMf_LOCALE is set. Perl's engine sets this flag if the pattern is C<\s+>. +=item RXf_NULL + +Tells the split operator to split the target string on +characters. The definition of character varies depending on whether +the target string is a UTF-8 string. + +Perl's engine sets this flag on empty patterns, this optimization +makes C<split //> much faster than it would otherwise be, it's even +faster than C<unpack>. + =back =head2 exec diff --git a/pp.c b/pp.c index 51af7d8..02e530f 100644 --- a/pp.c +++ b/pp.c @@ -4711,6 +4711,43 @@ PP(pp_split) s = m; } } + else if (rx->extflags & RXf_NULL && !(s >= strend)) { + /* + Pre-extend the stack, either the number of bytes or + characters in the string or a limited amount, triggered by: + + my ($x, $y) = split //, $str; + or + split //, $str, $i; + */ + const U32 items = limit - 1; + if (items < slen) + EXTEND(SP, items); + else + EXTEND(SP, slen); + + while (--limit) { + m = s; + + if (do_utf8) + s += UTF8SKIP(s); + else + ++s; + + dstr = newSVpvn(m, s-m); + + if (make_mortal) + sv_2mortal(dstr); + if (do_utf8) + (void)SvUTF8_on(dstr); + + PUSHs(dstr); + + /* are we there yet? 
*/ + if (s >= strend) + break; + } + } else if (do_utf8 == ((rx->extflags & RXf_UTF8) != 0) && (rx->extflags & RXf_USE_INTUIT) && !rx->nparens && (rx->extflags & RXf_CHECK_ALL) diff --git a/regcomp.c b/regcomp.c index cada4cd..f06fb68 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4753,6 +4753,8 @@ reStudy: r->paren_names = NULL; #ifdef STUPID_PATTERN_CHECKS + if (r->prelen == 0) + r->extflags |= RXf_NULL; if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); @@ -4769,7 +4771,9 @@ reStudy: U8 fop = OP(first); U8 nop = OP(NEXTOPER(first)); - if (PL_regkind[fop] == BOL && nop == END) + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) r->extflags |= RXf_START_ONLY; else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) r->extflags |= RXf_WHITE; diff --git a/regexp.h b/regexp.h index 27f17e7..6de89c9 100644 --- a/regexp.h +++ b/regexp.h @@ -240,6 +240,7 @@ and check for NULL. #define RXf_SKIPWHITE 0x00000100 /* Pattern is for a split / / */ #define RXf_START_ONLY 0x00000200 /* Pattern is /^/ */ #define RXf_WHITE 0x00000400 /* Pattern is /\s+/ */ +#define RXf_NULL 0x40000000 /* Pattern is // */ /* 0x1F800 of extflags is used by (RXf_)PMf_COMPILETIME */ #define RXf_PMf_LOCALE 0x00000800 /* use locale */ -- 2.7.4