From: erik.corry@gmail.com Date: Tue, 3 Apr 2012 12:24:55 +0000 (+0000) Subject: Switch regexp strategy for regexps that are just plain X-Git-Tag: upstream/4.7.83~16969 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=965fea65c2c643cdd97f98b8ae5cf3101bb3a309;p=platform%2Fupstream%2Fv8.git Switch regexp strategy for regexps that are just plain strings with a small alphabet. We already have code that handles these regexps well, we were just not always activating it. Review URL: https://chromiumcodereview.appspot.com/9959096 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11218 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- diff --git a/src/jsregexp.cc b/src/jsregexp.cc index 13d24cd..b7d0d30 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -108,6 +108,36 @@ static inline void ThrowRegExpException(Handle re, } +// More makes code generation slower, less makes V8 benchmark score lower. +const int kMaxLookaheadForBoyerMoore = 8; +// In a 3-character pattern you can maximally step forwards 3 characters +// at a time, which is not always enough to pay for the extra logic. +const int kPatternTooShortForBoyerMoore = 2; + + +// Identifies the sort of regexps where the regexp engine is faster +// than the code used for atom matches. +static bool HasFewDifferentCharacters(Handle pattern) { + int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); + if (length <= kPatternTooShortForBoyerMoore) return false; + const int kMod = 128; + bool character_found[kMod]; + int different = 0; + memset(&character_found[0], 0, sizeof(character_found)); + for (int i = 0; i < length; i++) { + int ch = (pattern->Get(i) & (kMod - 1)); + if (!character_found[ch]) { + character_found[ch] = true; + different++; + // We declare a regexp low-alphabet if it has at least 3 times as many + // characters as it has different characters. + if (different * 3 > length) return false; + } + } + return true; +} + + // Generic RegExp methods. Dispatches to implementation specific methods. @@ -141,9 +171,14 @@ Handle RegExpImpl::Compile(Handle re, return Handle::null(); } - if (parse_result.simple && !flags.is_ignore_case()) { + bool has_been_compiled = false; + + if (parse_result.simple && + !flags.is_ignore_case() && + !HasFewDifferentCharacters(pattern)) { // Parse-tree is a single atom that is equal to the pattern. AtomCompile(re, pattern, flags, pattern); + has_been_compiled = true; } else if (parse_result.tree->IsAtom() && !flags.is_ignore_case() && parse_result.capture_count == 0) { @@ -151,8 +186,12 @@ Handle RegExpImpl::Compile(Handle re, Vector atom_pattern = atom->data(); Handle atom_string = isolate->factory()->NewStringFromTwoByte(atom_pattern); - AtomCompile(re, pattern, flags, atom_string); - } else { + if (!HasFewDifferentCharacters(atom_string)) { + AtomCompile(re, pattern, flags, atom_string); + has_been_compiled = true; + } + } + if (!has_been_compiled) { IrregexpInitialize(re, pattern, flags, parse_result.capture_count); } ASSERT(re->data()->IsFixedArray()); @@ -3429,6 +3468,7 @@ bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { return true; } + /* Code generation for choice nodes. * * We generate quick checks that do a mask and compare to eliminate a @@ -3507,7 +3547,6 @@ bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { * \______________/ */ - void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); int choice_count = alternatives_->length(); @@ -3578,9 +3617,6 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { bool skip_was_emitted = false; - // More makes code generation slower, less makes V8 benchmark score lower. - const int kMaxLookaheadForBoyerMoore = 8; - if (!greedy_loop && choice_count == 2) { GuardedAlternative alt1 = alternatives_->at(1); if (alt1.guards() == NULL || alt1.guards()->length() == 0) {