* Have an ASCII and a UC16 interpreter for Irregexp bytecodes -

author erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Tue, 9 Dec 2008 08:30:49 +0000 (08:30 +0000)

committer erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Tue, 9 Dec 2008 08:30:49 +0000 (08:30 +0000)
author erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Tue, 9 Dec 2008 08:30:49 +0000 (08:30 +0000)
committer erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Tue, 9 Dec 2008 08:30:49 +0000 (08:30 +0000)
diff --git a/src/interpreter-irregexp.cc b/src/interpreter-irregexp.cc

index 73701b9ac18c033764818bf75424653efc4990cd..14184438ef81f80af9f723ef0def621e652fbb11 100644 (file)
--- a/src/interpreter-irregexp.cc
+++ b/src/interpreter-irregexp.cc
@@ -60,6 +60,22 @@ static bool BackRefMatchesNoCase(int from,
  }
  
  
+static bool BackRefMatchesNoCase(int from,     // first run starts here
+                                 int current,  // second run starts here
+                                 int len,      // characters to compare
+                                 Vector<const char> subject) {  // char overload
+  for (int i = 0; i < len; i++) {  // walk both runs in lock step
+    unsigned int old_char = subject[from++];  // unsigned so c - 'A' wraps
+    unsigned int new_char = subject[current++];
+    if (old_char == new_char) continue;  // equal as-is, no folding needed
+    if (old_char - 'A' <= 'Z' - 'A') old_char |= 0x20;  // 'A'..'Z' -> 'a'..'z'
+    if (new_char - 'A' <= 'Z' - 'A') new_char |= 0x20;  // same fold, other side
+    if (old_char != new_char) return false;  // still differ after folding
+  }
+  return true;  // every pair matched, ignoring ASCII letter case
+}
+
+
  #ifdef DEBUG
  static void TraceInterpreter(const byte* code_base,
                               const byte* pc,
@@ -96,8 +112,9 @@ static void TraceInterpreter(const byte* code_base,
  
  
  
+template <typename Char>
  static bool RawMatch(const byte* code_base,
-                     Vector<const uc16> subject,
+                     Vector<const Char> subject,
                       int* registers,
                       int current,
                       int current_char) {
@@ -405,23 +422,32 @@ static bool RawMatch(const byte* code_base,
  
  
  bool IrregexpInterpreter::Match(Handle<ByteArray> code_array,
-                                Handle<String> subject16,
+                                Handle<String> subject,
                                  int* registers,
                                  int start_position) {
-  ASSERT(StringShape(*subject16).IsTwoByteRepresentation());
-  ASSERT(subject16->IsFlat(StringShape(*subject16)));
+  ASSERT(subject->IsFlat(StringShape(*subject)));
  
    AssertNoAllocation a;
    const byte* code_base = code_array->GetDataStartAddress();
+  StringShape subject_shape(*subject);
    uc16 previous_char = '\n';
-  Vector<const uc16> subject_vector =
-      Vector<const uc16>(subject16->GetTwoByteData(), subject16->length());
-  if (start_position != 0) previous_char = subject_vector[start_position - 1];
-  return RawMatch(code_base,
-                  subject_vector,
-                  registers,
-                  start_position,
-                  previous_char);
+  if (subject_shape.IsAsciiRepresentation()) {
+    Vector<const char> subject_vector = subject->ToAsciiVector();
+    if (start_position != 0) previous_char = subject_vector[start_position - 1];
+    return RawMatch(code_base,
+                    subject_vector,
+                    registers,
+                    start_position,
+                    previous_char);
+  } else {
+    Vector<const uc16> subject_vector = subject->ToUC16Vector();
+    if (start_position != 0) previous_char = subject_vector[start_position - 1];
+    return RawMatch(code_base,
+                    subject_vector,
+                    registers,
+                    start_position,
+                    previous_char);
+  }
  }
  
  } }  // namespace v8::internal
diff --git a/src/interpreter-irregexp.h b/src/interpreter-irregexp.h

index 2393d74d3669cd0391e119557edc92a2a8d67953..c65cb9ecd5fa84b1b30e091395d17a4b7bc73cb1 100644 (file)
--- a/src/interpreter-irregexp.h
+++ b/src/interpreter-irregexp.h
@@ -36,7 +36,7 @@ namespace v8 { namespace internal {
  class IrregexpInterpreter {
   public:
    static bool Match(Handle<ByteArray> code,
-                    Handle<String> subject16,
+                    Handle<String> subject,
                      int* captures,
                      int start_position);
  };
diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index 13c717220deac22f0b8453b5546768859c1ae6e8..b60b1a62e01230c87e90b6098d069294c9db8151 100644 (file)
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -883,12 +883,13 @@ Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
  
    int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
  
+  if (!subject->IsFlat(StringShape(*subject))) {
+    FlattenString(subject);
+  }
+
    switch (tag) {
      case RegExpMacroAssembler::kIA32Implementation: {
  #ifndef ARM
-      if (!subject->IsFlat(StringShape(*subject))) {
-        FlattenString(subject);
-      }
        Handle<Code> code = IrregexpNativeCode(irregexp);
  
        StringShape shape(*subject);
@@ -962,10 +963,8 @@ Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<FixedArray> irregexp,
        }
        Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
  
-      Handle<String> two_byte_subject = CachedStringToTwoByte(subject);
-
        rc = IrregexpInterpreter::Match(byte_codes,
-                                      two_byte_subject,
+                                      subject,
                                        offsets_vector,
                                        previous_index);
        break;
@@ -1191,7 +1190,7 @@ DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
  
  class RegExpCompiler {
   public:
-  RegExpCompiler(int capture_count, bool ignore_case);
+  RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
  
    int AllocateRegister() { return next_register_++; }
  
@@ -1215,6 +1214,7 @@ class RegExpCompiler {
    inline void DecrementRecursionDepth() { recursion_depth_--; }
  
    inline bool ignore_case() { return ignore_case_; }
+  inline bool ascii() { return ascii_; }
  
   private:
    EndNode* accept_;
@@ -1223,6 +1223,7 @@ class RegExpCompiler {
    int recursion_depth_;
    RegExpMacroAssembler* macro_assembler_;
    bool ignore_case_;
+  bool ascii_;
  };
  
  
@@ -1239,11 +1240,12 @@ class RecursionCheck {
  
  // Attempts to compile the regexp using an Irregexp code generator.  Returns
  // a fixed array or a null handle depending on whether it succeeded.
-RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)
+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
      : next_register_(2 * (capture_count + 1)),
        work_list_(NULL),
        recursion_depth_(0),
-      ignore_case_(ignore_case) {
+      ignore_case_(ignore_case),
+      ascii_(ascii) {
    accept_ = new EndNode(EndNode::ACCEPT);
  }
  
@@ -1682,7 +1684,6 @@ static inline void EmitAtomLetters(
                                        chars[0],
                                        chars[1],
                                        on_failure)) {
-          ok.Unuse();
          } else {
            macro_assembler->CheckCharacter(chars[0], &ok);
            macro_assembler->CheckNotCharacter(chars[1], on_failure);
@@ -1711,8 +1712,12 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
                            RegExpCharacterClass* cc,
                            int cp_offset,
                            Label* on_failure,
-                          bool check_offset) {
+                          bool check_offset,
+                          bool ascii) {
    ZoneList<CharacterRange>* ranges = cc->ranges();
+  const int max_char = ascii ?
+                       String::kMaxAsciiCharCode :
+                       String::kMaxUC16CharCode;
  
    Label success;
  
@@ -1721,16 +1726,27 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
  
    int range_count = ranges->length();
  
-  if (range_count == 0) {
+  int last_valid_range = range_count - 1;
+  while (last_valid_range >= 0) {
+    CharacterRange& range = ranges->at(last_valid_range);
+    if (range.from() <= max_char) {
+      break;
+    }
+    last_valid_range--;
+  }
+
+  if (last_valid_range < 0) {
      if (!cc->is_negated()) {
+      // TODO(plesner): We can remove this when the node level does our
+      // ASCII optimizations for us.
        macro_assembler->GoTo(on_failure);
      }
      return;
    }
  
-  if (range_count == 1 &&
+  if (last_valid_range == 0 &&
        !cc->is_negated() &&
-      ranges->at(0).IsEverything(0xffff)) {
+      ranges->at(0).IsEverything(max_char)) {
      // This is a common case hit by non-anchored expressions.
      // TODO(erikcorry): We should have a macro assembler instruction that just
      // checks for end of string without loading the character.
@@ -1748,18 +1764,22 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
      macro_assembler->LoadCurrentCharacterUnchecked(cp_offset);
    }
  
-  for (int i = 0; i < range_count - 1; i++) {
+  for (int i = 0; i <= last_valid_range; i++) {
      CharacterRange& range = ranges->at(i);
      Label next_range;
      uc16 from = range.from();
      uc16 to = range.to();
+    if (from > max_char) {
+      continue;
+    }
+    if (to > max_char) to = max_char;
      if (to == from) {
        macro_assembler->CheckCharacter(to, char_is_in_class);
      } else {
        if (from != 0) {
          macro_assembler->CheckCharacterLT(from, &next_range);
        }
-      if (to != 0xffff) {
+      if (to != max_char) {
          macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
        } else {
          macro_assembler->GoTo(char_is_in_class);
@@ -1768,10 +1788,13 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
      macro_assembler->Bind(&next_range);
    }
  
-  CharacterRange& range = ranges->at(range_count - 1);
+  CharacterRange& range = ranges->at(last_valid_range);
    uc16 from = range.from();
    uc16 to = range.to();
  
+  if (to > max_char) to = max_char;
+  ASSERT(to >= from);
+
    if (to == from) {
      if (cc->is_negated()) {
        macro_assembler->CheckCharacter(to, on_failure);
@@ -1786,7 +1809,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
          macro_assembler->CheckCharacterLT(from, on_failure);
        }
      }
-    if (to != 0xffff) {
+    if (to != String::kMaxUC16CharCode) {
        if (cc->is_negated()) {
          macro_assembler->CheckCharacterLT(to + 1, on_failure);
        } else {
@@ -1875,7 +1898,25 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
      macro_assembler->GoTo(backtrack);
      return true;
    }
-  // First, handle straight character matches.
+  // First check for non-ASCII text.
+  // TODO(plesner): We should do this at node level.
+  if (compiler->ascii()) {
+    for (int i = element_count - 1; i >= 0; i--) {
+      TextElement elm = elms_->at(i);
+      if (elm.type == TextElement::ATOM) {
+        Vector<const uc16> quarks = elm.data.u_atom->data();
+        for (int j = quarks.length() - 1; j >= 0; j--) {
+          if (quarks[j] > String::kMaxAsciiCharCode) {
+            macro_assembler->GoTo(backtrack);
+            return true;
+          }
+        }
+      } else {
+        ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
+      }
+    }
+  }
+  // Second, handle straight character matches.
    int checked_up_to = -1;
    for (int i = element_count - 1; i >= 0; i--) {
      TextElement elm = elms_->at(i);
@@ -1902,7 +1943,7 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
        ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
      }
    }
-  // Second, handle case independent letter matches if any.
+  // Third, handle case independent letter matches if any.
    if (compiler->ignore_case()) {
      for (int i = element_count - 1; i >= 0; i--) {
        TextElement elm = elms_->at(i);
@@ -1930,7 +1971,8 @@ bool TextNode::Emit(RegExpCompiler* compiler, GenerationVariant* variant) {
                      cc,
                      cp_offset,
                      backtrack,
-                    checked_up_to < cp_offset);
+                    checked_up_to < cp_offset,
+                    compiler->ascii());
        if (cp_offset > checked_up_to) checked_up_to = cp_offset;
      }
    }
@@ -2791,7 +2833,7 @@ static void AddClassNegated(const uc16 *elmv,
                              int elmc,
                              ZoneList<CharacterRange>* ranges) {
    ASSERT(elmv[0] != 0x0000);
-  ASSERT(elmv[elmc-1] != 0xFFFF);
+  ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
    uc16 last = 0x0000;
    for (int i = 0; i < elmc; i += 2) {
      ASSERT(last <= elmv[i] - 1);
@@ -2799,7 +2841,7 @@ static void AddClassNegated(const uc16 *elmv,
      ranges->Add(CharacterRange(last, elmv[i] - 1));
      last = elmv[i + 1] + 1;
    }
-  ranges->Add(CharacterRange(last, 0xFFFF));
+  ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
  }
  
  
@@ -3187,7 +3229,7 @@ void DispatchTable::AddRange(CharacterRange full_range, int value) {
        entry->AddValue(value);
        // Bail out if the last interval ended at 0xFFFF since otherwise
        // adding 1 will wrap around to 0.
-      if (entry->to() == 0xFFFF)
+      if (entry->to() == String::kMaxUC16CharCode)
          break;
        ASSERT(entry->to() + 1 > current.from());
        current.set_from(entry->to() + 1);
@@ -3562,14 +3604,14 @@ void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
      if (last < range.from())
        AddRange(CharacterRange(last, range.from() - 1));
      if (range.to() >= last) {
-      if (range.to() == 0xFFFF) {
+      if (range.to() == String::kMaxUC16CharCode) {
          return;
        } else {
          last = range.to() + 1;
        }
      }
    }
-  AddRange(CharacterRange(last, 0xFFFF));
+  AddRange(CharacterRange(last, String::kMaxUC16CharCode));
  }
  
  
@@ -3611,7 +3653,7 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
                                           bool is_multiline,
                                           Handle<String> pattern,
                                           bool is_ascii) {
-  RegExpCompiler compiler(input->capture_count, ignore_case);
+  RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii);
    // Wrap the body of the regexp in capture #0.
    RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
                                                      0,
diff --git a/src/objects.cc b/src/objects.cc

index edc5a0d03c86afcd4d03f9863b9cd236d597ac43..51a2ea3988d6644978376217193758507556a4cb 100644 (file)
--- a/src/objects.cc
+++ b/src/objects.cc
@@ -48,6 +48,9 @@ namespace v8 { namespace internal {
  const int kGetterIndex = 0;
  const int kSetterIndex = 1;
  
+const int String::kMaxAsciiCharCode;
+const int String::kMaxUC16CharCode;
+
  bool Object::IsInstanceOf(FunctionTemplateInfo* expected) {
    // There is a constraint on the object; check
    if (!this->IsJSObject()) return false;
diff --git a/src/objects.h b/src/objects.h

index 1d2ba896eb57dd82deb4e281b15a2905f520d7c9..b7e77c20e2da1ccd7e7f5518998f4637b4174823 100644 (file)
--- a/src/objects.h
+++ b/src/objects.h
@@ -3212,6 +3212,7 @@ class String: public HeapObject {
  
    // Max ascii char code.
    static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar;
+  static const int kMaxUC16CharCode = 0xffff;
  
    // Minimum length for a cons or sliced string.
    static const int kMinNonFlatLength = 13;
author	erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Tue, 9 Dec 2008 08:30:49 +0000 (08:30 +0000)
committer	erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Tue, 9 Dec 2008 08:30:49 +0000 (08:30 +0000)
src/interpreter-irregexp.cc		patch \| blob \| history
src/interpreter-irregexp.h		patch \| blob \| history
src/jsregexp.cc		patch \| blob \| history
src/objects.cc		patch \| blob \| history
src/objects.h		patch \| blob \| history