Issue 227 Fixed. Properly handles non-ASCII characters in quick-check on ASCII strings.
author lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Wed, 11 Feb 2009 11:54:30 +0000 (11:54 +0000)
committer lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Wed, 11 Feb 2009 11:54:30 +0000 (11:54 +0000)
git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1248 ce2b1a6d-e550-0410-aec6-3dcde31c8c00

src/jsregexp.cc
test/mjsunit/regress/regress-227.js [new file with mode: 0644]

index dadec3ba6170738d8e41c51af5ab994ded7431df..717a66bceacf3448af4a2ae6d64e94fc4dc64441 100644 (file)
@@ -2227,9 +2227,17 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
       for (int i = 0; i < characters && i < quarks.length(); i++) {
         QuickCheckDetails::Position* pos =
             details->positions(characters_filled_in);
+        uc16 c = quarks[i];
+        if (c > char_mask) {
+          // If we expect a non-ASCII character from an ASCII string,
+          // there is no way we can match. Not even case independent
+          // matching can turn an ASCII character into non-ASCII or
+          // vice versa.
+          details->set_cannot_match();
+          return;
+        }
         if (compiler->ignore_case()) {
           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-          uc16 c = quarks[i];
           int length = uncanonicalize.get(c, '\0', chars);
           if (length < 2) {
             // This letter has no case equivalents, so it's nice and simple
@@ -2262,7 +2270,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
           // determine definitely whether we have a match at this character
           // position.
           pos->mask = char_mask;
-          pos->value = quarks[i];
+          pos->value = c;
           pos->determines_perfectly = true;
         }
         characters_filled_in++;
@@ -2658,47 +2666,52 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
         for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
           bool bound_checked = true;  // Most ops will check their bounds.
           if (first_element_checked && i == 0 && j == 0) continue;
-          if (quick_check != NULL &&
-              elm.cp_offset + j < quick_check->characters() &&
-              quick_check->positions(elm.cp_offset + j)->determines_perfectly) {
-            continue;
-          }
           if (pass == NON_ASCII_MATCH) {
             ASSERT(ascii);
             if (quarks[j] > String::kMaxAsciiCharCode) {
               assembler->GoTo(backtrack);
               return;
             }
-          } else if (pass == CHARACTER_MATCH) {
-            if (compiler->ignore_case()) {
-              bound_checked = EmitAtomNonLetter(assembler,
-                                                quarks[j],
-                                                backtrack,
-                                                cp_offset + j,
-                                                *checked_up_to < cp_offset + j,
-                                                preloaded);
-            } else {
-              if (!preloaded) {
-                assembler->LoadCurrentCharacter(cp_offset + j,
-                                                backtrack,
-                                                *checked_up_to < cp_offset + j);
+          } else {
+            if (quick_check != NULL &&
+                elm.cp_offset + j < quick_check->characters() &&
+                quick_check->positions(elm.cp_offset + j)->
+                    determines_perfectly) {
+              continue;
+            }
+            if (pass == CHARACTER_MATCH) {
+              if (compiler->ignore_case()) {
+                bound_checked = EmitAtomNonLetter(
+                    assembler,
+                    quarks[j],
+                    backtrack,
+                    cp_offset + j,
+                    *checked_up_to < cp_offset + j,
+                    preloaded);
+              } else {
+                if (!preloaded) {
+                  assembler->LoadCurrentCharacter(
+                      cp_offset + j,
+                      backtrack,
+                      *checked_up_to < cp_offset + j);
+                }
+                assembler->CheckNotCharacter(quarks[j], backtrack);
               }
-              assembler->CheckNotCharacter(quarks[j], backtrack);
+            } else {
+              ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
+              ASSERT(compiler->ignore_case());
+              bound_checked = EmitAtomLetter(assembler,
+                                             compiler->ascii(),
+                                             quarks[j],
+                                             backtrack,
+                                             cp_offset + j,
+                                             *checked_up_to < cp_offset + j,
+                                             preloaded);
             }
-          } else {
-            ASSERT_EQ(pass, CASE_CHARACTER_MATCH);
-            ASSERT(compiler->ignore_case());
-            bound_checked = EmitAtomLetter(assembler,
-                                           compiler->ascii(),
-                                           quarks[j],
-                                           backtrack,
-                                           cp_offset + j,
-                                           *checked_up_to < cp_offset + j,
-                                           preloaded);
-          }
-          if (pass != NON_ASCII_MATCH && bound_checked) {
-            if (cp_offset + j > *checked_up_to) {
-              *checked_up_to = cp_offset + j;
+            if (bound_checked) {
+              if (cp_offset + j > *checked_up_to) {
+                *checked_up_to = cp_offset + j;
+              }
             }
           }
         }
diff --git a/test/mjsunit/regress/regress-227.js b/test/mjsunit/regress/regress-227.js
new file mode 100644 (file)
index 0000000..ebb4627
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright 2009 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+var re = /\u23a1|x/;
+var res = re.exec("!");
+assertEquals(null, res, "Throwing away high bits on ASCII string");
+
+res = re.exec("!x");
+assertEquals(["x"], res, "Throwing away high bits on ASCII string");