Cleanup latin-1 conversion check in regexp engine

author yangguo@chromium.org <yangguo@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Wed, 16 Jan 2013 13:04:07 +0000 (13:04 +0000)

committer yangguo@chromium.org <yangguo@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Wed, 16 Jan 2013 13:04:07 +0000 (13:04 +0000)
author yangguo@chromium.org <yangguo@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Wed, 16 Jan 2013 13:04:07 +0000 (13:04 +0000)
committer yangguo@chromium.org <yangguo@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Wed, 16 Jan 2013 13:04:07 +0000 (13:04 +0000)
diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index a3efb859ac66867d6931f9f1f8affb84254a9e44..f6e2e7f905052dfff5d8e38c26ca40abaf4d801c 100644 (file)
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -2875,23 +2875,9 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
          if (!ignore_case) return set_replacement(NULL);
          // Here, we need to check for characters whose upper and lower cases
          // are outside the Latin-1 range.
-        // TODO(dcarney): Replace this code with a simple
-        // table lookup in unibrow::Latin-1.
-        // TODO(dcarney): Test cases!.
-        unibrow::uchar result;
-        int chars;
-        chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);
-        if (chars > 1 ||
-            (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
-          continue;
-        }
-        chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);
-        if (chars > 1 ||
-            (chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
-          continue;
+        if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
+          return set_replacement(NULL);
          }
-        // This character is definitely not in the Latin-1 range.
-        return set_replacement(NULL);
  #endif
        }
      } else {
diff --git a/src/unicode-inl.h b/src/unicode-inl.h

index c3a00ed710eb1da94ea8e2fe5e56cadd4db9f3be..b4e2cb52130c5ca8cb94c6ca26f403a28dd1cb56 100644 (file)
--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@@ -79,6 +79,36 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
  }
  
  
+bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
+  ASSERT(c > Latin1::kMaxChar);
+  switch (c) {
+    case 0x130:
+    case 0x131:
+    case 0x149:
+    case 0x178:
+    case 0x17f:
+    case 0x1f0:
+    case 0x1e96:
+    case 0x1e97:
+    case 0x1e98:
+    case 0x1e99:
+    case 0x1e9a:
+    case 0x1e9e:
+    case 0x212a:
+    case 0x212b:
+    case 0xfb00:
+    case 0xfb01:
+    case 0xfb02:
+    case 0xfb03:
+    case 0xfb04:
+    case 0xfb05:
+    case 0xfb06:
+      return true;
+  }
+  return false;
+}
+
+
  unsigned Utf8::Encode(char* str, uchar c, int previous) {
    static const int kMask = ~(1 << 6);
    if (c <= kMaxOneByteChar) {
diff --git a/src/unicode.h b/src/unicode.h

index 61dbab4f00c2088d724c434957ddb1a94736054b..0278576317c256450c11ebe3ec52cf4100a5530d 100644 (file)
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -140,6 +140,7 @@ class Latin1 {
  #else
    static const unsigned kMaxChar = 0xff;
  #endif
+  static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
  };
  
  class Utf8 {
diff --git a/test/cctest/test-strings.cc b/test/cctest/test-strings.cc

index 0d2971b5ba31e09a7f75fd3af630e0479eea40ea..3684b879738262d1875834cd9554dbaa3e34e10b 100644 (file)
--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@@ -1275,3 +1275,40 @@ TEST(IsAscii) {
    CHECK(String::IsAscii(static_cast<char*>(NULL), 0));
    CHECK(String::IsOneByte(static_cast<uc16*>(NULL), 0));
  }
+
+
+static bool CanBeConvertedToLatin1(uint16_t c) {
+  CHECK(c > unibrow::Latin1::kMaxChar);
+  uint32_t result[4];
+  int chars;
+  chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
+  if (chars > 0) {
+    CHECK_LE(chars, static_cast<int>(sizeof(result)));
+    for (int i = 0; i < chars; i++) {
+      if (result[i] <= unibrow::Latin1::kMaxChar) {
+        return true;
+      }
+    }
+  }
+  chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
+  if (chars > 0) {
+    CHECK_LE(chars, static_cast<int>(sizeof(result)));
+    for (int i = 0; i < chars; i++) {
+      if (result[i] <= unibrow::Latin1::kMaxChar) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+TEST(Latin1) {
+#ifndef ENABLE_LATIN_1
+    if (true) return;
+#endif
+  for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
+    CHECK_EQ(CanBeConvertedToLatin1(c),
+             unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
+  }
+}
author	yangguo@chromium.org <yangguo@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Wed, 16 Jan 2013 13:04:07 +0000 (13:04 +0000)
committer	yangguo@chromium.org <yangguo@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Wed, 16 Jan 2013 13:04:07 +0000 (13:04 +0000)
src/jsregexp.cc		patch \| blob \| history
src/unicode-inl.h		patch \| blob \| history
src/unicode.h		patch \| blob \| history
test/cctest/test-strings.cc		patch \| blob \| history