}
+// Tells whether |range| includes one of the three code points above the
+// one-byte range that are checked for here: 0x39c, 0x3bc and 0x178.
+// Always answers false when ENABLE_LATIN_1 is not defined.
+static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
+#ifdef ENABLE_LATIN_1
+  // TODO(dcarney): this could be a lot more efficient.
+  if (range.Contains(0x39c)) return true;
+  if (range.Contains(0x3bc)) return true;
+  return range.Contains(0x178);
+#else
+  return false;
+#endif
+}
+
+
+#ifdef ENABLE_LATIN_1
+// Returns true as soon as one entry of |ranges| satisfies
+// RangeContainsLatin1Equivalents, and false if none does.
+static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
+  // TODO(dcarney): this could be a lot more efficient.
+  const int count = ranges->length();
+  for (int index = 0; index < count; ++index) {
+    if (RangeContainsLatin1Equivalents(ranges->at(index))) return true;
+  }
+  return false;
+}
+#endif
+
+
RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
return set_replacement(NULL);
}
#else
- if (quarks[j] <= String::kMaxOneByteCharCode) continue;
+ uint16_t c = quarks[j];
+ if (c <= String::kMaxOneByteCharCode) continue;
if (!ignore_case) return set_replacement(NULL);
// Here, we need to check for characters whose upper and lower cases
// are outside the Latin-1 range.
- if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
- return set_replacement(NULL);
- }
+ uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
+ // Character is outside Latin-1 completely
+ if (converted == 0) return set_replacement(NULL);
+ // Convert quark to Latin-1 in place.
+ uint16_t* copy = const_cast<uint16_t*>(quarks.start());
+ copy[j] = converted;
#endif
}
} else {
ASSERT(elm.type == TextElement::CHAR_CLASS);
-#ifdef ENABLE_LATIN_1
- // TODO(dcarney): Can this be improved?
- if (ignore_case) continue;
-#endif
RegExpCharacterClass* cc = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
if (!CharacterRange::IsCanonical(ranges)) {
if (range_count != 0 &&
ranges->at(0).from() == 0 &&
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
+#ifdef ENABLE_LATIN_1
+ // This will be handled in a later filter.
+ if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
+#endif
return set_replacement(NULL);
}
} else {
if (range_count == 0 ||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
+#ifdef ENABLE_LATIN_1
+ // This will be handled in a later filter.
+ if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
+#endif
return set_replacement(NULL);
}
}
Isolate* isolate = Isolate::Current();
uc16 bottom = from();
uc16 top = to();
- if (is_ascii) {
+ if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
if (bottom > String::kMaxOneByteCharCode) return;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
+ // Latin-1 range
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
+ 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
static const byte* StringCharacterPosition(String* subject, int start_index);
- // Byte map of ASCII characters with a 0xff if the character is a word
+ // Byte map of one byte characters with a 0xff if the character is a word
// character (digit, letter or underscore) and 0x00 otherwise.
// Used by generated RegExp code.
- static const byte word_character_map[128];
+ static const byte word_character_map[256];
static Address word_character_map_address() {
return const_cast<Address>(&word_character_map[0]);
// Fast check for a junk value. A valid string may start from a
// whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
// the 'I' character ('Infinity'). All of that have codes not greater than
- // '9' except 'I'.
- if (data[start_pos] != 'I') {
+ // '9' except 'I' and the no-break space (0xa0).
+ if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
return isolate->heap()->nan_value();
}
} else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
}
-bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
+// Maps a code point above Latin1::kMaxChar to the Latin-1 character it is
+// treated as equivalent to. Returns 0 when |c| is not one of the code points
+// listed in the switch below.
+uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
ASSERT(c > Latin1::kMaxChar);
switch (c) {
- case 0x130:
- case 0x131:
- case 0x149:
+ // These are equivalent characters in Unicode; both map to 0xb5.
+ case 0x39c:
+ case 0x3bc:
+ return 0xb5;
+ // This is the uppercase of a Latin-1 character (0xff) that lies
+ // outside of Latin-1.
case 0x178:
- case 0x17f:
- case 0x1f0:
- case 0x1e96:
- case 0x1e97:
- case 0x1e98:
- case 0x1e99:
- case 0x1e9a:
- case 0x1e9e:
- case 0x212a:
- case 0x212b:
- case 0xfb00:
- case 0xfb01:
- case 0xfb02:
- case 0xfb03:
- case 0xfb04:
- case 0xfb05:
- case 0xfb06:
- return true;
+ return 0xff;
}
- return false;
+ return 0;
}
#else
static const unsigned kMaxChar = 0xff;
#endif
- static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
+ // Returns 0 if the character does not convert to a single Latin-1 character
+ // or if the character does not convert back to Latin-1 via the inverse
+ // operation (upper to lower, etc.).
+ static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
class Utf8 {
}
-static bool CanBeConvertedToLatin1(uint16_t c) {
- CHECK(c > unibrow::Latin1::kMaxChar);
- uint32_t result[4];
+
+#ifdef ENABLE_LATIN_1
+// Applies the unibrow mapping Op to |c| and returns the first element of the
+// output. Returns 0 when Op::Convert reports zero output characters, or when
+// it reports more than one and return_first is false.
+template<typename Op, bool return_first>
+static uint16_t ConvertLatin1(uint16_t c) {
+ uint32_t result[Op::kMaxWidth];
int chars;
- chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
- if (chars > 0) {
- CHECK_LE(chars, static_cast<int>(sizeof(result)));
- for (int i = 0; i < chars; i++) {
- if (result[i] <= unibrow::Latin1::kMaxChar) {
- return true;
- }
- }
- }
- chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
- if (chars > 0) {
- CHECK_LE(chars, static_cast<int>(sizeof(result)));
- for (int i = 0; i < chars; i++) {
- if (result[i] <= unibrow::Latin1::kMaxChar) {
- return true;
- }
- }
+ chars = Op::Convert(c, 0, result, NULL);
+ if (chars == 0) return 0;
+ // Bound by the number of elements in |result|, not by its size in bytes.
+ CHECK_LE(chars, static_cast<int>(Op::kMaxWidth));
+ if (!return_first && chars > 1) {
+ return 0;
}
- return false;
+ return result[0];
}
-TEST(Latin1) {
-#ifndef ENABLE_LATIN_1
- if (true) return;
-#endif
- for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
- CHECK_EQ(CanBeConvertedToLatin1(c),
- unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
+// Checks |test| against the Ecma262UnCanonicalize result for |c|; a result
+// above Latin1::kMaxChar is treated as 0 before the comparison.
+static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
+  uint16_t uncanonical =
+      ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
+  uint16_t expect =
+      uncanonical > unibrow::Latin1::kMaxChar ? 0 : uncanonical;
+  CHECK_EQ(expect, test);
+}
+
+
+// Compares Latin1::ConvertNonLatin1ToLatin1 with the unibrow case mappings
+// for every 16-bit code point above Latin1::kMaxChar.
+TEST(Latin1IgnoreCase) {
+ // NOTE(review): this unconditional return disables everything below.
+ if (true) return;
+ using namespace unibrow;
+ // c is 16 bits wide, so the loop ends when the increment wraps it to 0.
+ for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) {
+ // With return_first == false, a conversion that yields zero or more than
+ // one character is reported as 0.
+ uint16_t lower = ConvertLatin1<ToLowercase, false>(c);
+ uint16_t upper = ConvertLatin1<ToUppercase, false>(c);
+ uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c);
+ // Filter out all characters whose upper is not their lower or vice versa.
+ if (lower == 0 && upper == 0) {
+ CheckCanonicalEquivalence(c, test);
+ continue;
+ }
+ // Both case variants lie outside Latin-1.
+ if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
+ CheckCanonicalEquivalence(c, test);
+ continue;
+ }
+ // Only an uppercase result exists: derive lower from it.
+ if (lower == 0 && upper != 0) {
+ lower = ConvertLatin1<ToLowercase, false>(upper);
+ }
+ // No uppercase result and lower differs from c: derive upper from lower.
+ if (upper == 0 && lower != c) {
+ upper = ConvertLatin1<ToUppercase, false>(lower);
+ }
+ // Re-check the range now that the missing variant has been filled in.
+ if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
+ CheckCanonicalEquivalence(c, test);
+ continue;
+ }
+ // Neither variant is c itself.
+ if (upper != c && lower != c) {
+ CheckCanonicalEquivalence(c, test);
+ continue;
+ }
+ // Otherwise the conversion must equal the smaller of the two variants.
+ CHECK_EQ(Min(upper, lower), test);
}
}
+#endif // ENABLE_LATIN_1
// Should have hit the branch for the following char codes:
// [A-Z], [192-222] but not 215
assertEquals((90-65+1)+(222-192-1+1), total_lo);
+
+// Latin-1 whitespace character
+assertEquals( 1, +(String.fromCharCode(0xA0) + '1') );
+
+// Latin-1 \W characters
+assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g));
+
+// Latin-1 character that uppercases out of Latin-1.
+assertTrue(/\u0178/i.test('\u00ff'));
+
+// Unicode equivalence
+assertTrue(/\u039c/i.test('\u00b5'));
+assertTrue(/\u039c/i.test('\u03bc'));
+assertTrue(/\u00b5/i.test('\u03bc'));
+// Unicode equivalence ranges
+assertTrue(/[\u039b-\u039d]/i.test('\u00b5'));
+assertFalse(/[^\u039b-\u039d]/i.test('\u00b5'));
+assertFalse(/[\u039b-\u039d]/.test('\u00b5'));
+assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));