ScriptData* ScriptData::PreCompile(const char* input, int length) {
- i::Utf8ToUC16CharacterStream stream(
+ i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const unsigned char*>(input), length);
return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
}
ScriptData* ScriptData::PreCompile(v8::Handle<String> source) {
i::Handle<i::String> str = Utils::OpenHandle(*source);
if (str->IsExternalTwoByteString()) {
- i::ExternalTwoByteStringUC16CharacterStream stream(
+ i::ExternalTwoByteStringUtf16CharacterStream stream(
i::Handle<i::ExternalTwoByteString>::cast(str), 0, str->length());
return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
} else {
- i::GenericStringUC16CharacterStream stream(str, 0, str->length());
+ i::GenericStringUtf16CharacterStream stream(str, 0, str->length());
return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
}
}
int String::Utf8Length() const {
i::Handle<i::String> str = Utils::OpenHandle(this);
if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
- return str->Utf8Length();
+ return i::Utf8Length(str);
}
int i;
int pos = 0;
int nchars = 0;
+ int previous = unibrow::Utf16::kNoPreviousCharacter;
for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
i::uc32 c = write_input_buffer.GetNext();
- int written = unibrow::Utf8::Encode(buffer + pos, c);
+ int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
pos += written;
nchars++;
+ previous = c;
}
if (i < len) {
// For the last characters we need to check the length for each one
char intermediate[unibrow::Utf8::kMaxEncodedSize];
for (; i < len && pos < capacity; i++) {
i::uc32 c = write_input_buffer.GetNext();
- int written = unibrow::Utf8::Encode(intermediate, c);
- if (pos + written <= capacity) {
- for (int j = 0; j < written; j++)
- buffer[pos + j] = intermediate[j];
+ if (unibrow::Utf16::IsTrailSurrogate(c) &&
+ unibrow::Utf16::IsLeadSurrogate(previous)) {
+ // We can't use the intermediate buffer here because the encoding
+ // of surrogate pairs is done under assumption that you can step
+ // back and fix the UTF8 stream. Luckily we only need space for one
+ // more byte, so there is always space.
+ ASSERT(pos < capacity);
+ int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
+ ASSERT(written == 1);
pos += written;
nchars++;
} else {
- // We've reached the end of the buffer
- break;
+ int written =
+ unibrow::Utf8::Encode(intermediate,
+ c,
+ unibrow::Utf16::kNoPreviousCharacter);
+ if (pos + written <= capacity) {
+ for (int j = 0; j < written; j++)
+ buffer[pos + j] = intermediate[j];
+ pos += written;
+ nchars++;
+ } else {
+ // We've reached the end of the buffer
+ break;
+ }
}
+ previous = c;
}
}
if (nchars_ref != NULL) *nchars_ref = nchars;
TryCatch try_catch;
Handle<String> str = obj->ToString();
if (str.IsEmpty()) return;
- length_ = str->Utf8Length();
+ i::Handle<i::String> i_str = Utils::OpenHandle(*str);
+ length_ = i::Utf8Length(i_str);
str_ = i::NewArray<char>(length_ + 1);
str->WriteUtf8(str_);
}
uc16 minus,
uc16 mask,
Label* on_not_equal) {
- ASSERT(minus < String::kMaxUC16CharCode);
+ ASSERT(minus < String::kMaxUtf16CodeUnit);
__ sub(r0, current_character(), Operand(minus));
__ and_(r0, r0, Operand(mask));
__ cmp(r0, Operand(c));
// Calculate the message size in UTF-8 encoding.
int utf8_len = 0;
+ int previous = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < message.length(); i++) {
- utf8_len += unibrow::Utf8::Length(message[i]);
+ uint16_t character = message[i];
+ utf8_len += unibrow::Utf8::Length(character, previous);
+ previous = character;
}
// Send the header.
// Send message body as UTF-8.
int buffer_position = 0; // Current buffer position.
+ previous = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < message.length(); i++) {
// Write next UTF-8 encoded character to buffer.
+ uint16_t character = message[i];
buffer_position +=
- unibrow::Utf8::Encode(buffer + buffer_position, message[i]);
+ unibrow::Utf8::Encode(buffer + buffer_position, character, previous);
ASSERT(buffer_position < kBufferSize);
// Send buffer if full or last character is encoded.
- if (kBufferSize - buffer_position < 3 || i == message.length() - 1) {
- conn->Send(buffer, buffer_position);
- buffer_position = 0;
+ if (kBufferSize - buffer_position <
+ unibrow::Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit ||
+ i == message.length() - 1) {
+ if (unibrow::Utf16::IsLeadSurrogate(character)) {
+ const int kEncodedSurrogateLength =
+ unibrow::Utf16::kUtf8BytesToCodeASurrogate;
+ ASSERT(buffer_position >= kEncodedSurrogateLength);
+ conn->Send(buffer, buffer_position - kEncodedSurrogateLength);
+ for (int i = 0; i < kEncodedSurrogateLength; i++) {
+ buffer[i] = buffer[buffer_position + i];
+ }
+ buffer_position = kEncodedSurrogateLength;
+ } else {
+ conn->Send(buffer, buffer_position);
+ buffer_position = 0;
+ }
}
+ previous = character;
}
return true;
// other bits set.
const uint64_t kQuietNaNMask = static_cast<uint64_t>(0xfff) << 51;
-// ASCII/UC16 constants
+// ASCII/UTF-16 constants
// Code-point values in Unicode 4.0 are 21 bits wide.
+// Code units in UTF-16 are 16 bits wide.
typedef uint16_t uc16;
typedef int32_t uc32;
const int kASCIISize = kCharSize;
}
+// This method determines the type of string involved and then gets the UTF8
+// length of the string.  It doesn't flatten the string and has log(n)
+// recursion for a string of length n.  If the failure flag gets set, then we
+// have to flatten the string and retry.  Failures are caused by surrogate
+// pairs in deep cons strings.
+//
+// Single surrogate characters that are encountered in the UTF-16 character
+// sequence of the input string get counted as 3 UTF-8 bytes, because that
+// is the way that WriteUtf8 will encode them.  Surrogate pairs are counted and
+// encoded as one 4-byte UTF-8 sequence.
+//
+// This function conceptually uses recursion on the two halves of cons strings.
+// However, in order to avoid the recursion going too deep it recurses on the
+// second string of the cons, but iterates on the first substring (by manually
+// eliminating it as a tail recursion).  This means it counts the UTF-8 length
+// from the end to the start, which makes no difference to the total.
+//
+// Surrogate pairs are recognized even if they are split across two sides of a
+// cons, which complicates the implementation somewhat.  Therefore, too deep
+// recursion cannot always be avoided.  This case is detected, and the failure
+// flag is set, a signal to the caller that the string should be flattened and
+// the operation retried.
+//
+// Arguments:
+//   input, from, to         The code units [from, to) of input are measured.
+//   followed_by_surrogate   True if the code unit that follows the range is a
+//                           trail surrogate, so that a lead surrogate ending
+//                           the range is counted as half of a pair.
+//   max_recursion           Remaining budget for recursing on the longer half
+//                           of a cons string.
+//   failure                 Set to true when the budget runs out.  The return
+//                           value is then 0 and must be ignored.
+//   starts_with_surrogate   Set to true if the range starts with a trail
+//                           surrogate and to false if it starts in an ASCII
+//                           string.  Otherwise it is left untouched, so the
+//                           caller must initialize it to false if it reads it.
+int Utf8LengthHelper(String* input,
+                     int from,
+                     int to,
+                     bool followed_by_surrogate,
+                     int max_recursion,
+                     bool* failure,
+                     bool* starts_with_surrogate) {
+  if (from == to) return 0;
+  int total = 0;
+  // Sink for starts_with_surrogate once the real answer has been written.
+  bool dummy;
+  while (true) {
+    if (input->IsAsciiRepresentation()) {
+      *starts_with_surrogate = false;
+      return total + to - from;
+    }
+    switch (StringShape(input).representation_tag()) {
+      case kConsStringTag: {
+        ConsString* str = ConsString::cast(input);
+        String* first = str->first();
+        String* second = str->second();
+        int first_length = first->length();
+        if (first_length - from > to - first_length) {
+          if (first_length < to) {
+            // Right hand side is shorter.  No need to check the recursion
+            // depth since this can only happen log(n) times.
+            bool right_starts_with_surrogate = false;
+            total += Utf8LengthHelper(second,
+                                      0,
+                                      to - first_length,
+                                      followed_by_surrogate,
+                                      max_recursion - 1,
+                                      failure,
+                                      &right_starts_with_surrogate);
+            if (*failure) return 0;
+            followed_by_surrogate = right_starts_with_surrogate;
+            input = first;
+            to = first_length;
+          } else {
+            // We only need the left hand side.
+            input = first;
+          }
+        } else {
+          if (first_length > from) {
+            // Left hand side is shorter.
+            if (first->IsAsciiRepresentation()) {
+              total += first_length - from;
+              // The start of the whole range is ASCII; whatever the right
+              // hand side starts with must not overwrite that answer.
+              *starts_with_surrogate = false;
+              starts_with_surrogate = &dummy;
+              input = second;
+              from = 0;
+              to -= first_length;
+            } else if (second->IsAsciiRepresentation()) {
+              followed_by_surrogate = false;
+              total += to - first_length;
+              input = first;
+              to = first_length;
+            } else if (max_recursion > 0) {
+              bool right_starts_with_surrogate = false;
+              // Recursing on the long one.  This may fail.
+              total += Utf8LengthHelper(second,
+                                        0,
+                                        to - first_length,
+                                        followed_by_surrogate,
+                                        max_recursion - 1,
+                                        failure,
+                                        &right_starts_with_surrogate);
+              if (*failure) return 0;
+              input = first;
+              to = first_length;
+              followed_by_surrogate = right_starts_with_surrogate;
+            } else {
+              *failure = true;
+              return 0;
+            }
+          } else {
+            // We only need the right hand side.  Rebase both ends of the
+            // range onto the second string: from may lie past the start of
+            // second, so it has to be shifted rather than reset to 0.
+            input = second;
+            from -= first_length;
+            to -= first_length;
+          }
+        }
+        continue;
+      }
+      case kExternalStringTag:
+      case kSeqStringTag: {
+        Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
+        const uc16* p = vector.start();
+        int previous = unibrow::Utf16::kNoPreviousCharacter;
+        for (int i = from; i < to; i++) {
+          uc16 c = p[i];
+          total += unibrow::Utf8::Length(c, previous);
+          previous = c;
+        }
+        if (to - from > 0) {
+          // A lead surrogate at the end of this piece pairs up with the trail
+          // surrogate that follows it in the next piece.
+          if (unibrow::Utf16::IsLeadSurrogate(previous) &&
+              followed_by_surrogate) {
+            total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+          }
+          if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
+            *starts_with_surrogate = true;
+          }
+        }
+        return total;
+      }
+      case kSlicedStringTag: {
+        // Measure the same range in the parent, shifted by the slice offset.
+        SlicedString* str = SlicedString::cast(input);
+        int offset = str->offset();
+        input = str->parent();
+        from += offset;
+        to += offset;
+        continue;
+      }
+      default:
+        break;
+    }
+    UNREACHABLE();
+    return 0;
+  }
+  return 0;
+}
+
+
+// Returns the UTF-8 length of str as computed by Utf8LengthHelper.  Whenever
+// the helper reports failure the string is flattened and measured again.
+int Utf8Length(Handle<String> str) {
+  const int kRecursionBudget = 100;
+  // Required by the helper's signature; the value is never read here.
+  bool ignored_starts_with_surrogate;
+  for (;;) {
+    bool failed = false;
+    int length = Utf8LengthHelper(*str,
+                                  0,
+                                  str->length(),
+                                  false,
+                                  kRecursionBudget,
+                                  &failed,
+                                  &ignored_starts_with_surrogate);
+    if (!failed) return length;
+    FlattenString(str);
+  }
+}
+
} } // namespace v8::internal
// string.
Handle<String> FlattenGetString(Handle<String> str);
+int Utf8Length(Handle<String> str);
+
Handle<Object> SetProperty(Handle<Object> object,
Handle<Object> key,
Handle<Object> value,
MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
PretenureFlag pretenure) {
- // V8 only supports characters in the Basic Multilingual Plane.
- const uc32 kMaxSupportedChar = 0xFFFF;
// Count the number of characters in the UTF-8 string and check if
// it is an ASCII string.
Access<UnicodeCache::Utf8Decoder>
decoder->Reset(string.start(), string.length());
int chars = 0;
while (decoder->has_more()) {
- decoder->GetNext();
- chars++;
+ uint32_t r = decoder->GetNext();
+ if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ chars++;
+ } else {
+ chars += 2;
+ }
}
Object* result;
// Convert and copy the characters into the new object.
String* string_result = String::cast(result);
decoder->Reset(string.start(), string.length());
- for (int i = 0; i < chars; i++) {
- uc32 r = decoder->GetNext();
- if (r > kMaxSupportedChar) { r = unibrow::Utf8::kBadChar; }
- string_result->Set(i, r);
+ int i = 0;
+ while (i < chars) {
+ uint32_t r = decoder->GetNext();
+ if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ string_result->Set(i++, unibrow::Utf16::LeadSurrogate(r));
+ string_result->Set(i++, unibrow::Utf16::TrailSurrogate(r));
+ } else {
+ string_result->Set(i++, r);
+ }
}
return result;
}
uint32_t hash_field) {
ASSERT(chars >= 0);
// Ensure the chars matches the number of characters in the buffer.
- ASSERT(static_cast<unsigned>(chars) == buffer->Length());
+ ASSERT(static_cast<unsigned>(chars) == buffer->Utf16Length());
// Determine whether the string is ASCII.
bool is_ascii = true;
while (buffer->has_more()) {
ASSERT_EQ(size, answer->Size());
// Fill in the characters.
- for (int i = 0; i < chars; i++) {
- answer->Set(i, buffer->GetNext());
+ int i = 0;
+ while (i < chars) {
+ uint32_t character = buffer->GetNext();
+ if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ answer->Set(i++, unibrow::Utf16::LeadSurrogate(character));
+ answer->Set(i++, unibrow::Utf16::TrailSurrogate(character));
+ } else {
+ answer->Set(i++, character);
+ }
}
return answer;
}
virtual bool DataEquals(HValue* other) { return true; }
virtual Range* InferRange(Zone* zone) {
- return new(zone) Range(0, String::kMaxUC16CharCode);
+ return new(zone) Range(0, String::kMaxUtf16CodeUnit);
}
};
uc16 minus,
uc16 mask,
Label* on_not_equal) {
- ASSERT(minus < String::kMaxUC16CharCode);
+ ASSERT(minus < String::kMaxUtf16CodeUnit);
__ lea(eax, Operand(current_character(), -minus));
__ and_(eax, mask);
__ cmp(eax, c);
if (ascii) {
char_mask = String::kMaxAsciiCharCode;
} else {
- char_mask = String::kMaxUC16CharCode;
+ char_mask = String::kMaxUtf16CodeUnit;
}
uc16 exor = c1 ^ c2;
// Check whether exor has only one bit set.
if (ascii) {
max_char = String::kMaxAsciiCharCode;
} else {
- max_char = String::kMaxUC16CharCode;
+ max_char = String::kMaxUtf16CodeUnit;
}
Label success;
macro_assembler->CheckCharacterLT(from, on_failure);
}
}
- if (to != String::kMaxUC16CharCode) {
+ if (to != String::kMaxUtf16CodeUnit) {
if (cc->is_negated()) {
macro_assembler->CheckCharacterLT(to + 1, on_failure);
} else {
if (asc) {
char_mask = String::kMaxAsciiCharCode;
} else {
- char_mask = String::kMaxUC16CharCode;
+ char_mask = String::kMaxUtf16CodeUnit;
}
mask_ = 0;
value_ = 0;
if (compiler->ascii()) {
char_mask = String::kMaxAsciiCharCode;
} else {
- char_mask = String::kMaxUC16CharCode;
+ char_mask = String::kMaxUtf16CodeUnit;
}
if ((mask & char_mask) == char_mask) need_mask = false;
mask &= char_mask;
if (compiler->ascii()) {
char_mask = String::kMaxAsciiCharCode;
} else {
- char_mask = String::kMaxUC16CharCode;
+ char_mask = String::kMaxUtf16CodeUnit;
}
for (int k = 0; k < elms_->length(); k++) {
TextElement elm = elms_->at(k);
int elmc,
ZoneList<CharacterRange>* ranges) {
ASSERT(elmv[0] != 0x0000);
- ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
+ ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
uc16 last = 0x0000;
for (int i = 0; i < elmc; i += 2) {
ASSERT(last <= elmv[i] - 1);
ranges->Add(CharacterRange(last, elmv[i] - 1));
last = elmv[i + 1] + 1;
}
- ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
+ ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
}
from = range.to();
i++;
}
- if (from < String::kMaxUC16CharCode) {
- negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
+ if (from < String::kMaxUtf16CodeUnit) {
+ negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
}
}
entry->AddValue(value);
// Bail out if the last interval ended at 0xFFFF since otherwise
// adding 1 will wrap around to 0.
- if (entry->to() == String::kMaxUC16CharCode)
+ if (entry->to() == String::kMaxUtf16CodeUnit)
break;
ASSERT(entry->to() + 1 > current.from());
current.set_from(entry->to() + 1);
int new_length = length + 1;
if (length > 0) {
if (ranges->at(0).from() == 0) new_length--;
- if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
+ if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
new_length--;
}
}
if (last < range.from())
AddRange(CharacterRange(last, range.from() - 1));
if (range.to() >= last) {
- if (range.to() == String::kMaxUC16CharCode) {
+ if (range.to() == String::kMaxUtf16CodeUnit) {
return;
} else {
last = range.to() + 1;
}
}
}
- AddRange(CharacterRange(last, String::kMaxUC16CharCode));
+ AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
}
utf8_pos_ += utf8_length;
return;
}
- int uc16_length = Min(str->length(), kUc16BufferSize);
- String::WriteToFlat(str, uc16_buffer_, 0, uc16_length);
+ int uc16_length = Min(str->length(), kUtf16BufferSize);
+ String::WriteToFlat(str, utf16_buffer, 0, uc16_length);
+ int previous = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < uc16_length && utf8_pos_ < kUtf8BufferSize; ++i) {
- uc16 c = uc16_buffer_[i];
+ uc16 c = utf16_buffer[i];
if (c <= String::kMaxAsciiCharCodeU) {
utf8_buffer_[utf8_pos_++] = static_cast<char>(c);
} else {
- int char_length = unibrow::Utf8::Length(c);
+ int char_length = unibrow::Utf8::Length(c, previous);
if (utf8_pos_ + char_length > kUtf8BufferSize) break;
- unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c);
+ unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c, previous);
utf8_pos_ += char_length;
}
+ previous = c;
}
}
private:
static const int kUtf8BufferSize = 512;
- static const int kUc16BufferSize = 128;
+ static const int kUtf16BufferSize = 128;
int utf8_pos_;
char utf8_buffer_[kUtf8BufferSize];
- uc16 uc16_buffer_[kUc16BufferSize];
+ uc16 utf16_buffer[kUtf16BufferSize];
};
}
-void StringHasher::AddCharacter(uc32 c) {
+void StringHasher::AddCharacter(uint32_t c) {
+ if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ AddSurrogatePair(c); // Not inlined.
+ return;
+ }
// Use the Jenkins one-at-a-time hash function to update the hash
// for the given character.
raw_running_hash_ += c;
}
-void StringHasher::AddCharacterNoIndex(uc32 c) {
+void StringHasher::AddCharacterNoIndex(uint32_t c) {
ASSERT(!is_array_index());
+ if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ AddSurrogatePairNoIndex(c); // Not inlined.
+ return;
+ }
raw_running_hash_ += c;
raw_running_hash_ += (raw_running_hash_ << 10);
raw_running_hash_ ^= (raw_running_hash_ >> 6);
buffer->Reset(offset, this);
int character_position = offset;
int utf8_bytes = 0;
+ int last = unibrow::Utf16::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
- utf8_bytes += unibrow::Utf8::Length(character);
+ utf8_bytes += unibrow::Utf8::Length(character, last);
+ last = character;
}
if (length_return) {
buffer->Seek(offset);
character_position = offset;
int utf8_byte_position = 0;
+ last = unibrow::Utf16::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
if (allow_nulls == DISALLOW_NULLS && character == 0) {
character = ' ';
}
utf8_byte_position +=
- unibrow::Utf8::Encode(result + utf8_byte_position, character);
+ unibrow::Utf8::Encode(result + utf8_byte_position, character, last);
+ last = character;
}
result[utf8_byte_position] = 0;
return SmartArrayPointer<char>(result);
}
-// This method determines the type of string involved and then gets the UTF8
-// length of the string. It doesn't flatten the string and has log(n) recursion
-// for a string of length n.
-int String::Utf8Length(String* input, int from, int to) {
- if (from == to) return 0;
- int total = 0;
- while (true) {
- if (input->IsAsciiRepresentation()) return total + to - from;
- switch (StringShape(input).representation_tag()) {
- case kConsStringTag: {
- ConsString* str = ConsString::cast(input);
- String* first = str->first();
- String* second = str->second();
- int first_length = first->length();
- if (first_length - from < to - first_length) {
- if (first_length > from) {
- // Left hand side is shorter.
- total += Utf8Length(first, from, first_length);
- input = second;
- from = 0;
- to -= first_length;
- } else {
- // We only need the right hand side.
- input = second;
- from -= first_length;
- to -= first_length;
- }
- } else {
- if (first_length <= to) {
- // Right hand side is shorter.
- total += Utf8Length(second, 0, to - first_length);
- input = first;
- to = first_length;
- } else {
- // We only need the left hand side.
- input = first;
- }
- }
- continue;
- }
- case kExternalStringTag:
- case kSeqStringTag: {
- Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
- const uc16* p = vector.start();
- for (int i = from; i < to; i++) {
- total += unibrow::Utf8::Length(p[i]);
- }
- return total;
- }
- case kSlicedStringTag: {
- SlicedString* str = SlicedString::cast(input);
- int offset = str->offset();
- input = str->parent();
- from += offset;
- to += offset;
- continue;
- }
- default:
- break;
- }
- UNREACHABLE();
- return 0;
- }
- return 0;
-}
-
-
void Relocatable::PostGarbageCollectionProcessing() {
Isolate* isolate = Isolate::Current();
Relocatable* current = isolate->relocatable_top();
// General slow case check. We know that the ia and ib iterators
// have the same length.
while (ia->has_more()) {
- uc32 ca = ia->GetNext();
- uc32 cb = ib->GetNext();
+ uint32_t ca = ia->GetNext();
+ uint32_t cb = ib->GetNext();
+ ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);
+ ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);
if (ca != cb)
return false;
}
decoder->Reset(str.start(), str.length());
int i;
for (i = 0; i < slen && decoder->has_more(); i++) {
- uc32 r = decoder->GetNext();
- if (Get(i) != r) return false;
+ uint32_t r = decoder->GetNext();
+ if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ if (i > slen - 1) return false;
+ if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
+ if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
+ } else {
+ if (Get(i) != r) return false;
+ }
}
return i == slen && !decoder->has_more();
}
}
+// Hashes c as two 16-bit values: first the result of Utf16::LeadSurrogate,
+// then the result of Utf16::TrailSurrogate, each passed to AddCharacter.
+void StringHasher::AddSurrogatePair(uc32 c) {
+  AddCharacter(
+      static_cast<uint16_t>(unibrow::Utf16::LeadSurrogate(c)));
+  AddCharacter(
+      static_cast<uint16_t>(unibrow::Utf16::TrailSurrogate(c)));
+}
+
+
+// Hashes c as two 16-bit values: first the result of Utf16::LeadSurrogate,
+// then the result of Utf16::TrailSurrogate, each passed to
+// AddCharacterNoIndex.
+void StringHasher::AddSurrogatePairNoIndex(uc32 c) {
+  AddCharacterNoIndex(
+      static_cast<uint16_t>(unibrow::Utf16::LeadSurrogate(c)));
+  AddCharacterNoIndex(
+      static_cast<uint16_t>(unibrow::Utf16::TrailSurrogate(c)));
+}
+
+
uint32_t StringHasher::GetHashField() {
ASSERT(is_valid());
if (length_ <= String::kMaxHashCalcLength) {
if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
unibrow::Utf8InputBuffer<> buffer(string_.start(),
static_cast<unsigned>(string_.length()));
- chars_ = buffer.Length();
+ chars_ = buffer.Utf16Length();
hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);
uint32_t result = hash_field_ >> String::kHashShift;
ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.
inline bool has_trivial_hash();
// Add a character to the hash and update the array index calculation.
- inline void AddCharacter(uc32 c);
+ inline void AddCharacter(uint32_t c);
// Adds a character to the hash but does not update the array index
// calculation. This can only be called when it has been verified
// that the input is not an array index.
- inline void AddCharacterNoIndex(uc32 c);
+ inline void AddCharacterNoIndex(uint32_t c);
+
+ // Add a character above 0xffff as a surrogate pair. These can get into
+ // the hasher through the routines that take a UTF-8 string and make a symbol.
+ void AddSurrogatePair(uc32 c);
+ void AddSurrogatePairNoIndex(uc32 c);
// Returns the value to store in the hash field of a string with
// the given length and contents.
RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
int* length_output = 0);
- inline int Utf8Length() { return Utf8Length(this, 0, length()); }
- static int Utf8Length(String* input, int from, int to);
-
// Return a 16 bit Unicode representation of the string.
// The string should be nearly flat, otherwise the performance of
// this method may be very bad. Setting robustness_flag to
// Max ASCII char code.
static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar;
static const unsigned kMaxAsciiCharCodeU = unibrow::Utf8::kMaxOneByteChar;
- static const int kMaxUC16CharCode = 0xffff;
+ static const int kMaxUtf16CodeUnit = 0xffff;
// Mask constant for checking if a string has a computed hash code
// and if it is an array index. The least significant bit indicates
scanner().literal_ascii_string());
} else {
return isolate()->factory()->LookupTwoByteSymbol(
- scanner().literal_uc16_string());
+ scanner().literal_utf16_string());
}
}
return LookupCachedSymbol(symbol_id);
scanner().literal_ascii_string());
} else {
result = isolate()->factory()->LookupTwoByteSymbol(
- scanner().literal_uc16_string());
+ scanner().literal_utf16_string());
}
symbol_cache_.at(symbol_id) = result;
return result;
// Notice that the stream is destroyed at the end of the branch block.
// The last line of the blocks can't be moved outside, even though they're
// identical calls.
- ExternalTwoByteStringUC16CharacterStream stream(
+ ExternalTwoByteStringUtf16CharacterStream stream(
Handle<ExternalTwoByteString>::cast(source), 0, source->length());
scanner_.Initialize(&stream);
return DoParseProgram(info, source, &zone_scope);
} else {
- GenericStringUC16CharacterStream stream(source, 0, source->length());
+ GenericStringUtf16CharacterStream stream(source, 0, source->length());
scanner_.Initialize(&stream);
return DoParseProgram(info, source, &zone_scope);
}
// Initialize parser state.
source->TryFlatten();
if (source->IsExternalTwoByteString()) {
- ExternalTwoByteStringUC16CharacterStream stream(
+ ExternalTwoByteStringUtf16CharacterStream stream(
Handle<ExternalTwoByteString>::cast(source),
shared_info->start_position(),
shared_info->end_position());
FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
return result;
} else {
- GenericStringUC16CharacterStream stream(source,
- shared_info->start_position(),
- shared_info->end_position());
+ GenericStringUtf16CharacterStream stream(source,
+ shared_info->start_position(),
+ shared_info->end_position());
FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
return result;
}
FunctionLiteral* Parser::ParseLazy(CompilationInfo* info,
- UC16CharacterStream* source,
+ Utf16CharacterStream* source,
ZoneScope* zone_scope) {
Handle<SharedFunctionInfo> shared_info = info->shared_info();
scanner_.Initialize(source);
// Logs a symbol creation of a literal or identifier.
virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
- virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+ virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
// Logs an error message and marks the log as containing an error.
// Further logging will be ignored, and ExtractData will return a vector
// Create a Scanner for the preparser to use as input, and preparse the source.
-static ScriptDataImpl* DoPreParse(UC16CharacterStream* source,
+static ScriptDataImpl* DoPreParse(Utf16CharacterStream* source,
int flags,
ParserRecorder* recorder) {
Isolate* isolate = Isolate::Current();
PartialParserRecorder recorder;
int source_length = source->length();
if (source->IsExternalTwoByteString()) {
- ExternalTwoByteStringUC16CharacterStream stream(
+ ExternalTwoByteStringUtf16CharacterStream stream(
Handle<ExternalTwoByteString>::cast(source), 0, source_length);
return DoPreParse(&stream, flags, &recorder);
} else {
- GenericStringUC16CharacterStream stream(source, 0, source_length);
+ GenericStringUtf16CharacterStream stream(source, 0, source_length);
return DoPreParse(&stream, flags, &recorder);
}
}
-ScriptDataImpl* ParserApi::PreParse(UC16CharacterStream* source,
+ScriptDataImpl* ParserApi::PreParse(Utf16CharacterStream* source,
v8::Extension* extension,
int flags) {
Handle<Script> no_script;
static bool Parse(CompilationInfo* info, int flags);
// Generic preparser generating full preparse data.
- static ScriptDataImpl* PreParse(UC16CharacterStream* source,
+ static ScriptDataImpl* PreParse(Utf16CharacterStream* source,
v8::Extension* extension,
int flags);
FunctionLiteral* ParseLazy(CompilationInfo* info,
- UC16CharacterStream* source,
+ Utf16CharacterStream* source,
ZoneScope* zone_scope);
Isolate* isolate() { return isolate_; }
scanner().literal_ascii_string(), tenured);
} else {
return isolate_->factory()->NewStringFromTwoByte(
- scanner().literal_uc16_string(), tenured);
+ scanner().literal_utf16_string(), tenured);
}
}
scanner().next_literal_ascii_string(), tenured);
} else {
return isolate_->factory()->NewStringFromTwoByte(
- scanner().next_literal_uc16_string(), tenured);
+ scanner().next_literal_utf16_string(), tenured);
}
}
// Logs a symbol creation of a literal or identifier.
virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
- virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+ virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
// Logs an error message and marks the log as containing an error.
// Further logging will be ignored, and ExtractData will return a vector
public:
PartialParserRecorder() : FunctionLoggingParserRecorder() { }
virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
- virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+ virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
virtual ~PartialParserRecorder() { }
virtual Vector<unsigned> ExtractData();
virtual int symbol_position() { return 0; }
LogSymbol(start, hash, true, Vector<const byte>::cast(literal));
}
- virtual void LogUC16Symbol(int start, Vector<const uc16> literal) {
+ virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) {
if (!is_recording_) return;
int hash = vector_hash(literal);
LogSymbol(start, hash, false, Vector<const byte>::cast(literal));
namespace internal {
// UTF16Buffer based on a v8::UnicodeInputStream.
-class InputStreamUTF16Buffer : public UC16CharacterStream {
+class InputStreamUtf16Buffer : public Utf16CharacterStream {
public:
- /* The InputStreamUTF16Buffer maintains an internal buffer
- * that is filled in chunks from the UC16CharacterStream.
+ /* The InputStreamUtf16Buffer maintains an internal buffer
+ * that is filled in chunks from the Utf16CharacterStream.
* It also maintains unlimited pushback capability, but optimized
* for small pushbacks.
* The pushback_buffer_ pointer points to the limit of pushbacks
* new buffer. When this buffer is read to the end again, the cursor is
* switched back to the internal buffer
*/
- explicit InputStreamUTF16Buffer(v8::UnicodeInputStream* stream)
- : UC16CharacterStream(),
+ explicit InputStreamUtf16Buffer(v8::UnicodeInputStream* stream)
+ : Utf16CharacterStream(),
stream_(stream),
pushback_buffer_(buffer_),
pushback_buffer_end_cache_(NULL),
buffer_cursor_ = buffer_end_ = buffer_ + kPushBackSize;
}
- virtual ~InputStreamUTF16Buffer() {
+ virtual ~InputStreamUtf16Buffer() {
if (pushback_buffer_backing_ != NULL) {
DeleteArray(pushback_buffer_backing_);
}
uc16* buffer_start = buffer_ + kPushBackSize;
buffer_cursor_ = buffer_end_ = buffer_start;
while ((value = stream_->Next()) >= 0) {
- if (value > static_cast<int32_t>(unibrow::Utf8::kMaxThreeByteChar)) {
- value = unibrow::Utf8::kBadChar;
+ if (value >
+ static_cast<int32_t>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
+ buffer_start[buffer_end_++ - buffer_start] =
+ unibrow::Utf16::LeadSurrogate(value);
+ buffer_start[buffer_end_++ - buffer_start] =
+ unibrow::Utf16::TrailSurrogate(value);
+ } else {
+ // buffer_end_ is a const pointer, but buffer_ is writable.
+ buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
}
- // buffer_end_ is a const pointer, but buffer_ is writable.
- buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
- if (buffer_end_ == buffer_ + kPushBackSize + kBufferSize) break;
+ // Stop one before the end of the buffer in case we get a surrogate pair.
+ if (buffer_end_ <= buffer_ + 1 + kPushBackSize + kBufferSize) break;
}
return buffer_end_ > buffer_start;
}
PreParserData Preparse(UnicodeInputStream* input, size_t max_stack) {
- internal::InputStreamUTF16Buffer buffer(input);
+ internal::InputStreamUtf16Buffer buffer(input);
uintptr_t stack_limit = reinterpret_cast<uintptr_t>(&buffer) - max_stack;
internal::UnicodeCache unicode_cache;
internal::Scanner scanner(&unicode_cache);
old_type = finder->AddAsciiSymbol(scanner_->literal_ascii_string(),
type);
} else {
- old_type = finder->AddUC16Symbol(scanner_->literal_uc16_string(), type);
+ old_type = finder->AddUtf16Symbol(scanner_->literal_utf16_string(), type);
}
if (HasConflict(old_type, type)) {
if (IsDataDataConflict(old_type, type)) {
duplicate_finder.AddAsciiSymbol(scanner_->literal_ascii_string(), 1);
} else {
prev_value =
- duplicate_finder.AddUC16Symbol(scanner_->literal_uc16_string(), 1);
+ duplicate_finder.AddUtf16Symbol(scanner_->literal_utf16_string(), 1);
}
if (prev_value != 0) {
if (scanner_->is_literal_ascii()) {
log_->LogAsciiSymbol(identifier_pos, scanner_->literal_ascii_string());
} else {
- log_->LogUC16Symbol(identifier_pos, scanner_->literal_uc16_string());
+ log_->LogUtf16Symbol(identifier_pos, scanner_->literal_utf16_string());
}
}
return AddSymbol(i::Vector<const byte>::cast(key), true, value);
}
-int DuplicateFinder::AddUC16Symbol(i::Vector<const uint16_t> key, int value) {
+int DuplicateFinder::AddUtf16Symbol(i::Vector<const uint16_t> key, int value) {
return AddSymbol(i::Vector<const byte>::cast(key), false, value);
}
map_(&Match) { }
int AddAsciiSymbol(i::Vector<const char> key, int value);
- int AddUC16Symbol(i::Vector<const uint16_t> key, int value);
+ int AddUtf16Symbol(i::Vector<const uint16_t> key, int value);
// Add a a number literal by converting it (if necessary)
// to the string that ToString(ToNumber(literal)) would generate.
// and then adding that string with AddAsciiSymbol.
namespace internal {
// ----------------------------------------------------------------------------
-// BufferedUC16CharacterStreams
+// BufferedUtf16CharacterStreams
-BufferedUC16CharacterStream::BufferedUC16CharacterStream()
- : UC16CharacterStream(),
+BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
+ : Utf16CharacterStream(),
pushback_limit_(NULL) {
// Initialize buffer as being empty. First read will fill the buffer.
buffer_cursor_ = buffer_;
buffer_end_ = buffer_;
}
-BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
+BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
-void BufferedUC16CharacterStream::PushBack(uc32 character) {
+void BufferedUtf16CharacterStream::PushBack(uc32 character) {
if (character == kEndOfInput) {
pos_--;
return;
}
-void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
+void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
// In pushback mode, the end of the buffer contains pushback,
// and the start of the buffer (from buffer start to pushback_limit_)
// contains valid data that comes just after the pushback.
}
-bool BufferedUC16CharacterStream::ReadBlock() {
+bool BufferedUtf16CharacterStream::ReadBlock() {
buffer_cursor_ = buffer_;
if (pushback_limit_ != NULL) {
// Leave pushback mode.
}
-unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
+unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
// Leave pushback mode (i.e., ignore that there might be valid data
// in the buffer before the pushback_limit_ point).
pushback_limit_ = NULL;
}
// ----------------------------------------------------------------------------
-// GenericStringUC16CharacterStream
+// GenericStringUtf16CharacterStream
-GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
+GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
Handle<String> data,
unsigned start_position,
unsigned end_position)
}
-GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
+GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
-unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
+unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
unsigned old_pos = pos_;
pos_ = Min(pos_ + delta, length_);
ReadBlock();
}
-unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
+unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
unsigned length) {
if (from_pos >= length_) return 0;
if (from_pos + length > length_) {
// ----------------------------------------------------------------------------
-// Utf8ToUC16CharacterStream
-Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
- unsigned length)
- : BufferedUC16CharacterStream(),
+// Utf8ToUtf16CharacterStream
+Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
+ unsigned length)
+ : BufferedUtf16CharacterStream(),
raw_data_(data),
raw_data_length_(length),
raw_data_pos_(0),
}
-Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
+Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
-unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
+unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
unsigned old_pos = pos_;
unsigned target_pos = pos_ + delta;
SetRawPosition(target_pos);
}
-unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
- unsigned length) {
- static const unibrow::uchar kMaxUC16Character = 0xffff;
+unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
+ unsigned length) {
+ static const unibrow::uchar kMaxUtf16Character = 0xffff;
SetRawPosition(char_position);
if (raw_character_position_ != char_position) {
// char_position was not a valid position in the stream (hit the end
return 0u;
}
unsigned i = 0;
- while (i < length) {
+ while (i < length - 1) {
if (raw_data_pos_ == raw_data_length_) break;
unibrow::uchar c = raw_data_[raw_data_pos_];
if (c <= unibrow::Utf8::kMaxOneByteChar) {
c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
raw_data_length_ - raw_data_pos_,
&raw_data_pos_);
- // Don't allow characters outside of the BMP.
- if (c > kMaxUC16Character) {
- c = unibrow::Utf8::kBadChar;
- }
}
- buffer_[i++] = static_cast<uc16>(c);
+ if (c > kMaxUtf16Character) {
+ buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
+ buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
+ } else {
+ buffer_[i++] = static_cast<uc16>(c);
+ }
}
raw_character_position_ = char_position + i;
return i;
}
-void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
+// Moves the raw UTF-8 cursor to a UTF-16 code unit position. It can't stop
+// between the two halves of a surrogate pair, since no position in the UTF-8
+// stream corresponds to that. This assumes the pair is correctly coded as one
+// 4 byte UTF-8 sequence. Two illegal 3 byte sequences cause no problem here.
+void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
if (raw_character_position_ > target_position) {
// Spool backwards in utf8 buffer.
do {
+ int old_pos = raw_data_pos_;  // Byte offset before stepping back.
Utf8CharacterBack(raw_data_, &raw_data_pos_);
raw_character_position_--;
+ ASSERT(old_pos - raw_data_pos_ <= 4);
+ // A 4 byte sequence stands for a surrogate pair: count two code units.
+ if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
} while (raw_character_position_ > target_position);
+ // The target must not fall inside a surrogate pair.
+ ASSERT(raw_character_position_ == target_position);
return;
}
// Spool forwards in the utf8 buffer.
while (raw_character_position_ < target_position) {
if (raw_data_pos_ == raw_data_length_) return;
+ int old_pos = raw_data_pos_;  // Byte offset before stepping forward.
Utf8CharacterForward(raw_data_, &raw_data_pos_);
raw_character_position_++;
+ ASSERT(raw_data_pos_ - old_pos <= 4);
+ if (raw_data_pos_ - old_pos == 4) raw_character_position_++;  // Pair.
}
+ // The target must not fall inside a surrogate pair.
+ ASSERT(raw_character_position_ == target_position);
}
// ----------------------------------------------------------------------------
-// ExternalTwoByteStringUC16CharacterStream
+// ExternalTwoByteStringUtf16CharacterStream
-ExternalTwoByteStringUC16CharacterStream::
- ~ExternalTwoByteStringUC16CharacterStream() { }
+ExternalTwoByteStringUtf16CharacterStream::
+ ~ExternalTwoByteStringUtf16CharacterStream() { }
-ExternalTwoByteStringUC16CharacterStream
- ::ExternalTwoByteStringUC16CharacterStream(
+ExternalTwoByteStringUtf16CharacterStream
+ ::ExternalTwoByteStringUtf16CharacterStream(
Handle<ExternalTwoByteString> data,
int start_position,
int end_position)
- : UC16CharacterStream(),
+ : Utf16CharacterStream(),
source_(data),
raw_data_(data->GetTwoByteData(start_position)) {
buffer_cursor_ = raw_data_,
// A buffered character stream based on a random access character
// source (ReadBlock can be called with pos_ pointing to any position,
// even positions before the current).
-class BufferedUC16CharacterStream: public UC16CharacterStream {
+class BufferedUtf16CharacterStream: public Utf16CharacterStream {
public:
- BufferedUC16CharacterStream();
- virtual ~BufferedUC16CharacterStream();
+ BufferedUtf16CharacterStream();
+ virtual ~BufferedUtf16CharacterStream();
virtual void PushBack(uc32 character);
// Generic string stream.
-class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
+class GenericStringUtf16CharacterStream: public BufferedUtf16CharacterStream {
public:
- GenericStringUC16CharacterStream(Handle<String> data,
- unsigned start_position,
- unsigned end_position);
- virtual ~GenericStringUC16CharacterStream();
+ GenericStringUtf16CharacterStream(Handle<String> data,
+ unsigned start_position,
+ unsigned end_position);
+ virtual ~GenericStringUtf16CharacterStream();
protected:
virtual unsigned BufferSeekForward(unsigned delta);
};
-// UC16 stream based on a literal UTF-8 string.
-class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
+// Utf16 stream based on a literal UTF-8 string.
+class Utf8ToUtf16CharacterStream: public BufferedUtf16CharacterStream {
public:
- Utf8ToUC16CharacterStream(const byte* data, unsigned length);
- virtual ~Utf8ToUC16CharacterStream();
+ Utf8ToUtf16CharacterStream(const byte* data, unsigned length);
+ virtual ~Utf8ToUtf16CharacterStream();
protected:
virtual unsigned BufferSeekForward(unsigned delta);
// UTF16 buffer to read characters from an external string.
-class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
+class ExternalTwoByteStringUtf16CharacterStream: public Utf16CharacterStream {
public:
- ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
- int start_position,
- int end_position);
- virtual ~ExternalTwoByteStringUC16CharacterStream();
+ ExternalTwoByteStringUtf16CharacterStream(Handle<ExternalTwoByteString> data,
+ int start_position,
+ int end_position);
+ virtual ~ExternalTwoByteStringUtf16CharacterStream();
virtual void PushBack(uc32 character) {
ASSERT(buffer_cursor_ > raw_data_);
harmony_modules_(false) { }
-void Scanner::Initialize(UC16CharacterStream* source) {
+void Scanner::Initialize(Utf16CharacterStream* source) {
source_ = source;
// Need to capture identifiers in order to recognize "get" and "set"
// in object literals.
// ---------------------------------------------------------------------
-// Buffered stream of characters, using an internal UC16 buffer.
+// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
+// A code unit is a 16 bit value representing either a 16 bit code point
+// or one part of a surrogate pair that make a single 21 bit code point.
-class UC16CharacterStream {
+class Utf16CharacterStream {
public:
- UC16CharacterStream() : pos_(0) { }
- virtual ~UC16CharacterStream() { }
+ Utf16CharacterStream() : pos_(0) { }
+ virtual ~Utf16CharacterStream() { }
- // Returns and advances past the next UC16 character in the input
- // stream. If there are no more characters, it returns a negative
+ // Returns and advances past the next UTF-16 code unit in the input
+ // stream. If there are no more code units, it returns a negative
// value.
inline uc32 Advance() {
if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
}
// Note: currently the following increment is necessary to avoid a
// parser problem! The scanner treats the final kEndOfInput as
- // a character with a position, and does math relative to that
+ // a code unit with a position, and does math relative to that
// position.
pos_++;
return kEndOfInput;
}
- // Return the current position in the character stream.
+ // Return the current position in the code unit stream.
// Starts at zero.
inline unsigned pos() const { return pos_; }
- // Skips forward past the next character_count UC16 characters
+ // Skips forward past the next code_unit_count UTF-16 code units
// in the input, or until the end of input if that comes sooner.
- // Returns the number of characters actually skipped. If less
- // than character_count,
- inline unsigned SeekForward(unsigned character_count) {
+ // Returns the number of code units actually skipped. If less
+ // than code_unit_count,
+ inline unsigned SeekForward(unsigned code_unit_count) {
unsigned buffered_chars =
static_cast<unsigned>(buffer_end_ - buffer_cursor_);
- if (character_count <= buffered_chars) {
- buffer_cursor_ += character_count;
- pos_ += character_count;
- return character_count;
+ if (code_unit_count <= buffered_chars) {
+ buffer_cursor_ += code_unit_count;
+ pos_ += code_unit_count;
+ return code_unit_count;
}
- return SlowSeekForward(character_count);
+ return SlowSeekForward(code_unit_count);
}
- // Pushes back the most recently read UC16 character (or negative
+ // Pushes back the most recently read UTF-16 code unit (or negative
// value if at end of input), i.e., the value returned by the most recent
// call to Advance.
// Must not be used right after calling SeekForward.
- virtual void PushBack(int32_t character) = 0;
+ virtual void PushBack(int32_t code_unit) = 0;
protected:
static const uc32 kEndOfInput = -1;
- // Ensures that the buffer_cursor_ points to the character at
+ // Ensures that the buffer_cursor_ points to the code_unit at
// position pos_ of the input, if possible. If the position
// is at or after the end of the input, return false. If there
- // are more characters available, return true.
+ // are more code_units available, return true.
virtual bool ReadBlock() = 0;
- virtual unsigned SlowSeekForward(unsigned character_count) = 0;
+ virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
const uc16* buffer_cursor_;
const uc16* buffer_end_;
}
}
- INLINE(void AddChar(uc16 character)) {
+ INLINE(void AddChar(uint32_t code_unit)) {  // Appends one code unit.
if (position_ >= backing_store_.length()) ExpandBuffer();
if (is_ascii_) {
- if (character < kMaxAsciiCharCodeU) {
- backing_store_[position_] = static_cast<byte>(character);
+ if (code_unit < kMaxAsciiCharCodeU) {  // Still fits the one-byte store.
+ backing_store_[position_] = static_cast<byte>(code_unit);
position_ += kASCIISize;
return;
}
- ConvertToUC16();
+ ConvertToUtf16();  // Leave ASCII mode before storing this code unit.
}
- *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
+ ASSERT(code_unit < 0x10000u);  // Must fit in 16 bits; stored as uc16 below.
+ *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
position_ += kUC16Size;
}
bool is_ascii() { return is_ascii_; }
- Vector<const uc16> uc16_literal() {
+ Vector<const uc16> utf16_literal() {
ASSERT(!is_ascii_);
ASSERT((position_ & 0x1) == 0);
return Vector<const uc16>(
backing_store_ = new_store;
}
- void ConvertToUC16() {
+ void ConvertToUtf16() {
ASSERT(is_ascii_);
Vector<byte> new_store;
int new_content_size = position_ * kUC16Size;
if (new_content_size >= backing_store_.length()) {
- // Ensure room for all currently read characters as UC16 as well
- // as the character about to be stored.
+ // Ensure room for all currently read code units as UC16 as well
+ // as the code unit about to be stored.
new_store = Vector<byte>::New(NewCapacity(new_content_size));
} else {
new_store = backing_store_;
explicit Scanner(UnicodeCache* scanner_contants);
- void Initialize(UC16CharacterStream* source);
+ void Initialize(Utf16CharacterStream* source);
// Returns the next token and advances input.
Token::Value Next();
ASSERT_NOT_NULL(current_.literal_chars);
return current_.literal_chars->ascii_literal();
}
- Vector<const uc16> literal_uc16_string() {
+ Vector<const uc16> literal_utf16_string() {
ASSERT_NOT_NULL(current_.literal_chars);
- return current_.literal_chars->uc16_literal();
+ return current_.literal_chars->utf16_literal();
}
bool is_literal_ascii() {
ASSERT_NOT_NULL(current_.literal_chars);
ASSERT_NOT_NULL(next_.literal_chars);
return next_.literal_chars->ascii_literal();
}
- Vector<const uc16> next_literal_uc16_string() {
+ Vector<const uc16> next_literal_utf16_string() {
ASSERT_NOT_NULL(next_.literal_chars);
- return next_.literal_chars->uc16_literal();
+ return next_.literal_chars->utf16_literal();
}
bool is_next_literal_ascii() {
ASSERT_NOT_NULL(next_.literal_chars);
TokenDesc current_; // desc for current token (as returned by Next())
TokenDesc next_; // desc for next token (one token look-ahead)
- // Input stream. Must be initialized to an UC16CharacterStream.
- UC16CharacterStream* source_;
+ // Input stream. Must be initialized to an Utf16CharacterStream.
+ Utf16CharacterStream* source_;
// Start position of the octal literal last scanned.
}
-unsigned Utf8::Encode(char* str, uchar c) {
+unsigned Utf8::Encode(char* str, uchar c, int previous) {
static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) {
str[0] = c;
str[1] = 0x80 | (c & kMask);
return 2;
} else if (c <= kMaxThreeByteChar) {
+ if (Utf16::IsTrailSurrogate(c) &&
+ Utf16::IsLeadSurrogate(previous)) {
+ const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
+ return Encode(str - kUnmatchedSize,
+ Utf16::CombineSurrogatePair(previous, c),
+ Utf16::kNoPreviousCharacter) - kUnmatchedSize;
+ }
str[0] = 0xE0 | (c >> 12);
str[1] = 0x80 | ((c >> 6) & kMask);
str[2] = 0x80 | (c & kMask);
return CalculateValue(bytes, length, cursor);
}
-unsigned Utf8::Length(uchar c) {
+unsigned Utf8::Length(uchar c, int previous) {
if (c <= kMaxOneByteChar) {
return 1;
} else if (c <= kMaxTwoByteChar) {
return 2;
} else if (c <= kMaxThreeByteChar) {
+ if (Utf16::IsTrailSurrogate(c) &&
+ Utf16::IsLeadSurrogate(previous)) {
+ return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
+ }
return 3;
} else {
return 4;
return kBadChar;
}
+
const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
unsigned offset = *offset_ptr;
return result;
}
+// Counts the UTF-16 code units needed for the characters remaining in the
+// stream: two for every character above Utf16::kMaxNonSurrogateCharCode and
+// one for every other character. The stream is consumed and then rewound.
+unsigned CharacterStream::Utf16Length() {
+  unsigned code_units = 0;
+  for (; has_more(); ) {
+    const uchar next = GetNext();
+    if (next > Utf16::kMaxNonSurrogateCharCode) {
+      code_units += 2;
+    } else {
+      code_units += 1;
+    }
+  }
+  Rewind();
+  return code_units;
+}
+
void CharacterStream::Seek(unsigned position) {
Rewind();
for (unsigned i = 0; i < position; i++) {
static const uchar kMaxCodePoint;
};
-// --- U t f 8 ---
+// --- U t f 8 a n d 16 ---
template <typename Data>
class Buffer {
unsigned length_;
};
+
+class Utf16 {  // Static helpers and constants for UTF-16 surrogates.
+ public:
+ static inline bool IsLeadSurrogate(int32_t code) {  // Low 16 bits d800-dbff.
+ if (code == kNoPreviousCharacter) return false;
+ return (code & 0xfc00) == 0xd800;
+ }
+ static inline bool IsTrailSurrogate(int32_t code) {  // Low 16 bits dc00-dfff.
+ if (code == kNoPreviousCharacter) return false;
+ return (code & 0xfc00) == 0xdc00;
+ }
+
+ static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) {
+ return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);  // 10 bits each.
+ }
+ static const int32_t kNoPreviousCharacter = -1;  // "No previous unit" marker.
+ static const uchar kMaxNonSurrogateCharCode = 0xffff;
+ // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
+ // of UTF-8 data. The special case where the unit is a surrogate
+ // trail produces 1 byte net, because the encoding of the pair is
+ // 4 bytes and the 3 bytes that were used to encode the lead surrogate
+ // can be reclaimed.
+ static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
+ // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
+ // The illegality stems from the surrogate not being part of a pair.
+ static const int kUtf8BytesToCodeASurrogate = 3;
+ static inline uchar LeadSurrogate(int32_t char_code) {  // First pair half.
+ return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
+ }
+ static inline uchar TrailSurrogate(int32_t char_code) {  // Second pair half.
+ return 0xdc00 + (char_code & 0x3ff);
+ }
+};
+
+
class Utf8 {
public:
- static inline uchar Length(uchar chr);
- static inline unsigned Encode(char* out, uchar c);
+ static inline uchar Length(uchar chr, int previous);
+ static inline unsigned Encode(
+ char* out, uchar c, int previous);
static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read, unsigned* offset);
static uchar CalculateValue(const byte* str,
static const unsigned kMaxThreeByteChar = 0xffff;
static const unsigned kMaxFourByteChar = 0x1fffff;
+ // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
+ // that match are coded as a 4 byte UTF-8 sequence.
+ static const unsigned kBytesSavedByCombiningSurrogates = 2;
+ static const unsigned kSizeOfUnmatchedSurrogate = 3;
+
private:
template <unsigned s> friend class Utf8InputBuffer;
friend class Test;
// Note that default implementation is not efficient.
virtual void Seek(unsigned);
unsigned Length();
+ unsigned Utf16Length();
virtual ~CharacterStream() { }
static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
unsigned& offset);
unsigned capacity, unsigned& offset);
static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
virtual void Rewind() = 0;
+
protected:
virtual void FillBuffer() = 0;
// The number of characters left in the current buffer
uc16 minus,
uc16 mask,
Label* on_not_equal) {
- ASSERT(minus < String::kMaxUC16CharCode);
+ ASSERT(minus < String::kMaxUtf16CodeUnit);
__ lea(rax, Operand(current_character(), -minus));
__ and_(rax, Immediate(mask));
__ cmpl(rax, Immediate(c));
}
+// Test helper: returns the UTF-8 length reported by |str|. If the first
+// query reports a negative length, the string is flattened and asked again.
+int GetUtf8Length(Handle<String> str) {
+  const int first_try = str->Utf8Length();
+  if (first_try >= 0) return first_try;
+  i::Handle<i::String> internal(v8::Utils::OpenHandle(*str));
+  i::FlattenString(internal);
+  return str->Utf8Length();
+}
+
+
THREADED_TEST(StringWrite) {
LocalContext context;
v8::HandleScope scope;
CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));
memset(utf8buf, 0x1, sizeof(utf8buf));
- len = left_tree->Utf8Length();
+ len = GetUtf8Length(left_tree);
int utf8_expected =
(0x80 + (0x800 - 0x80) * 2 + (0xd800 - 0x800) * 3) / kStride;
CHECK_EQ(utf8_expected, len);
CHECK_EQ(1, utf8buf[utf8_expected]);
memset(utf8buf, 0x1, sizeof(utf8buf));
- len = right_tree->Utf8Length();
+ len = GetUtf8Length(right_tree);
CHECK_EQ(utf8_expected, len);
len = right_tree->WriteUtf8(utf8buf, utf8_expected, &charlen);
CHECK_EQ(utf8_expected, len);
}
+// Checks that each of the first |len| strings in the global array |name| has
+// the UTF-8 length stored at the same index of the global array
+// |lengths_name|.
+static void Utf16Helper(
+    LocalContext& context,
+    const char* name,
+    const char* lengths_name,
+    int len) {
+  Local<v8::Array> strings =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
+  Local<v8::Array> expected_lengths =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
+  int index = 0;
+  while (index < len) {
+    Local<v8::String> current = Local<v8::String>::Cast(strings->Get(index));
+    Local<v8::Number> expected_len =
+        Local<v8::Number>::Cast(expected_lengths->Get(index));
+    int length = GetUtf8Length(current);
+    CHECK_EQ(static_cast<int>(expected_len->Value()), length);
+    index++;
+  }
+}
+
+
+// Returns the 16-bit code unit at |index| of |str|, read through the
+// internal string handle.
+static uint16_t StringGet(Handle<String> str, int index) {
+  return v8::Utils::OpenHandle(String::Cast(*str))->Get(index);
+}
+
+
+// For each of the first |len| strings in the global array |name|, writes the
+// string as UTF-8 into buffers of every capacity from the expected length
+// plus one down to zero, once with default options and once with
+// NO_NULL_TERMINATION, and checks the results. |lengths_name| names the
+// global array holding the expected UTF-8 length of each string.
+static void WriteUtf8Helper(
+    LocalContext& context,
+    const char* name,
+    const char* lengths_name,
+    int len) {
+  Local<v8::Array> b =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
+  Local<v8::Array> alens =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
+  char buffer[1000];
+  char buffer2[1000];
+  for (int i = 0; i < len; i++) {
+    Local<v8::String> string =
+        Local<v8::String>::Cast(b->Get(i));
+    Local<v8::Number> expected_len =
+        Local<v8::Number>::Cast(alens->Get(i));
+    int utf8_length = static_cast<int>(expected_len->Value());
+    for (int j = utf8_length + 1; j >= 0; j--) {
+      // Fill both buffers with a marker byte (42) so that untouched bytes
+      // can be told apart from written ones.
+      memset(reinterpret_cast<void*>(&buffer), 42, sizeof(buffer));
+      memset(reinterpret_cast<void*>(&buffer2), 42, sizeof(buffer2));
+      int nchars;
+      int utf8_written =
+          string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
+      int utf8_written2 =
+          string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
+      CHECK_GE(utf8_length + 1, utf8_written);
+      CHECK_GE(utf8_length, utf8_written2);
+      // Both variants must agree on the encoded bytes.
+      for (int k = 0; k < utf8_written2; k++) {
+        CHECK_EQ(buffer[k], buffer2[k]);
+      }
+      CHECK(nchars * 3 >= utf8_written - 1);
+      CHECK(nchars <= utf8_written);
+      if (j == utf8_length + 1) {
+        // Full capacity: everything fits, plus the terminator in |buffer|.
+        CHECK_EQ(utf8_written2, utf8_length);
+        CHECK_EQ(utf8_written2 + 1, utf8_written);
+      }
+      // Nothing may be written past the reported length.
+      CHECK_EQ(buffer[utf8_written], 42);
+      if (j > utf8_length) {
+        if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
+        if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
+        Handle<String> roundtrip = v8_str(buffer);
+        CHECK(roundtrip->Equals(string));
+      } else {
+        if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
+      }
+      // The last byte of the unterminated output must be real data. This
+      // previously inspected |buffer| / |utf8_written|, which left the tail
+      // of |buffer2| unverified.
+      if (utf8_written2 != 0) CHECK_NE(buffer2[utf8_written2 - 1], 42);
+      if (nchars >= 2) {
+        uint16_t trail = StringGet(string, nchars - 1);
+        uint16_t lead = StringGet(string, nchars - 2);
+        if (((lead & 0xfc00) == 0xd800) &&
+            ((trail & 0xfc00) == 0xdc00)) {
+          // The output ends in a matched surrogate pair, so its last four
+          // bytes must be the 4 byte UTF-8 encoding of the combined value.
+          unsigned char u1 = buffer2[utf8_written2 - 4];
+          unsigned char u2 = buffer2[utf8_written2 - 3];
+          unsigned char u3 = buffer2[utf8_written2 - 2];
+          unsigned char u4 = buffer2[utf8_written2 - 1];
+          CHECK_EQ((u1 & 0xf8), 0xf0);
+          CHECK_EQ((u2 & 0xc0), 0x80);
+          CHECK_EQ((u3 & 0xc0), 0x80);
+          CHECK_EQ((u4 & 0xc0), 0x80);
+          uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
+          CHECK_EQ((u4 & 0x3f), (c & 0x3f));
+          CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
+          CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
+          CHECK_EQ((u1 & 0x3), c >> 18);
+        }
+      }
+    }
+  }
+}
+
+
+THREADED_TEST(Utf16) {  // UTF-8 lengths and writes of surrogate strings.
+ LocalContext context;
+ v8::HandleScope scope;
+ CompileRun(  // Builds the a/b (9) and a2/b2 (81) string arrays in script.
+ "var pad = '01234567890123456789';"
+ "var p = [];"
+ "var plens = [20, 3, 3];"  // UTF-8 byte lengths of the three pieces in p.
+ "p.push('01234567890123456789');"
+ "var lead = 0xd800;"
+ "var trail = 0xdc00;"
+ "p.push(String.fromCharCode(0xd800));"
+ "p.push(String.fromCharCode(0xdc00));"
+ "var a = [];"
+ "var b = [];"
+ "var alens = [];"
+ "for (var i = 0; i < 3; i++) {"
+ " p[1] = String.fromCharCode(lead++);"
+ " for (var j = 0; j < 3; j++) {"
+ " p[2] = String.fromCharCode(trail++);"
+ " a.push(p[i] + p[j]);"
+ " b.push(p[i] + p[j]);"
+ " alens.push(plens[i] + plens[j]);"
+ " }"
+ "}"
+ "alens[5] -= 2;" // a[5] is lead + trail: a matched pair is 4 bytes, not 6.
+ "var a2 = [];"
+ "var b2 = [];"
+ "var a2lens = [];"
+ "for (var m = 0; m < 9; m++) {"
+ " for (var n = 0; n < 9; n++) {"
+ " a2.push(a[m] + a[n]);"
+ " b2.push(b[m] + b[n]);"
+ " var utf = alens[m] + alens[n];" // Joining can form a new pair; see below.
+ // The 'n's that start with 0xdc.. are 6-8
+ // The 'm's that end with 0xd8.. are 1, 4 and 7
+ " if ((m % 3) == 1 && n >= 6) utf -= 2;"
+ " a2lens.push(utf);"
+ " }"
+ "}");
+ Utf16Helper(context, "a", "alens", 9);  // Length checks.
+ Utf16Helper(context, "a2", "a2lens", 81);
+ WriteUtf8Helper(context, "b", "alens", 9);  // Write checks.
+ WriteUtf8Helper(context, "b2", "a2lens", 81);
+}
+
+
+// Returns true when both API handles refer to the very same internal string
+// object (pointer identity, not content comparison).
+static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
+  i::Handle<i::String> first = v8::Utils::OpenHandle(*s1);
+  i::Handle<i::String> second = v8::Utils::OpenHandle(*s2);
+  const bool same_object = (*first == *second);
+  return same_object;
+}
+
+
+// Creates a symbol from each of the two UTF-8 byte strings and checks that
+// both calls produced the same symbol object.
+static void SameSymbolHelper(const char* a, const char* b) {
+  Handle<String> symbol1(v8::String::NewSymbol(a));
+  Handle<String> symbol2(v8::String::NewSymbol(b));
+  CHECK(SameSymbol(symbol1, symbol2));
+}
+
+
+// Checks that symbols created from UTF-8 input are shared regardless of
+// whether a supplementary character arrives as one 4 byte sequence or as two
+// 3 byte surrogate sequences, both through the API and through script source.
+THREADED_TEST(Utf16Symbol) {
+  LocalContext context;
+  v8::HandleScope scope;
+
+  Handle<String> symbol1 = v8::String::NewSymbol("abc");
+  Handle<String> symbol2 = v8::String::NewSymbol("abc");
+  CHECK(SameSymbol(symbol1, symbol2));
+
+  SameSymbolHelper("\360\220\220\205",  // 4 byte encoding.
+                   "\355\240\201\355\260\205");  // 2 3-byte surrogates.
+  SameSymbolHelper("\355\240\201\355\260\206",  // 2 3-byte surrogates.
+                   "\360\220\220\206");  // 4 byte encoding.
+  SameSymbolHelper("x\360\220\220\205",  // 4 byte encoding.
+                   "x\355\240\201\355\260\205");  // 2 3-byte surrogates.
+  SameSymbolHelper("x\355\240\201\355\260\206",  // 2 3-byte surrogates.
+                   "x\360\220\220\206");  // 4 byte encoding.
+  // Each failing check throws the value it actually tested, so that the
+  // diagnostic matches the condition.
+  CompileRun(
+      "var sym0 = 'benedictus';"
+      "var sym0b = 'S\303\270ren';"
+      "var sym1 = '\355\240\201\355\260\207';"
+      "var sym2 = '\360\220\220\210';"
+      "var sym3 = 'x\355\240\201\355\260\207';"
+      "var sym4 = 'x\360\220\220\210';"
+      "if (sym1.length != 2) throw sym1;"
+      "if (sym1.charCodeAt(1) != 0xdc07) throw sym1.charCodeAt(1);"
+      "if (sym2.length != 2) throw sym2;"
+      "if (sym2.charCodeAt(1) != 0xdc08) throw sym2.charCodeAt(1);"
+      "if (sym3.length != 3) throw sym3;"
+      "if (sym3.charCodeAt(2) != 0xdc07) throw sym3.charCodeAt(2);"
+      "if (sym4.length != 3) throw sym4;"
+      "if (sym4.charCodeAt(2) != 0xdc08) throw sym4.charCodeAt(2);");
+  Handle<String> sym0 = v8::String::NewSymbol("benedictus");
+  Handle<String> sym0b = v8::String::NewSymbol("S\303\270ren");
+  Handle<String> sym1 = v8::String::NewSymbol("\355\240\201\355\260\207");
+  Handle<String> sym2 = v8::String::NewSymbol("\360\220\220\210");
+  Handle<String> sym3 = v8::String::NewSymbol("x\355\240\201\355\260\207");
+  Handle<String> sym4 = v8::String::NewSymbol("x\360\220\220\210");
+  // The script's string literals must be the same symbol objects as the
+  // ones created through the API above.
+  v8::Local<v8::Object> global = context->Global();
+  Local<Value> s0 = global->Get(v8_str("sym0"));
+  Local<Value> s0b = global->Get(v8_str("sym0b"));
+  Local<Value> s1 = global->Get(v8_str("sym1"));
+  Local<Value> s2 = global->Get(v8_str("sym2"));
+  Local<Value> s3 = global->Get(v8_str("sym3"));
+  Local<Value> s4 = global->Get(v8_str("sym4"));
+  CHECK(SameSymbol(sym0, Handle<String>(String::Cast(*s0))));
+  CHECK(SameSymbol(sym0b, Handle<String>(String::Cast(*s0b))));
+  CHECK(SameSymbol(sym1, Handle<String>(String::Cast(*s1))));
+  CHECK(SameSymbol(sym2, Handle<String>(String::Cast(*s2))));
+  CHECK(SameSymbol(sym3, Handle<String>(String::Cast(*s3))));
+  CHECK(SameSymbol(sym4, Handle<String>(String::Cast(*s4))));
+}
+
+
THREADED_TEST(ToArrayIndex) {
v8::HandleScope scope;
LocalContext context;
int length = i::StrLength(key_token.keyword);
CHECK(static_cast<int>(sizeof(buffer)) >= length);
{
- i::Utf8ToUC16CharacterStream stream(keyword, length);
+ i::Utf8ToUtf16CharacterStream stream(keyword, length);
i::Scanner scanner(&unicode_cache);
// The scanner should parse Harmony keywords for this test.
scanner.SetHarmonyScoping(true);
}
// Removing characters will make keyword matching fail.
{
- i::Utf8ToUC16CharacterStream stream(keyword, length - 1);
+ i::Utf8ToUtf16CharacterStream stream(keyword, length - 1);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
for (int j = 0; j < static_cast<int>(ARRAY_SIZE(chars_to_append)); ++j) {
memmove(buffer, keyword, length);
buffer[length] = chars_to_append[j];
- i::Utf8ToUC16CharacterStream stream(buffer, length + 1);
+ i::Utf8ToUtf16CharacterStream stream(buffer, length + 1);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
{
memmove(buffer, keyword, length);
buffer[length - 1] = '_';
- i::Utf8ToUC16CharacterStream stream(buffer, length);
+ i::Utf8ToUtf16CharacterStream stream(buffer, length);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
for (int i = 0; programs[i]; i++) {
const char* program = programs[i];
- i::Utf8ToUC16CharacterStream stream(
+ i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::CompleteParserRecorder log;
uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
for (int i = 0; programs[i]; i++) {
const char* program = programs[i];
- i::Utf8ToUC16CharacterStream stream(
+ i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::CompleteParserRecorder log;
// and then used the invalid currently scanned literal. This always
// failed in debug mode, and sometimes crashed in release mode.
- i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(program),
- static_cast<unsigned>(strlen(program)));
+ i::Utf8ToUtf16CharacterStream stream(
+ reinterpret_cast<const i::byte*>(program),
+ static_cast<unsigned>(strlen(program)));
i::ScriptDataImpl* data =
i::ParserApi::PreParse(&stream, NULL, false);
CHECK(data->HasError());
uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
- i::Utf8ToUC16CharacterStream stream(
+ i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(*program),
static_cast<unsigned>(kProgramSize));
i::CompleteParserRecorder log;
i::Handle<i::String> uc16_string(
FACTORY->NewExternalStringFromTwoByte(&resource));
- i::ExternalTwoByteStringUC16CharacterStream uc16_stream(
+ i::ExternalTwoByteStringUtf16CharacterStream uc16_stream(
i::Handle<i::ExternalTwoByteString>::cast(uc16_string), start, end);
- i::GenericStringUC16CharacterStream string_stream(ascii_string, start, end);
- i::Utf8ToUC16CharacterStream utf8_stream(
+ i::GenericStringUtf16CharacterStream string_stream(ascii_string, start, end);
+ i::Utf8ToUtf16CharacterStream utf8_stream(
reinterpret_cast<const i::byte*>(ascii_source), end);
utf8_stream.SeekForward(start);
char buffer[kAllUtf8CharsSizeU];
unsigned cursor = 0;
for (int i = 0; i <= kMaxUC16Char; i++) {
- cursor += unibrow::Utf8::Encode(buffer + cursor, i);
+ cursor += unibrow::Utf8::Encode(buffer + cursor,
+ i,
+ unibrow::Utf16::kNoPreviousCharacter);
}
ASSERT(cursor == kAllUtf8CharsSizeU);
- i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
- kAllUtf8CharsSizeU);
+ i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
+ kAllUtf8CharsSizeU);
for (int i = 0; i <= kMaxUC16Char; i++) {
CHECK_EQU(i, stream.pos());
int32_t c = stream.Advance();
#undef CHECK_EQU
-void TestStreamScanner(i::UC16CharacterStream* stream,
+void TestStreamScanner(i::Utf16CharacterStream* stream,
i::Token::Value* expected_tokens,
int skip_pos = 0, // Zero means not skipping.
int skip_to = 0) {
v8::V8::Initialize();
const char* str1 = "{ foo get for : */ <- \n\n /*foo*/ bib";
- i::Utf8ToUC16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
- static_cast<unsigned>(strlen(str1)));
+ i::Utf8ToUtf16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
+ static_cast<unsigned>(strlen(str1)));
i::Token::Value expectations1[] = {
i::Token::LBRACE,
i::Token::IDENTIFIER,
TestStreamScanner(&stream1, expectations1, 0, 0);
const char* str2 = "case default const {THIS\nPART\nSKIPPED} do";
- i::Utf8ToUC16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
- static_cast<unsigned>(strlen(str2)));
+ i::Utf8ToUtf16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
+ static_cast<unsigned>(strlen(str2)));
i::Token::Value expectations2[] = {
i::Token::CASE,
i::Token::DEFAULT,
for (int i = 0; i <= 4; i++) {
expectations3[6 - i] = i::Token::ILLEGAL;
expectations3[5 - i] = i::Token::EOS;
- i::Utf8ToUC16CharacterStream stream3(
+ i::Utf8ToUtf16CharacterStream stream3(
reinterpret_cast<const i::byte*>(str3),
static_cast<unsigned>(strlen(str3)));
TestStreamScanner(&stream3, expectations3, 1, 1 + i);
void TestScanRegExp(const char* re_source, const char* expected) {
- i::Utf8ToUC16CharacterStream stream(
+ i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(re_source),
static_cast<unsigned>(strlen(re_source)));
i::Scanner scanner(i::Isolate::Current()->unicode_cache());
}
+// Returns the number of UTF-16 code units that the NUL-terminated UTF-8
+// string |s| is expected to occupy once decoded.  The count starts at the
+// byte length and is reduced for every well-formed multi-byte sequence:
+//   - 2 bytes of UTF-8 -> 1 UTF-16 code unit  (adjust by 1)
+//   - 3 bytes of UTF-8 -> 1 UTF-16 code unit  (adjust by 2)
+//   - 4 bytes of UTF-8 -> 2 UTF-16 code units (adjust by 2)
+// Bytes that do not form a well-formed sequence (stray continuation bytes,
+// overlong encodings, 5/6 byte lead bytes, truncated sequences) are counted
+// as one code unit (kBadChar) each, so they need no adjustment.
+static int Utf8LengthHelper(const char* s) {
+  int len = static_cast<int>(strlen(s));
+  int character_length = len;
+  for (int i = 0; i < len; i++) {
+    unsigned char c = s[i];
+    // Number of continuation bytes expected after the lead byte.
+    int input_offset = 0;
+    // How much shorter the UTF-16 form is than the UTF-8 form, applied only
+    // once the whole sequence has been validated.
+    int output_adjust = 0;
+    if (c > 0x7f) {
+      // A continuation byte without a lead byte counts as one kBadChar.
+      if (c < 0xc0) continue;
+      if (c >= 0xf0) {
+        if (c >= 0xf8) {
+          // 5 and 6 byte UTF-8 sequences turn into a kBadChar for each UTF-8
+          // byte.
+          continue;  // Handle first UTF-8 byte.
+        }
+        // Reading s[i + 1] is safe: at worst it is the terminating NUL.
+        if ((c & 7) == 0 && ((s[i + 1] & 0x30) == 0)) {
+          // This 4 byte sequence could have been coded as a 3 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 3;
+        // 4 bytes of UTF-8 turn into 2 UTF-16 code units.  The adjustment is
+        // deferred until the continuation bytes have been checked, exactly
+        // as for the 2 and 3 byte cases, so that a truncated 4 byte sequence
+        // does not shrink the count.
+        output_adjust = 2;
+      } else if (c >= 0xe0) {
+        if ((c & 0xf) == 0 && ((s[i + 1] & 0x20) == 0)) {
+          // This 3 byte sequence could have been coded as a 2 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 2;
+        // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 2;
+      } else {
+        if ((c & 0x1e) == 0) {
+          // This 2 byte sequence could have been coded as a 1 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 1;
+        // 2 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 1;
+      }
+      bool bad = false;
+      // The scan stops at the first non-continuation byte, which includes
+      // the terminating NUL, so it never reads past the end of the string.
+      for (int j = 1; j <= input_offset; j++) {
+        if ((s[i + j] & 0xc0) != 0x80) {
+          // Bad UTF-8 sequence turns the first in the sequence into kBadChar,
+          // which is a single UTF-16 code unit.
+          bad = true;
+          break;
+        }
+      }
+      if (!bad) {
+        // Skip the continuation bytes and account for the shorter encoding.
+        i += input_offset;
+        character_length -= output_adjust;
+      }
+    }
+  }
+  return character_length;
+}
+
+
TEST(ScopePositions) {
// Test the parser for correctly setting the start and end positions
// of a scope. We check the scope positions of exactly one scope
{ " for ", "(let x in {})\n"
" statement;", "\n"
" more;", i::BLOCK_SCOPE, i::EXTENDED_MODE },
+ // Check that 6-byte and 4-byte encodings of UTF-8 strings do not throw
+ // the preparser off in terms of byte offsets.
+ // 6 byte encoding.
+ { " 'foo\355\240\201\355\260\211';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // 4 byte encoding.
+ { " 'foo\360\220\220\212';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // 3 byte encoding of \u0fff.
+ { " 'foo\340\277\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Broken 6 byte encoding with missing last byte.
+ { " 'foo\355\240\201\355\211';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Broken 3 byte encoding of \u0fff with missing last byte.
+ { " 'foo\340\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Broken 3 byte encoding of \u0fff with missing 2 last bytes.
+ { " 'foo\340';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Broken 3 byte encoding of \u00ff should be a 2 byte encoding.
+ { " 'foo\340\203\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Broken 3 byte encoding of \u007f should be a 2 byte encoding.
+ { " 'foo\340\201\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Unpaired lead surrogate.
+ { " 'foo\355\240\201';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Unpaired lead surrogate where following code point is a 3 byte sequence.
+ { " 'foo\355\240\201\340\277\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Unpaired lead surrogate where following code point is a 4 byte encoding
+ // of a trail surrogate.
+ { " 'foo\355\240\201\360\215\260\211';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Unpaired trail surrogate.
+ { " 'foo\355\260\211';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // 2 byte encoding of \u00ff.
+ { " 'foo\303\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Broken 2 byte encoding of \u00ff with missing last byte.
+ { " 'foo\303';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Broken 2 byte encoding of \u007f should be a 1 byte encoding.
+ { " 'foo\301\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Illegal 5 byte encoding.
+ { " 'foo\370\277\277\277\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Illegal 6 byte encoding.
+ { " 'foo\374\277\277\277\277\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Illegal 0xfe byte
+ { " 'foo\376\277\277\277\277\277\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ // Illegal 0xff byte
+ { " 'foo\377\277\277\277\277\277\277\277';\n"
+ " (function fun", "(a,b) { infunction; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ { " 'foo';\n"
+ " (function fun", "(a,b) { 'bar\355\240\201\355\260\213'; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+ { " 'foo';\n"
+ " (function fun", "(a,b) { 'bar\360\220\220\214'; }", ")();",
+ i::FUNCTION_SCOPE, i::CLASSIC_MODE },
{ NULL, NULL, NULL, i::EVAL_SCOPE, i::CLASSIC_MODE }
};
i::FLAG_harmony_scoping = true;
for (int i = 0; source_data[i].outer_prefix; i++) {
- int kPrefixLen = i::StrLength(source_data[i].outer_prefix);
- int kInnerLen = i::StrLength(source_data[i].inner_source);
- int kSuffixLen = i::StrLength(source_data[i].outer_suffix);
+ int kPrefixLen = Utf8LengthHelper(source_data[i].outer_prefix);
+ int kInnerLen = Utf8LengthHelper(source_data[i].inner_source);
+ int kSuffixLen = Utf8LengthHelper(source_data[i].outer_suffix);
+ int kPrefixByteLen = i::StrLength(source_data[i].outer_prefix);
+ int kInnerByteLen = i::StrLength(source_data[i].inner_source);
+ int kSuffixByteLen = i::StrLength(source_data[i].outer_suffix);
int kProgramSize = kPrefixLen + kInnerLen + kSuffixLen;
- i::Vector<char> program = i::Vector<char>::New(kProgramSize + 1);
- int length = i::OS::SNPrintF(program, "%s%s%s",
- source_data[i].outer_prefix,
- source_data[i].inner_source,
- source_data[i].outer_suffix);
- CHECK(length == kProgramSize);
+ int kProgramByteSize = kPrefixByteLen + kInnerByteLen + kSuffixByteLen;
+ i::Vector<char> program = i::Vector<char>::New(kProgramByteSize + 1);
+ i::OS::SNPrintF(program, "%s%s%s",
+ source_data[i].outer_prefix,
+ source_data[i].inner_source,
+ source_data[i].outer_suffix);
// Parse program source.
i::Handle<i::String> source(
- FACTORY->NewStringFromAscii(i::CStrVector(program.start())));
+ FACTORY->NewStringFromUtf8(i::CStrVector(program.start())));
+ CHECK_EQ(source->length(), kProgramSize);
i::Handle<i::Script> script = FACTORY->NewScript(source);
i::Parser parser(script, i::kAllowLazy | i::EXTENDED_MODE, NULL, NULL);
i::CompilationInfo info(script);
// Preparse the data.
i::CompleteParserRecorder log;
i::Scanner scanner(i::Isolate::Current()->unicode_cache());
- i::GenericStringUC16CharacterStream stream(source, 0, source->length());
+ i::GenericStringUtf16CharacterStream stream(source, 0, source->length());
scanner.SetHarmonyScoping(harmony_scoping);
scanner.Initialize(&stream);
v8::preparser::PreParser::PreParseResult result =