From e2444edd8fa7f3bcef961be7836242599ec92c73 Mon Sep 17 00:00:00 2001 From: "yangguo@chromium.org" Date: Thu, 3 Jan 2013 12:59:54 +0000 Subject: [PATCH] Refactor out assumption that one byte strings are ascii in utf8 processing. R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11725006 Patch from Dan Carney . git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13302 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/api.cc | 431 ++++++++++++++++++++++++++++++--------------------------- src/handles.cc | 159 --------------------- src/handles.h | 2 - 3 files changed, 226 insertions(+), 366 deletions(-) diff --git a/src/api.cc b/src/api.cc index cf3c083..2290110 100644 --- a/src/api.cc +++ b/src/api.cc @@ -3873,109 +3873,229 @@ int String::Length() const { return str->length(); } +bool String::MayContainNonAscii() const { + i::Handle str = Utils::OpenHandle(this); + if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) { + return false; + } + return !str->HasOnlyAsciiChars(); +} + + +class Utf8LengthVisitor { + public: + explicit Utf8LengthVisitor() + : utf8_length_(0), + last_character_(unibrow::Utf16::kNoPreviousCharacter) {} + + inline int GetLength() { + return utf8_length_; + } + + template + inline void Visit(const Char* chars, unsigned length) { + ASSERT(length > 0); + // TODO(dcarney) Add back ascii fast path. + int utf8_length = 0; + int last_character = last_character_; + for (unsigned i = 0; i < length; i++) { + uint16_t c = chars[i]; + utf8_length += unibrow::Utf8::Length(c, last_character); + last_character = c; + } + last_character_ = last_character; + utf8_length_ += utf8_length; + } + + inline void VisitOneByteString(const uint8_t* chars, unsigned length) { + Visit(chars, length); + } + + inline void VisitTwoByteString(const uint16_t* chars, unsigned length) { + Visit(chars, length); + } + + private: + int utf8_length_; + int last_character_; + DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor); +}; + + +static int Utf8Length(i::String* str, i::Isolate* isolate) { + unsigned length = static_cast(str->length()); + if (length == 0) return 0; + int32_t type = str->map()->instance_type(); + Utf8LengthVisitor visitor; + // Non ConsString branch. + if ((type & i::kStringRepresentationMask) != i::kConsStringTag) { + i::ConsStringNullOp null_op; + i::String::Visit(str, 0, visitor, null_op, type, length); + return visitor.GetLength(); + } + i::ConsStringIteratorOp* op = isolate->write_iterator(); + unsigned offset = 0; + i::String* leaf = op->Operate(str, &offset, &type, &length); + ASSERT(leaf != NULL); + while (leaf != NULL) { + i::ConsStringNullOp null_op; + ASSERT(offset == 0); + i::String::Visit(leaf, 0, visitor, null_op, type, length); + leaf = op->ContinueOperation(&type, &length); + } + return visitor.GetLength(); +} + int String::Utf8Length() const { i::Handle str = Utils::OpenHandle(this); - if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0; - return i::Utf8Length(str); -} - - -// Will fail with a negative answer if the recursion depth is too high. -static int RecursivelySerializeToUtf8(i::String* string, - char* buffer, - int start, - int end, - int recursion_budget, - int32_t previous_character, - int32_t* last_character) { - int utf8_bytes = 0; - while (true) { - if (string->IsOneByteRepresentation()) { - i::String::WriteToFlat(string, buffer, start, end); - *last_character = unibrow::Utf16::kNoPreviousCharacter; - return utf8_bytes + end - start; + i::Isolate* isolate = str->GetIsolate(); + if (IsDeadCheck(isolate, "v8::String::Utf8Length()")) return 0; + return v8::Utf8Length(*str, isolate); +} + + +class Utf8WriterVisitor { + public: + Utf8WriterVisitor(char* buffer, int capacity) + : early_termination_(false), + last_character_(unibrow::Utf16::kNoPreviousCharacter), + buffer_(buffer), + start_(buffer), + capacity_(capacity), + utf16_chars_read_(0) { + } + + static int WriteEndCharacter(uint16_t character, + int last_character, + int remaining, + char* const buffer) { + using namespace unibrow; + ASSERT(remaining > 0); + // We can't use a local buffer here because Encode needs to modify + // previous characters in the stream. We know, however, that + // exactly one character will be advanced. + if (Utf16::IsTrailSurrogate(character) && + Utf16::IsLeadSurrogate(last_character)) { + int written = Utf8::Encode(buffer, character, last_character); + ASSERT(written == 1); + return written; } - switch (i::StringShape(string).representation_tag()) { - case i::kExternalStringTag: { - const uint16_t* data = i::ExternalTwoByteString::cast(string)-> - ExternalTwoByteStringGetData(0); - char* current = buffer; - for (int i = start; i < end; i++) { - uint16_t character = data[i]; - current += - unibrow::Utf8::Encode(current, character, previous_character); - previous_character = character; - } - *last_character = previous_character; - return static_cast(utf8_bytes + current - buffer); + // Use a scratch buffer to check the required characters. + char temp_buffer[Utf8::kMaxEncodedSize]; + // Can't encode using last_character as gcc has array bounds issues. + int written = Utf8::Encode(temp_buffer, + character, + unibrow::Utf16::kNoPreviousCharacter); + // Won't fit. + if (written > remaining) return 0; + // Copy over the character from temp_buffer. + for (int j = 0; j < written; j++) { + buffer[j] = temp_buffer[j]; + } + return written; + } + + template + void Visit(const Char* chars, const int length) { + using namespace unibrow; + // TODO(dcarney): Add back ascii fast path. + ASSERT(!early_termination_); + ASSERT(length > 0); + // Copy state to stack. + char* buffer = buffer_; + int last_character = last_character_; + int i = 0; + // Do a fast loop where there is no exit capacity check. + while (true) { + int fast_length; + if (capacity_ == -1) { + fast_length = length; + } else { + int remaining_capacity = capacity_ - (buffer - start_); + // Need enough space to write everything but one character. + STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3); + int writable_length = (remaining_capacity - 3)/3; + // Need to drop into slow loop. + if (writable_length <= 0) break; + fast_length = i + writable_length; + if (fast_length > length) fast_length = length; } - case i::kSeqStringTag: { - const uint16_t* data = - i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0); - char* current = buffer; - for (int i = start; i < end; i++) { - uint16_t character = data[i]; - current += - unibrow::Utf8::Encode(current, character, previous_character); - previous_character = character; - } - *last_character = previous_character; - return static_cast(utf8_bytes + current - buffer); + // Write the characters to the stream. + for (; i < fast_length; i++) { + uint16_t character = *chars++; + buffer += Utf8::Encode(buffer, character, last_character); + last_character = character; + ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); } - case i::kSlicedStringTag: { - i::SlicedString* slice = i::SlicedString::cast(string); - unsigned offset = slice->offset(); - string = slice->parent(); - start += offset; - end += offset; - continue; + // Array is fully written. Exit. + if (fast_length == length) { + // Write state back out to object. + last_character_ = last_character; + buffer_ = buffer; + utf16_chars_read_ += i; + return; } - case i::kConsStringTag: { - i::ConsString* cons_string = i::ConsString::cast(string); - i::String* first = cons_string->first(); - int boundary = first->length(); - if (start >= boundary) { - // Only need RHS. - string = cons_string->second(); - start -= boundary; - end -= boundary; - continue; - } else if (end <= boundary) { - // Only need LHS. - string = first; - } else { - if (recursion_budget == 0) return -1; - int extra_utf8_bytes = - RecursivelySerializeToUtf8(first, - buffer, - start, - boundary, - recursion_budget - 1, - previous_character, - &previous_character); - if (extra_utf8_bytes < 0) return extra_utf8_bytes; - buffer += extra_utf8_bytes; - utf8_bytes += extra_utf8_bytes; - string = cons_string->second(); - start = 0; - end -= boundary; - } + } + ASSERT(capacity_ != -1); + // Slow loop. Must check capacity on each iteration. + int remaining_capacity = capacity_ - (buffer - start_); + ASSERT(remaining_capacity >= 0); + for (; i < length && remaining_capacity > 0; i++) { + uint16_t character = *chars++; + int written = WriteEndCharacter(character, + last_character, + remaining_capacity, + buffer); + if (written == 0) { + early_termination_ = true; + break; } + buffer += written; + remaining_capacity -= written; + last_character = character; } + // Write state back out to object. + last_character_ = last_character; + buffer_ = buffer; + utf16_chars_read_ += i; } - UNREACHABLE(); - return 0; -} + inline bool IsDone() { + return early_termination_; + } -bool String::MayContainNonAscii() const { - i::Handle str = Utils::OpenHandle(this); - if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) { - return false; + inline void VisitOneByteString(const uint8_t* chars, unsigned length) { + Visit(chars, static_cast(length)); } - return !str->HasOnlyAsciiChars(); -} + + inline void VisitTwoByteString(const uint16_t* chars, unsigned length) { + Visit(chars, static_cast(length)); + } + + inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) { + // Write out number of utf16 characters written to the stream. + if (utf16_chars_read_out != NULL) { + *utf16_chars_read_out = utf16_chars_read_; + } + // Only null terminate if all of the string was written and there's space. + if (write_null && + !early_termination_ && + (capacity_ == -1 || (buffer_ - start_) < capacity_)) { + *buffer_++ = '\0'; + } + return buffer_ - start_; + } + + private: + bool early_termination_; + int last_character_; + char* buffer_; + char* const start_; + int capacity_; + int utf16_chars_read_; + DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); +}; int String::WriteUtf8(char* buffer, @@ -3990,122 +4110,23 @@ int String::WriteUtf8(char* buffer, if (options & HINT_MANY_WRITES_EXPECTED) { FlattenString(str); // Flatten the string for efficiency. } - int string_length = str->length(); - if (str->IsOneByteRepresentation()) { - int len; - if (capacity == -1) { - capacity = str->length() + 1; - len = string_length; - } else { - len = i::Min(capacity, str->length()); - } - i::String::WriteToFlat(*str, buffer, 0, len); - if (nchars_ref != NULL) *nchars_ref = len; - if (!(options & NO_NULL_TERMINATION) && capacity > len) { - buffer[len] = '\0'; - return len + 1; - } - return len; - } - - if (capacity == -1 || capacity / 3 >= string_length) { - int32_t previous = unibrow::Utf16::kNoPreviousCharacter; - const int kMaxRecursion = 100; - int utf8_bytes = - RecursivelySerializeToUtf8(*str, - buffer, - 0, - string_length, - kMaxRecursion, - previous, - &previous); - if (utf8_bytes >= 0) { - // Success serializing with recursion. - if ((options & NO_NULL_TERMINATION) == 0 && - (capacity > utf8_bytes || capacity == -1)) { - buffer[utf8_bytes++] = '\0'; - } - if (nchars_ref != NULL) *nchars_ref = string_length; - return utf8_bytes; + Utf8WriterVisitor writer(buffer, capacity); + i::ConsStringIteratorOp* op = isolate->write_iterator(); + op->Reset(); + int32_t type = str->map()->instance_type(); + unsigned str_length = static_cast(str->length()); + if (str_length != 0) { + i::String::Visit(*str, 0, writer, *op, type, str_length); + while (!writer.IsDone()) { + unsigned length_out; + i::String* next = op->ContinueOperation(&type, &length_out); + if (next == NULL) break; + // TODO(dcarney): need an asserting null op. + i::ConsStringNullOp null_op; + i::String::Visit(next, 0, writer, null_op, type, length_out); } - FlattenString(str); - // Recurse once. This time around the string is flat and the serializing - // with recursion will certainly succeed. - return WriteUtf8(buffer, capacity, nchars_ref, options); - } else if (capacity >= string_length) { - // First check that the buffer is large enough. If it is, then recurse - // once without a capacity limit, which will get into the other branch of - // this 'if'. - int utf8_bytes = i::Utf8Length(str); - if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++; - if (utf8_bytes <= capacity) { - return WriteUtf8(buffer, -1, nchars_ref, options); - } - } - - // Slow case. - i::StringCharacterStream stream(*str, isolate->write_iterator()); - isolate->string_tracker()->RecordWrite(str); - - int len = str->length(); - // Encode the first K - 3 bytes directly into the buffer since we - // know there's room for them. If no capacity is given we copy all - // of them here. - int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1); - int i; - int pos = 0; - int nchars = 0; - int previous = unibrow::Utf16::kNoPreviousCharacter; - for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) { - i::uc32 c = stream.GetNext(); - int written = unibrow::Utf8::Encode(buffer + pos, c, previous); - pos += written; - nchars++; - previous = c; - } - if (i < len) { - // For the last characters we need to check the length for each one - // because they may be longer than the remaining space in the - // buffer. - char intermediate[unibrow::Utf8::kMaxEncodedSize]; - for (; i < len && pos < capacity; i++) { - i::uc32 c = stream.GetNext(); - if (unibrow::Utf16::IsTrailSurrogate(c) && - unibrow::Utf16::IsLeadSurrogate(previous)) { - // We can't use the intermediate buffer here because the encoding - // of surrogate pairs is done under assumption that you can step - // back and fix the UTF8 stream. Luckily we only need space for one - // more byte, so there is always space. - ASSERT(pos < capacity); - int written = unibrow::Utf8::Encode(buffer + pos, c, previous); - ASSERT(written == 1); - pos += written; - nchars++; - } else { - int written = - unibrow::Utf8::Encode(intermediate, - c, - unibrow::Utf16::kNoPreviousCharacter); - if (pos + written <= capacity) { - for (int j = 0; j < written; j++) { - buffer[pos + j] = intermediate[j]; - } - pos += written; - nchars++; - } else { - // We've reached the end of the buffer - break; - } - } - previous = c; - } - } - if (nchars_ref != NULL) *nchars_ref = nchars; - if (!(options & NO_NULL_TERMINATION) && - (i == len && (capacity == -1 || pos < capacity))) { - buffer[pos++] = '\0'; } - return pos; + return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref); } @@ -5637,7 +5658,7 @@ String::Utf8Value::Utf8Value(v8::Handle obj) Handle str = obj->ToString(); if (str.IsEmpty()) return; i::Handle i_str = Utils::OpenHandle(*str); - length_ = i::Utf8Length(i_str); + length_ = v8::Utf8Length(*i_str, isolate); str_ = i::NewArray(length_ + 1); str->WriteUtf8(str_); } diff --git a/src/handles.cc b/src/handles.cc index 3bc1f4b..16fe0c7 100644 --- a/src/handles.cc +++ b/src/handles.cc @@ -883,165 +883,6 @@ Handle PutIntoObjectHashTable(Handle table, } -// This method determines the type of string involved and then gets the UTF8 -// length of the string. It doesn't flatten the string and has log(n) recursion -// for a string of length n. If the failure flag gets set, then we have to -// flatten the string and retry. Failures are caused by surrogate pairs in deep -// cons strings. - -// Single surrogate characters that are encountered in the UTF-16 character -// sequence of the input string get counted as 3 UTF-8 bytes, because that -// is the way that WriteUtf8 will encode them. Surrogate pairs are counted and -// encoded as one 4-byte UTF-8 sequence. - -// This function conceptually uses recursion on the two halves of cons strings. -// However, in order to avoid the recursion going too deep it recurses on the -// second string of the cons, but iterates on the first substring (by manually -// eliminating it as a tail recursion). This means it counts the UTF-8 length -// from the end to the start, which makes no difference to the total. - -// Surrogate pairs are recognized even if they are split across two sides of a -// cons, which complicates the implementation somewhat. Therefore, too deep -// recursion cannot always be avoided. This case is detected, and the failure -// flag is set, a signal to the caller that the string should be flattened and -// the operation retried. -int Utf8LengthHelper(String* input, - int from, - int to, - bool followed_by_surrogate, - int max_recursion, - bool* failure, - bool* starts_with_surrogate) { - if (from == to) return 0; - int total = 0; - bool dummy; - while (true) { - if (input->IsOneByteRepresentation()) { - *starts_with_surrogate = false; - return total + to - from; - } - switch (StringShape(input).representation_tag()) { - case kConsStringTag: { - ConsString* str = ConsString::cast(input); - String* first = str->first(); - String* second = str->second(); - int first_length = first->length(); - if (first_length - from > to - first_length) { - if (first_length < to) { - // Right hand side is shorter. No need to check the recursion depth - // since this can only happen log(n) times. - bool right_starts_with_surrogate = false; - total += Utf8LengthHelper(second, - 0, - to - first_length, - followed_by_surrogate, - max_recursion - 1, - failure, - &right_starts_with_surrogate); - if (*failure) return 0; - followed_by_surrogate = right_starts_with_surrogate; - input = first; - to = first_length; - } else { - // We only need the left hand side. - input = first; - } - } else { - if (first_length > from) { - // Left hand side is shorter. - if (first->IsOneByteRepresentation()) { - total += first_length - from; - *starts_with_surrogate = false; - starts_with_surrogate = &dummy; - input = second; - from = 0; - to -= first_length; - } else if (second->IsOneByteRepresentation()) { - followed_by_surrogate = false; - total += to - first_length; - input = first; - to = first_length; - } else if (max_recursion > 0) { - bool right_starts_with_surrogate = false; - // Recursing on the long one. This may fail. - total += Utf8LengthHelper(second, - 0, - to - first_length, - followed_by_surrogate, - max_recursion - 1, - failure, - &right_starts_with_surrogate); - if (*failure) return 0; - input = first; - to = first_length; - followed_by_surrogate = right_starts_with_surrogate; - } else { - *failure = true; - return 0; - } - } else { - // We only need the right hand side. - input = second; - from = 0; - to -= first_length; - } - } - continue; - } - case kExternalStringTag: - case kSeqStringTag: { - Vector vector = input->GetFlatContent().ToUC16Vector(); - const uc16* p = vector.start(); - int previous = unibrow::Utf16::kNoPreviousCharacter; - for (int i = from; i < to; i++) { - uc16 c = p[i]; - total += unibrow::Utf8::Length(c, previous); - previous = c; - } - if (to - from > 0) { - if (unibrow::Utf16::IsLeadSurrogate(previous) && - followed_by_surrogate) { - total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates; - } - if (unibrow::Utf16::IsTrailSurrogate(p[from])) { - *starts_with_surrogate = true; - } - } - return total; - } - case kSlicedStringTag: { - SlicedString* str = SlicedString::cast(input); - int offset = str->offset(); - input = str->parent(); - from += offset; - to += offset; - continue; - } - default: - break; - } - UNREACHABLE(); - return 0; - } - return 0; -} - - -int Utf8Length(Handle str) { - bool dummy; - bool failure; - int len; - const int kRecursionBudget = 100; - do { - failure = false; - len = Utf8LengthHelper( - *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy); - if (failure) FlattenString(str); - } while (failure); - return len; -} - - DeferredHandleScope::DeferredHandleScope(Isolate* isolate) : impl_(isolate->handle_scope_implementer()) { ASSERT(impl_->isolate() == Isolate::Current()); diff --git a/src/handles.h b/src/handles.h index 032fbe4..684f4ca 100644 --- a/src/handles.h +++ b/src/handles.h @@ -214,8 +214,6 @@ void FlattenString(Handle str); // string. Handle FlattenGetString(Handle str); -int Utf8Length(Handle str); - Handle SetProperty(Isolate* isolate, Handle object, Handle key, -- 2.7.4