From e2444edd8fa7f3bcef961be7836242599ec92c73 Mon Sep 17 00:00:00 2001
From: "yangguo@chromium.org"
 <yangguo@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Date: Thu, 3 Jan 2013 12:59:54 +0000
Subject: [PATCH] Refactor out assumption that one byte strings are ascii in
 utf8 processing.

R=yangguo@chromium.org
BUG=

Review URL: https://chromiumcodereview.appspot.com/11725006
Patch from Dan Carney <dcarney@google.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13302 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/api.cc     | 431 ++++++++++++++++++++++++++++++---------------------------
 src/handles.cc | 159 ---------------------
 src/handles.h  |   2 -
 3 files changed, 226 insertions(+), 366 deletions(-)
diff --git a/src/api.cc b/src/api.cc
index cf3c083..2290110 100644
--- a/src/api.cc
+++ b/src/api.cc
@@ -3873,109 +3873,229 @@ int String::Length() const {
   return str->length();
 }
 
+bool String::MayContainNonAscii() const {
+  i::Handle<i::String> str = Utils::OpenHandle(this);
+  if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
+    return false;
+  }
+  return !str->HasOnlyAsciiChars();
+}
+
+
+class Utf8LengthVisitor {
+ public:
+  explicit Utf8LengthVisitor()
+    : utf8_length_(0),
+      last_character_(unibrow::Utf16::kNoPreviousCharacter) {}
+
+  inline int GetLength() {
+    return utf8_length_;
+  }
+
+  template<typename Char>
+  inline void Visit(const Char* chars, unsigned length) {
+    ASSERT(length > 0);
+    // TODO(dcarney) Add back ascii fast path.
+    int utf8_length = 0;
+    int last_character = last_character_;
+    for (unsigned i = 0; i < length; i++) {
+      uint16_t c = chars[i];
+      utf8_length += unibrow::Utf8::Length(c, last_character);
+      last_character = c;
+    }
+    last_character_ = last_character;
+    utf8_length_ += utf8_length;
+  }
+
+  inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
+    Visit(chars, length);
+  }
+
+  inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
+    Visit(chars, length);
+  }
+
+ private:
+  int utf8_length_;
+  int last_character_;
+  DISALLOW_COPY_AND_ASSIGN(Utf8LengthVisitor);
+};
+
+
+static int Utf8Length(i::String* str, i::Isolate* isolate) {
+  unsigned length = static_cast<unsigned>(str->length());
+  if (length == 0) return 0;
+  int32_t type = str->map()->instance_type();
+  Utf8LengthVisitor visitor;
+  // Non ConsString branch.
+  if ((type & i::kStringRepresentationMask) != i::kConsStringTag) {
+    i::ConsStringNullOp null_op;
+    i::String::Visit(str, 0, visitor, null_op, type, length);
+    return visitor.GetLength();
+  }
+  i::ConsStringIteratorOp* op = isolate->write_iterator();
+  unsigned offset = 0;
+  i::String* leaf = op->Operate(str, &offset, &type, &length);
+  ASSERT(leaf != NULL);
+  while (leaf != NULL) {
+    i::ConsStringNullOp null_op;
+    ASSERT(offset == 0);
+    i::String::Visit(leaf, 0, visitor, null_op, type, length);
+    leaf = op->ContinueOperation(&type, &length);
+  }
+  return visitor.GetLength();
+}
+
 
 int String::Utf8Length() const {
   i::Handle<i::String> str = Utils::OpenHandle(this);
-  if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
-  return i::Utf8Length(str);
-}
-
-
-// Will fail with a negative answer if the recursion depth is too high.
-static int RecursivelySerializeToUtf8(i::String* string,
-                                      char* buffer,
-                                      int start,
-                                      int end,
-                                      int recursion_budget,
-                                      int32_t previous_character,
-                                      int32_t* last_character) {
-  int utf8_bytes = 0;
-  while (true) {
-    if (string->IsOneByteRepresentation()) {
-      i::String::WriteToFlat(string, buffer, start, end);
-      *last_character = unibrow::Utf16::kNoPreviousCharacter;
-      return utf8_bytes + end - start;
+  i::Isolate* isolate = str->GetIsolate();
+  if (IsDeadCheck(isolate, "v8::String::Utf8Length()")) return 0;
+  return v8::Utf8Length(*str, isolate);
+}
+
+
+class Utf8WriterVisitor {
+ public:
+  Utf8WriterVisitor(char* buffer, int capacity)
+    : early_termination_(false),
+      last_character_(unibrow::Utf16::kNoPreviousCharacter),
+      buffer_(buffer),
+      start_(buffer),
+      capacity_(capacity),
+      utf16_chars_read_(0) {
+  }
+
+  static int WriteEndCharacter(uint16_t character,
+                               int last_character,
+                               int remaining,
+                               char* const buffer) {
+    using namespace unibrow;
+    ASSERT(remaining > 0);
+    // We can't use a local buffer here because Encode needs to modify
+    // previous characters in the stream.  We know, however, that
+    // exactly one character will be advanced.
+    if (Utf16::IsTrailSurrogate(character) &&
+        Utf16::IsLeadSurrogate(last_character)) {
+      int written = Utf8::Encode(buffer, character, last_character);
+      ASSERT(written == 1);
+      return written;
     }
-    switch (i::StringShape(string).representation_tag()) {
-      case i::kExternalStringTag: {
-        const uint16_t* data = i::ExternalTwoByteString::cast(string)->
-          ExternalTwoByteStringGetData(0);
-        char* current = buffer;
-        for (int i = start; i < end; i++) {
-          uint16_t character = data[i];
-          current +=
-              unibrow::Utf8::Encode(current, character, previous_character);
-          previous_character = character;
-        }
-        *last_character = previous_character;
-        return static_cast<int>(utf8_bytes + current - buffer);
+    // Use a scratch buffer to check the required characters.
+    char temp_buffer[Utf8::kMaxEncodedSize];
+    // Can't encode using last_character as gcc has array bounds issues.
+    int written = Utf8::Encode(temp_buffer,
+                               character,
+                               unibrow::Utf16::kNoPreviousCharacter);
+    // Won't fit.
+    if (written > remaining) return 0;
+    // Copy over the character from temp_buffer.
+    for (int j = 0; j < written; j++) {
+      buffer[j] = temp_buffer[j];
+    }
+    return written;
+  }
+
+  template<typename Char>
+  void Visit(const Char* chars, const int length) {
+    using namespace unibrow;
+    // TODO(dcarney): Add back ascii fast path.
+    ASSERT(!early_termination_);
+    ASSERT(length > 0);
+    // Copy state to stack.
+    char* buffer = buffer_;
+    int last_character = last_character_;
+    int i = 0;
+    // Do a fast loop where there is no exit capacity check.
+    while (true) {
+      int fast_length;
+      if (capacity_ == -1) {
+        fast_length = length;
+      } else {
+        int remaining_capacity = capacity_ - (buffer - start_);
+        // Need enough space to write everything but one character.
+        STATIC_ASSERT(Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit == 3);
+        int writable_length = (remaining_capacity - 3)/3;
+        // Need to drop into slow loop.
+        if (writable_length <= 0) break;
+        fast_length = i + writable_length;
+        if (fast_length > length) fast_length = length;
       }
-      case i::kSeqStringTag: {
-        const uint16_t* data =
-            i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0);
-        char* current = buffer;
-        for (int i = start; i < end; i++) {
-          uint16_t character = data[i];
-          current +=
-              unibrow::Utf8::Encode(current, character, previous_character);
-          previous_character = character;
-        }
-        *last_character = previous_character;
-        return static_cast<int>(utf8_bytes + current - buffer);
+      // Write the characters to the stream.
+      for (; i < fast_length; i++) {
+        uint16_t character = *chars++;
+        buffer += Utf8::Encode(buffer, character, last_character);
+        last_character = character;
+        ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
       }
-      case i::kSlicedStringTag: {
-        i::SlicedString* slice = i::SlicedString::cast(string);
-        unsigned offset = slice->offset();
-        string = slice->parent();
-        start += offset;
-        end += offset;
-        continue;
+      // Array is fully written. Exit.
+      if (fast_length == length) {
+        // Write state back out to object.
+        last_character_ = last_character;
+        buffer_ = buffer;
+        utf16_chars_read_ += i;
+        return;
       }
-      case i::kConsStringTag: {
-        i::ConsString* cons_string = i::ConsString::cast(string);
-        i::String* first = cons_string->first();
-        int boundary = first->length();
-        if (start >= boundary) {
-          // Only need RHS.
-          string = cons_string->second();
-          start -= boundary;
-          end -= boundary;
-          continue;
-        } else if (end <= boundary) {
-          // Only need LHS.
-          string = first;
-        } else {
-          if (recursion_budget == 0) return -1;
-          int extra_utf8_bytes =
-              RecursivelySerializeToUtf8(first,
-                                         buffer,
-                                         start,
-                                         boundary,
-                                         recursion_budget - 1,
-                                         previous_character,
-                                         &previous_character);
-          if (extra_utf8_bytes < 0) return extra_utf8_bytes;
-          buffer += extra_utf8_bytes;
-          utf8_bytes += extra_utf8_bytes;
-          string = cons_string->second();
-          start = 0;
-          end -= boundary;
-        }
+    }
+    ASSERT(capacity_ != -1);
+    // Slow loop. Must check capacity on each iteration.
+    int remaining_capacity = capacity_ - (buffer - start_);
+    ASSERT(remaining_capacity >= 0);
+    for (; i < length && remaining_capacity > 0; i++) {
+      uint16_t character = *chars++;
+      int written = WriteEndCharacter(character,
+                                      last_character,
+                                      remaining_capacity,
+                                      buffer);
+      if (written == 0) {
+        early_termination_ = true;
+        break;
       }
+      buffer += written;
+      remaining_capacity -= written;
+      last_character = character;
     }
+    // Write state back out to object.
+    last_character_ = last_character;
+    buffer_ = buffer;
+    utf16_chars_read_ += i;
   }
-  UNREACHABLE();
-  return 0;
-}
 
+  inline bool IsDone() {
+    return early_termination_;
+  }
 
-bool String::MayContainNonAscii() const {
-  i::Handle<i::String> str = Utils::OpenHandle(this);
-  if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
-    return false;
+  inline void VisitOneByteString(const uint8_t* chars, unsigned length) {
+    Visit(chars, static_cast<int>(length));
   }
-  return !str->HasOnlyAsciiChars();
-}
+
+  inline void VisitTwoByteString(const uint16_t* chars, unsigned length) {
+    Visit(chars, static_cast<int>(length));
+  }
+
+  inline int CompleteWrite(bool write_null, int* utf16_chars_read_out) {
+    // Write out number of utf16 characters written to the stream.
+    if (utf16_chars_read_out != NULL) {
+      *utf16_chars_read_out = utf16_chars_read_;
+    }
+    // Only null terminate if all of the string was written and there's space.
+    if (write_null &&
+        !early_termination_ &&
+        (capacity_ == -1 || (buffer_ - start_) < capacity_)) {
+      *buffer_++ = '\0';
+    }
+    return buffer_ - start_;
+  }
+
+ private:
+  bool early_termination_;
+  int last_character_;
+  char* buffer_;
+  char* const start_;
+  int capacity_;
+  int utf16_chars_read_;
+  DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
+};
 
 
 int String::WriteUtf8(char* buffer,
@@ -3990,122 +4110,23 @@ int String::WriteUtf8(char* buffer,
   if (options & HINT_MANY_WRITES_EXPECTED) {
     FlattenString(str);  // Flatten the string for efficiency.
   }
-  int string_length = str->length();
-  if (str->IsOneByteRepresentation()) {
-    int len;
-    if (capacity == -1) {
-      capacity = str->length() + 1;
-      len = string_length;
-    } else {
-      len = i::Min(capacity, str->length());
-    }
-    i::String::WriteToFlat(*str, buffer, 0, len);
-    if (nchars_ref != NULL) *nchars_ref = len;
-    if (!(options & NO_NULL_TERMINATION) && capacity > len) {
-      buffer[len] = '\0';
-      return len + 1;
-    }
-    return len;
-  }
-
-  if (capacity == -1 || capacity / 3 >= string_length) {
-    int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
-    const int kMaxRecursion = 100;
-    int utf8_bytes =
-        RecursivelySerializeToUtf8(*str,
-                                   buffer,
-                                   0,
-                                   string_length,
-                                   kMaxRecursion,
-                                   previous,
-                                   &previous);
-    if (utf8_bytes >= 0) {
-      // Success serializing with recursion.
-      if ((options & NO_NULL_TERMINATION) == 0 &&
-          (capacity > utf8_bytes || capacity == -1)) {
-        buffer[utf8_bytes++] = '\0';
-      }
-      if (nchars_ref != NULL) *nchars_ref = string_length;
-      return utf8_bytes;
+  Utf8WriterVisitor writer(buffer, capacity);
+  i::ConsStringIteratorOp* op = isolate->write_iterator();
+  op->Reset();
+  int32_t type = str->map()->instance_type();
+  unsigned str_length = static_cast<unsigned>(str->length());
+  if (str_length != 0) {
+    i::String::Visit(*str, 0, writer, *op, type, str_length);
+    while (!writer.IsDone()) {
+      unsigned length_out;
+      i::String* next = op->ContinueOperation(&type, &length_out);
+      if (next == NULL) break;
+      // TODO(dcarney): need an asserting null op.
+      i::ConsStringNullOp null_op;
+      i::String::Visit(next, 0, writer, null_op, type, length_out);
     }
-    FlattenString(str);
-    // Recurse once.  This time around the string is flat and the serializing
-    // with recursion will certainly succeed.
-    return WriteUtf8(buffer, capacity, nchars_ref, options);
-  } else if (capacity >= string_length) {
-    // First check that the buffer is large enough.  If it is, then recurse
-    // once without a capacity limit, which will get into the other branch of
-    // this 'if'.
-    int utf8_bytes = i::Utf8Length(str);
-    if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++;
-    if (utf8_bytes <= capacity) {
-      return WriteUtf8(buffer, -1, nchars_ref, options);
-    }
-  }
-
-  // Slow case.
-  i::StringCharacterStream stream(*str, isolate->write_iterator());
-  isolate->string_tracker()->RecordWrite(str);
-
-  int len = str->length();
-  // Encode the first K - 3 bytes directly into the buffer since we
-  // know there's room for them.  If no capacity is given we copy all
-  // of them here.
-  int fast_end = capacity - (unibrow::Utf8::kMaxEncodedSize - 1);
-  int i;
-  int pos = 0;
-  int nchars = 0;
-  int previous = unibrow::Utf16::kNoPreviousCharacter;
-  for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
-    i::uc32 c = stream.GetNext();
-    int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
-    pos += written;
-    nchars++;
-    previous = c;
-  }
-  if (i < len) {
-    // For the last characters we need to check the length for each one
-    // because they may be longer than the remaining space in the
-    // buffer.
-    char intermediate[unibrow::Utf8::kMaxEncodedSize];
-    for (; i < len && pos < capacity; i++) {
-      i::uc32 c = stream.GetNext();
-      if (unibrow::Utf16::IsTrailSurrogate(c) &&
-          unibrow::Utf16::IsLeadSurrogate(previous)) {
-        // We can't use the intermediate buffer here because the encoding
-        // of surrogate pairs is done under assumption that you can step
-        // back and fix the UTF8 stream.  Luckily we only need space for one
-        // more byte, so there is always space.
-        ASSERT(pos < capacity);
-        int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
-        ASSERT(written == 1);
-        pos += written;
-        nchars++;
-      } else {
-        int written =
-            unibrow::Utf8::Encode(intermediate,
-                                  c,
-                                  unibrow::Utf16::kNoPreviousCharacter);
-        if (pos + written <= capacity) {
-          for (int j = 0; j < written; j++) {
-            buffer[pos + j] = intermediate[j];
-          }
-          pos += written;
-          nchars++;
-        } else {
-          // We've reached the end of the buffer
-          break;
-        }
-      }
-      previous = c;
-    }
-  }
-  if (nchars_ref != NULL) *nchars_ref = nchars;
-  if (!(options & NO_NULL_TERMINATION) &&
-      (i == len && (capacity == -1 || pos < capacity))) {
-    buffer[pos++] = '\0';
   }
-  return pos;
+  return writer.CompleteWrite(!(options & NO_NULL_TERMINATION), nchars_ref);
 }
 
 
@@ -5637,7 +5658,7 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj)
   Handle<String> str = obj->ToString();
   if (str.IsEmpty()) return;
   i::Handle<i::String> i_str = Utils::OpenHandle(*str);
-  length_ = i::Utf8Length(i_str);
+  length_ = v8::Utf8Length(*i_str, isolate);
   str_ = i::NewArray<char>(length_ + 1);
   str->WriteUtf8(str_);
 }
diff --git a/src/handles.cc b/src/handles.cc
index 3bc1f4b..16fe0c7 100644
--- a/src/handles.cc
+++ b/src/handles.cc
@@ -883,165 +883,6 @@ Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,
 }
 
 
-// This method determines the type of string involved and then gets the UTF8
-// length of the string.  It doesn't flatten the string and has log(n) recursion
-// for a string of length n.  If the failure flag gets set, then we have to
-// flatten the string and retry.  Failures are caused by surrogate pairs in deep
-// cons strings.
-
-// Single surrogate characters that are encountered in the UTF-16 character
-// sequence of the input string get counted as 3 UTF-8 bytes, because that
-// is the way that WriteUtf8 will encode them.  Surrogate pairs are counted and
-// encoded as one 4-byte UTF-8 sequence.
-
-// This function conceptually uses recursion on the two halves of cons strings.
-// However, in order to avoid the recursion going too deep it recurses on the
-// second string of the cons, but iterates on the first substring (by manually
-// eliminating it as a tail recursion).  This means it counts the UTF-8 length
-// from the end to the start, which makes no difference to the total.
-
-// Surrogate pairs are recognized even if they are split across two sides of a
-// cons, which complicates the implementation somewhat.  Therefore, too deep
-// recursion cannot always be avoided.  This case is detected, and the failure
-// flag is set, a signal to the caller that the string should be flattened and
-// the operation retried.
-int Utf8LengthHelper(String* input,
-                     int from,
-                     int to,
-                     bool followed_by_surrogate,
-                     int max_recursion,
-                     bool* failure,
-                     bool* starts_with_surrogate) {
-  if (from == to) return 0;
-  int total = 0;
-  bool dummy;
-  while (true) {
-    if (input->IsOneByteRepresentation()) {
-      *starts_with_surrogate = false;
-      return total + to - from;
-    }
-    switch (StringShape(input).representation_tag()) {
-      case kConsStringTag: {
-        ConsString* str = ConsString::cast(input);
-        String* first = str->first();
-        String* second = str->second();
-        int first_length = first->length();
-        if (first_length - from > to - first_length) {
-          if (first_length < to) {
-            // Right hand side is shorter.  No need to check the recursion depth
-            // since this can only happen log(n) times.
-            bool right_starts_with_surrogate = false;
-            total += Utf8LengthHelper(second,
-                                      0,
-                                      to - first_length,
-                                      followed_by_surrogate,
-                                      max_recursion - 1,
-                                      failure,
-                                      &right_starts_with_surrogate);
-            if (*failure) return 0;
-            followed_by_surrogate = right_starts_with_surrogate;
-            input = first;
-            to = first_length;
-          } else {
-            // We only need the left hand side.
-            input = first;
-          }
-        } else {
-          if (first_length > from) {
-            // Left hand side is shorter.
-            if (first->IsOneByteRepresentation()) {
-              total += first_length - from;
-              *starts_with_surrogate = false;
-              starts_with_surrogate = &dummy;
-              input = second;
-              from = 0;
-              to -= first_length;
-            } else if (second->IsOneByteRepresentation()) {
-              followed_by_surrogate = false;
-              total += to - first_length;
-              input = first;
-              to = first_length;
-            } else if (max_recursion > 0) {
-              bool right_starts_with_surrogate = false;
-              // Recursing on the long one.  This may fail.
-              total += Utf8LengthHelper(second,
-                                        0,
-                                        to - first_length,
-                                        followed_by_surrogate,
-                                        max_recursion - 1,
-                                        failure,
-                                        &right_starts_with_surrogate);
-              if (*failure) return 0;
-              input = first;
-              to = first_length;
-              followed_by_surrogate = right_starts_with_surrogate;
-            } else {
-              *failure = true;
-              return 0;
-            }
-          } else {
-            // We only need the right hand side.
-            input = second;
-            from = 0;
-            to -= first_length;
-          }
-        }
-        continue;
-      }
-      case kExternalStringTag:
-      case kSeqStringTag: {
-        Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
-        const uc16* p = vector.start();
-        int previous = unibrow::Utf16::kNoPreviousCharacter;
-        for (int i = from; i < to; i++) {
-          uc16 c = p[i];
-          total += unibrow::Utf8::Length(c, previous);
-          previous = c;
-        }
-        if (to - from > 0) {
-          if (unibrow::Utf16::IsLeadSurrogate(previous) &&
-              followed_by_surrogate) {
-            total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
-          }
-          if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
-            *starts_with_surrogate = true;
-          }
-        }
-        return total;
-      }
-      case kSlicedStringTag: {
-        SlicedString* str = SlicedString::cast(input);
-        int offset = str->offset();
-        input = str->parent();
-        from += offset;
-        to += offset;
-        continue;
-      }
-      default:
-        break;
-    }
-    UNREACHABLE();
-    return 0;
-  }
-  return 0;
-}
-
-
-int Utf8Length(Handle<String> str) {
-  bool dummy;
-  bool failure;
-  int len;
-  const int kRecursionBudget = 100;
-  do {
-    failure = false;
-    len = Utf8LengthHelper(
-        *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);
-    if (failure) FlattenString(str);
-  } while (failure);
-  return len;
-}
-
-
 DeferredHandleScope::DeferredHandleScope(Isolate* isolate)
     : impl_(isolate->handle_scope_implementer()) {
   ASSERT(impl_->isolate() == Isolate::Current());
diff --git a/src/handles.h b/src/handles.h
index 032fbe4..684f4ca 100644
--- a/src/handles.h
+++ b/src/handles.h
@@ -214,8 +214,6 @@ void FlattenString(Handle<String> str);
 // string.
 Handle<String> FlattenGetString(Handle<String> str);
 
-int Utf8Length(Handle<String> str);
-
 Handle<Object> SetProperty(Isolate* isolate,
                            Handle<Object> object,
                            Handle<Object> key,
-- 
2.7.4