From 25f84a48ce976bb1123de15ce1555a0b8065edbd Mon Sep 17 00:00:00 2001
From: "erik.corry@gmail.com"
 <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Date: Wed, 21 Mar 2012 13:48:29 +0000
Subject: [PATCH] Speed up WriteUtf8 in the case where the output buffer is
 large enough. Review URL: https://chromiumcodereview.appspot.com/9696032

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11104 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/api.cc              | 127 +++++++++++++++++++++++++++++++++++++++++++++++-
 test/cctest/test-api.cc |   6 +++
 2 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/src/api.cc b/src/api.cc
index c852b97..239b41e 100644
--- a/src/api.cc
+++ b/src/api.cc
@@ -3694,6 +3694,94 @@ int String::Utf8Length() const {
 }
 
 
+// Writes string[start, end) to buffer as UTF-8; <0 if recursion is too deep.
+static int RecursivelySerializeToUtf8(i::String* string,
+                                      char* buffer,  // Not bounds-checked here.
+                                      int start,  // First index, inclusive.
+                                      int end,  // Last index, exclusive.
+                                      int recursion_budget,  // Depth allowed.
+                                      int32_t previous_character,
+                                      int32_t* last_character) {
+  int utf8_bytes = 0;  // Bytes already written by recursive calls below.
+  while (true) {  // Each pass peels one slice/cons layer off |string|.
+    if (string->IsAsciiRepresentation()) {
+      i::String::WriteToFlat(string, buffer, start, end);
+      *last_character = unibrow::Utf16::kNoPreviousCharacter;
+      return utf8_bytes + end - start;  // One byte per ASCII character.
+    }
+    switch (i::StringShape(string).representation_tag()) {
+      case i::kExternalStringTag: {
+        const uint16_t* data = i::ExternalTwoByteString::cast(string)->
+          ExternalTwoByteStringGetData(0);
+        char* current = buffer;
+        for (int i = start; i < end; i++) {
+          uint16_t character = data[i];
+          current +=
+              unibrow::Utf8::Encode(current, character, previous_character);
+          previous_character = character;
+        }
+        *last_character = previous_character;  // Last unit seen so far.
+        return static_cast<int>(utf8_bytes + current - buffer);  // ptrdiff_t.
+      }
+      case i::kSeqStringTag: {
+        const uint16_t* data =
+            i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0);
+        char* current = buffer;
+        for (int i = start; i < end; i++) {
+          uint16_t character = data[i];
+          current +=
+              unibrow::Utf8::Encode(current, character, previous_character);
+          previous_character = character;
+        }
+        *last_character = previous_character;  // Last unit seen so far.
+        return static_cast<int>(utf8_bytes + current - buffer);
+      }
+      case i::kSlicedStringTag: {
+        i::SlicedString* slice = i::SlicedString::cast(string);
+        unsigned offset = slice->offset();
+        string = slice->parent();
+        start += offset;
+        end += offset;
+        continue;  // Retry on the parent with the shifted range.
+      }
+      case i::kConsStringTag: {
+        i::ConsString* cons_string = i::ConsString::cast(string);
+        i::String* first = cons_string->first();
+        int boundary = first->length();
+        if (start >= boundary) {
+          // Range lies entirely in the RHS; rebase the indices onto it.
+          string = cons_string->second();
+          start -= boundary;
+          end -= boundary;
+          continue;
+        } else if (end <= boundary) {
+          // Range lies entirely in the LHS; the indices stay as they are.
+          string = first;
+        } else {
+          if (recursion_budget == 0) return -1;  // Out of budget; give up.
+          int extra_utf8_bytes =
+              RecursivelySerializeToUtf8(first,
+                                         buffer,
+                                         start,
+                                         boundary,
+                                         recursion_budget - 1,
+                                         previous_character,
+                                         &previous_character);
+          if (extra_utf8_bytes < 0) return extra_utf8_bytes;
+          buffer += extra_utf8_bytes;  // RHS output follows the LHS output.
+          utf8_bytes += extra_utf8_bytes;
+          string = cons_string->second();  // RHS is handled by the loop.
+          start = 0;
+          end -= boundary;
+        }
+      }  // No break needed: the loop re-examines the new |string|.
+    }
+  }
+  UNREACHABLE();
+  return 0;
+}
+
+
 bool String::MayContainNonAscii() const {
   i::Handle<i::String> str = Utils::OpenHandle(this);
   if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
@@ -3712,11 +3800,12 @@ int String::WriteUtf8(char* buffer,
   LOG_API(isolate, "String::WriteUtf8");
   ENTER_V8(isolate);
   i::Handle<i::String> str = Utils::OpenHandle(this);
+  int string_length = str->length();
   if (str->IsAsciiRepresentation()) {
     int len;
     if (capacity == -1) {
       capacity = str->length() + 1;
-      len = str->length();
+      len = string_length;
     } else {
       len = i::Min(capacity, str->length());
     }
@@ -3729,6 +3818,42 @@ int String::WriteUtf8(char* buffer,
     return len;
   }
 
+  if (capacity == -1 || capacity / 3 >= string_length) {  // Avoids overflow.
+    int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
+    const int kMaxRecursion = 100;  // Deeper cons trees are flattened first.
+    int utf8_bytes =
+        RecursivelySerializeToUtf8(*str,
+                                   buffer,
+                                   0,
+                                   string_length,
+                                   kMaxRecursion,
+                                   previous,
+                                   &previous);
+    if (utf8_bytes >= 0) {
+      // Serialized without exhausting the recursion budget.
+      if ((options & NO_NULL_TERMINATION) == 0 &&
+          (capacity > utf8_bytes || capacity == -1)) {
+        buffer[utf8_bytes++] = '\0';
+      }
+      if (nchars_ref != NULL) *nchars_ref = string_length;
+      return utf8_bytes;
+    }
+    FlattenString(str);  // Negative result: the cons tree was too deep.
+    // Recurse once.  This time around the string is flat and the serializing
+    // with recursion will certainly succeed.
+    return WriteUtf8(buffer, capacity, nchars_ref, options);
+  } else if (capacity >= string_length) {
+    // Below the 3-bytes-per-char worst case (tested above by division so a
+    // huge length cannot overflow): if Utf8Length plus terminator still fits,
+    // recurse once with no capacity limit to take the other branch.
+    int utf8_bytes = i::Utf8Length(str);
+    if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++;
+    if (utf8_bytes <= capacity) {
+      return WriteUtf8(buffer, -1, nchars_ref, options);
+    }
+  }
+
+  // Slow case.
   i::StringInputBuffer& write_input_buffer = *isolate->write_input_buffer();
   isolate->string_tracker()->RecordWrite(str);
   if (options & HINT_MANY_WRITES_EXPECTED) {
diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc
index 8df0b7b..b1a23c1 100644
--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@@ -5870,6 +5870,7 @@ THREADED_TEST(Utf16) {
       "p.push(String.fromCharCode(0xdc00));"
       "var a = [];"
       "var b = [];"
+      "var c = [];"
       "var alens = [];"
       "for (var i = 0; i < 3; i++) {"
       "  p[1] = String.fromCharCode(lead++);"
@@ -5877,17 +5878,21 @@ THREADED_TEST(Utf16) {
       "    p[2] = String.fromCharCode(trail++);"
       "    a.push(p[i] + p[j]);"
       "    b.push(p[i] + p[j]);"
+      "    c.push(p[i] + p[j]);"
       "    alens.push(plens[i] + plens[j]);"
       "  }"
       "}"
       "alens[5] -= 2;"  // Here the surrogate pairs match up.
       "var a2 = [];"
       "var b2 = [];"
+      "var c2 = [];"
       "var a2lens = [];"
       "for (var m = 0; m < 9; m++) {"
       "  for (var n = 0; n < 9; n++) {"
       "    a2.push(a[m] + a[n]);"
       "    b2.push(b[m] + b[n]);"
+      "    var newc = 'x' + c[m] + c[n] + 'y';"
+      "    c2.push(newc.substring(1, newc.length - 1));"
       "    var utf = alens[m] + alens[n];"  // And here.
            // The 'n's that start with 0xdc.. are 6-8
            // The 'm's that end with 0xd8.. are 1, 4 and 7
@@ -5899,6 +5904,7 @@ THREADED_TEST(Utf16) {
   Utf16Helper(context, "a2", "a2lens", 81);
   WriteUtf8Helper(context, "b", "alens", 9);
   WriteUtf8Helper(context, "b2", "a2lens", 81);
+  WriteUtf8Helper(context, "c2", "a2lens", 81);
 }
 
 
-- 
2.7.4