Speed up WriteUtf8 in the case where the output buffer is large enough.

author erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Wed, 21 Mar 2012 13:48:29 +0000 (13:48 +0000)

committer erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Wed, 21 Mar 2012 13:48:29 +0000 (13:48 +0000)
author erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Wed, 21 Mar 2012 13:48:29 +0000 (13:48 +0000)
committer erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Wed, 21 Mar 2012 13:48:29 +0000 (13:48 +0000)
diff --git a/src/api.cc b/src/api.cc

index c852b97964916c618094685e64220d3a243134ca..239b41e5b986a5cd062712356433a9fc7b422a8b 100644 (file)
--- a/src/api.cc
+++ b/src/api.cc
@@ -3694,6 +3694,94 @@ int String::Utf8Length() const {
  }
  
  
+// Writes characters [start, end) of string to buffer as UTF-8 and returns the byte count.  Will fail with a negative answer if the recursion depth is too high.
+static int RecursivelySerializeToUtf8(i::String* string,
+                                      char* buffer,  // Output; written from its first byte.
+                                      int start,  // Index of the first character to write.
+                                      int end,  // One past the last character to write.
+                                      int recursion_budget,  // Nested calls still allowed.
+                                      int32_t previous_character,  // Handed to Utf8::Encode for the first character.
+                                      int32_t* last_character) {  // Out: value to use as "previous" by whoever continues.
+  int utf8_bytes = 0;  // Bytes already written by recursive calls on cons left halves.
+  while (true) {  // Each pass either returns or narrows to a sub-string and retries.
+    if (string->IsAsciiRepresentation()) {
+      i::String::WriteToFlat(string, buffer, start, end);
+      *last_character = unibrow::Utf16::kNoPreviousCharacter;
+      return utf8_bytes + end - start;  // Counted as one output byte per character.
+    }
+    switch (i::StringShape(string).representation_tag()) {
+      case i::kExternalStringTag: {
+        const uint16_t* data = i::ExternalTwoByteString::cast(string)->
+          ExternalTwoByteStringGetData(0);
+        char* current = buffer;  // Write cursor; buffer itself stays put.
+        for (int i = start; i < end; i++) {
+          uint16_t character = data[i];
+          current +=
+              unibrow::Utf8::Encode(current, character, previous_character);  // NOTE(review): presumably merges surrogate pairs - confirm.
+          previous_character = character;
+        }
+        *last_character = previous_character;
+        return utf8_bytes + current - buffer;
+      }
+      case i::kSeqStringTag: {  // Same loop as the external case, different data source.
+        const uint16_t* data =
+            i::SeqTwoByteString::cast(string)->SeqTwoByteStringGetData(0);
+        char* current = buffer;
+        for (int i = start; i < end; i++) {
+          uint16_t character = data[i];
+          current +=
+              unibrow::Utf8::Encode(current, character, previous_character);
+          previous_character = character;
+        }
+        *last_character = previous_character;
+        return utf8_bytes + current - buffer;
+      }
+      case i::kSlicedStringTag: {
+        i::SlicedString* slice = i::SlicedString::cast(string);
+        unsigned offset = slice->offset();
+        string = slice->parent();  // Retry on the parent with the range shifted by offset.
+        start += offset;
+        end += offset;
+        continue;
+      }
+      case i::kConsStringTag: {
+        i::ConsString* cons_string = i::ConsString::cast(string);
+        i::String* first = cons_string->first();
+        int boundary = first->length();  // Index at which the second half begins.
+        if (start >= boundary) {
+          // Only need RHS: re-base the range so it is relative to the second half.
+          string = cons_string->second();
+          start -= boundary;
+          end -= boundary;
+          continue;
+        } else if (end <= boundary) {
+          // Only need LHS: the range is unchanged, so just descend.
+          string = first;  // No continue needed; control leaves the switch and loops.
+        } else {
+          if (recursion_budget == 0) return -1;  // Range spans both halves but no budget is left.
+          int extra_utf8_bytes =
+              RecursivelySerializeToUtf8(first,
+                                         buffer,
+                                         start,
+                                         boundary,
+                                         recursion_budget - 1,
+                                         previous_character,
+                                         &previous_character);  // Updated in place for the right half.
+          if (extra_utf8_bytes < 0) return extra_utf8_bytes;  // Propagate failure.
+          buffer += extra_utf8_bytes;  // Right half is written after the left half's bytes.
+          utf8_bytes += extra_utf8_bytes;
+          string = cons_string->second();  // Right half is handled by the loop, not by recursion.
+          start = 0;
+          end -= boundary;
+        }
+      }
+    }
+  }
+  UNREACHABLE();  // The loop above has no break; it only exits via return.
+  return 0;
+}
+
+
  bool String::MayContainNonAscii() const {
    i::Handle<i::String> str = Utils::OpenHandle(this);
    if (IsDeadCheck(str->GetIsolate(), "v8::String::MayContainNonAscii()")) {
@@ -3712,11 +3800,12 @@ int String::WriteUtf8(char* buffer,
    LOG_API(isolate, "String::WriteUtf8");
    ENTER_V8(isolate);
    i::Handle<i::String> str = Utils::OpenHandle(this);
+  int string_length = str->length();
    if (str->IsAsciiRepresentation()) {
      int len;
      if (capacity == -1) {
        capacity = str->length() + 1;
-      len = str->length();
+      len = string_length;
      } else {
        len = i::Min(capacity, str->length());
      }
@@ -3729,6 +3818,42 @@ int String::WriteUtf8(char* buffer,
      return len;
    }
  
+  if (capacity == -1 || capacity >= string_length * 3) {
+    int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
+    const int kMaxRecursion = 100;
+    int utf8_bytes =
+        RecursivelySerializeToUtf8(*str,
+                                   buffer,
+                                   0,
+                                   string_length,
+                                   kMaxRecursion,
+                                   previous,
+                                   &previous);
+    if (utf8_bytes >= 0) {
+      // Success serializing with recursion.
+      if ((options & NO_NULL_TERMINATION) == 0 &&
+          (capacity > utf8_bytes || capacity == -1)) {
+        buffer[utf8_bytes++] = '\0';
+      }
+      if (nchars_ref != NULL) *nchars_ref = string_length;
+      return utf8_bytes;
+    }
+    FlattenString(str);
+    // Recurse once.  This time around the string is flat and the serializing
+    // with recursion will certainly succeed.
+    return WriteUtf8(buffer, capacity, nchars_ref, options);
+  } else if (capacity >= string_length) {
+    // First check that the buffer is large enough.  If it is, then recurse
+    // once without a capacity limit, which will get into the other branch of
+    // this 'if'.
+    int utf8_bytes = i::Utf8Length(str);
+    if ((options & NO_NULL_TERMINATION) == 0) utf8_bytes++;
+    if (utf8_bytes <= capacity) {
+      return WriteUtf8(buffer, -1, nchars_ref, options);
+    }
+  }
+
+  // Slow case.
    i::StringInputBuffer& write_input_buffer = *isolate->write_input_buffer();
    isolate->string_tracker()->RecordWrite(str);
    if (options & HINT_MANY_WRITES_EXPECTED) {
diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc

index 8df0b7b68b70a3d7f73d7bc3aa1faf99f7e87f69..b1a23c1ef7078a24ab166aa1254c76d46ef66838 100644 (file)
--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@@ -5870,6 +5870,7 @@ THREADED_TEST(Utf16) {
        "p.push(String.fromCharCode(0xdc00));"
        "var a = [];"
        "var b = [];"
+      "var c = [];"
        "var alens = [];"
        "for (var i = 0; i < 3; i++) {"
        "  p[1] = String.fromCharCode(lead++);"
@@ -5877,17 +5878,21 @@ THREADED_TEST(Utf16) {
        "    p[2] = String.fromCharCode(trail++);"
        "    a.push(p[i] + p[j]);"
        "    b.push(p[i] + p[j]);"
+      "    c.push(p[i] + p[j]);"
        "    alens.push(plens[i] + plens[j]);"
        "  }"
        "}"
        "alens[5] -= 2;"  // Here the surrogate pairs match up.
        "var a2 = [];"
        "var b2 = [];"
+      "var c2 = [];"
        "var a2lens = [];"
        "for (var m = 0; m < 9; m++) {"
        "  for (var n = 0; n < 9; n++) {"
        "    a2.push(a[m] + a[n]);"
        "    b2.push(b[m] + b[n]);"
+      "    var newc = 'x' + c[m] + c[n] + 'y';"
+      "    c2.push(newc.substring(1, newc.length - 1));"
        "    var utf = alens[m] + alens[n];"  // And here.
             // The 'n's that start with 0xdc.. are 6-8
             // The 'm's that end with 0xd8.. are 1, 4 and 7
@@ -5899,6 +5904,7 @@ THREADED_TEST(Utf16) {
    Utf16Helper(context, "a2", "a2lens", 81);
    WriteUtf8Helper(context, "b", "alens", 9);
    WriteUtf8Helper(context, "b2", "a2lens", 81);
+  WriteUtf8Helper(context, "c2", "a2lens", 81);
  }
author	erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Wed, 21 Mar 2012 13:48:29 +0000 (13:48 +0000)
committer	erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Wed, 21 Mar 2012 13:48:29 +0000 (13:48 +0000)
src/api.cc		patch \| blob \| history
test/cctest/test-api.cc		patch \| blob \| history