Fix input and output to handle UTF16 surrogate pairs.
author: erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Mon, 12 Mar 2012 12:35:28 +0000 (12:35 +0000)
committer: erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Mon, 12 Mar 2012 12:35:28 +0000 (12:35 +0000)
Review URL: https://chromiumcodereview.appspot.com/9600009

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00

30 files changed:
src/api.cc
src/arm/regexp-macro-assembler-arm.cc
src/debug-agent.cc
src/globals.h
src/handles.cc
src/handles.h
src/heap.cc
src/hydrogen-instructions.h
src/ia32/regexp-macro-assembler-ia32.cc
src/jsregexp.cc
src/log.cc
src/objects-inl.h
src/objects.cc
src/objects.h
src/parser.cc
src/parser.h
src/preparse-data.h
src/preparser-api.cc
src/preparser.cc
src/preparser.h
src/scanner-character-streams.cc
src/scanner-character-streams.h
src/scanner.cc
src/scanner.h
src/unicode-inl.h
src/unicode.cc
src/unicode.h
src/x64/regexp-macro-assembler-x64.cc
test/cctest/test-api.cc
test/cctest/test-parsing.cc

index d8c7ba0eaaf6b76700f7237502e7ffae5ebc1ddd..ef78a3ff904922c03915cd24662d413e36ce915e 100644 (file)
@@ -1430,7 +1430,7 @@ void ObjectTemplate::SetInternalFieldCount(int value) {
 
 
 ScriptData* ScriptData::PreCompile(const char* input, int length) {
-  i::Utf8ToUC16CharacterStream stream(
+  i::Utf8ToUtf16CharacterStream stream(
       reinterpret_cast<const unsigned char*>(input), length);
   return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
 }
@@ -1439,11 +1439,11 @@ ScriptData* ScriptData::PreCompile(const char* input, int length) {
 ScriptData* ScriptData::PreCompile(v8::Handle<String> source) {
   i::Handle<i::String> str = Utils::OpenHandle(*source);
   if (str->IsExternalTwoByteString()) {
-    i::ExternalTwoByteStringUC16CharacterStream stream(
+    i::ExternalTwoByteStringUtf16CharacterStream stream(
       i::Handle<i::ExternalTwoByteString>::cast(str), 0, str->length());
     return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
   } else {
-    i::GenericStringUC16CharacterStream stream(str, 0, str->length());
+    i::GenericStringUtf16CharacterStream stream(str, 0, str->length());
     return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
   }
 }
@@ -3690,7 +3690,7 @@ int String::Length() const {
 int String::Utf8Length() const {
   i::Handle<i::String> str = Utils::OpenHandle(this);
   if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
-  return str->Utf8Length();
+  return i::Utf8Length(str);
 }
 
 
@@ -3736,11 +3736,13 @@ int String::WriteUtf8(char* buffer,
   int i;
   int pos = 0;
   int nchars = 0;
+  int previous = unibrow::Utf16::kNoPreviousCharacter;
   for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
     i::uc32 c = write_input_buffer.GetNext();
-    int written = unibrow::Utf8::Encode(buffer + pos, c);
+    int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
     pos += written;
     nchars++;
+    previous = c;
   }
   if (i < len) {
     // For the last characters we need to check the length for each one
@@ -3749,16 +3751,33 @@ int String::WriteUtf8(char* buffer,
     char intermediate[unibrow::Utf8::kMaxEncodedSize];
     for (; i < len && pos < capacity; i++) {
       i::uc32 c = write_input_buffer.GetNext();
-      int written = unibrow::Utf8::Encode(intermediate, c);
-      if (pos + written <= capacity) {
-        for (int j = 0; j < written; j++)
-          buffer[pos + j] = intermediate[j];
+      if (unibrow::Utf16::IsTrailSurrogate(c) &&
+          unibrow::Utf16::IsLeadSurrogate(previous)) {
+        // We can't use the intermediate buffer here because the encoding
+        // of surrogate pairs is done under assumption that you can step
+        // back and fix the UTF8 stream.  Luckily we only need space for one
+        // more byte, so there is always space.
+        ASSERT(pos < capacity);
+        int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
+        ASSERT(written == 1);
         pos += written;
         nchars++;
       } else {
-        // We've reached the end of the buffer
-        break;
+        int written =
+            unibrow::Utf8::Encode(intermediate,
+                                  c,
+                                  unibrow::Utf16::kNoPreviousCharacter);
+        if (pos + written <= capacity) {
+          for (int j = 0; j < written; j++)
+            buffer[pos + j] = intermediate[j];
+          pos += written;
+          nchars++;
+        } else {
+          // We've reached the end of the buffer
+          break;
+        }
       }
+      previous = c;
     }
   }
   if (nchars_ref != NULL) *nchars_ref = nchars;
@@ -5240,7 +5259,8 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj)
   TryCatch try_catch;
   Handle<String> str = obj->ToString();
   if (str.IsEmpty()) return;
-  length_ = str->Utf8Length();
+  i::Handle<i::String> i_str = Utils::OpenHandle(*str);
+  length_ = i::Utf8Length(i_str);
   str_ = i::NewArray<char>(length_ + 1);
   str->WriteUtf8(str_);
 }
index de83c13e15007d55ca24de83b9dcbe54100d2c5f..10ff2dd96ce3ecd7be18caa4000200500983e129 100644 (file)
@@ -472,7 +472,7 @@ void RegExpMacroAssemblerARM::CheckNotCharacterAfterMinusAnd(
     uc16 minus,
     uc16 mask,
     Label* on_not_equal) {
-  ASSERT(minus < String::kMaxUC16CharCode);
+  ASSERT(minus < String::kMaxUtf16CodeUnit);
   __ sub(r0, current_character(), Operand(minus));
   __ and_(r0, r0, Operand(mask));
   __ cmp(r0, Operand(c));
index c30afa85db0852534b2776d7bae006483c8fa5e7..511663d8eeaab57c45639746e93c9ca1b3b6d049 100644 (file)
@@ -372,8 +372,11 @@ bool DebuggerAgentUtil::SendMessage(const Socket* conn,
 
   // Calculate the message size in UTF-8 encoding.
   int utf8_len = 0;
+  int previous = unibrow::Utf16::kNoPreviousCharacter;
   for (int i = 0; i < message.length(); i++) {
-    utf8_len += unibrow::Utf8::Length(message[i]);
+    uint16_t character = message[i];
+    utf8_len += unibrow::Utf8::Length(character, previous);
+    previous = character;
   }
 
   // Send the header.
@@ -388,17 +391,40 @@ bool DebuggerAgentUtil::SendMessage(const Socket* conn,
 
   // Send message body as UTF-8.
   int buffer_position = 0;  // Current buffer position.
+  previous = unibrow::Utf16::kNoPreviousCharacter;
   for (int i = 0; i < message.length(); i++) {
     // Write next UTF-8 encoded character to buffer.
+    uint16_t character = message[i];
     buffer_position +=
-        unibrow::Utf8::Encode(buffer + buffer_position, message[i]);
+        unibrow::Utf8::Encode(buffer + buffer_position, character, previous);
     ASSERT(buffer_position < kBufferSize);
 
     // Send buffer if full or last character is encoded.
-    if (kBufferSize - buffer_position < 3 || i == message.length() - 1) {
-      conn->Send(buffer, buffer_position);
-      buffer_position = 0;
+    const bool is_last_character = (i == message.length() - 1);
+    if (kBufferSize - buffer_position <
+          unibrow::Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit ||
+        is_last_character) {
+      if (unibrow::Utf16::IsLeadSurrogate(character) && !is_last_character) {
+        // Hold the encoded lead surrogate back: when a trail surrogate
+        // follows, Encode() steps back over these bytes to emit the combined
+        // sequence, so they must still be in the buffer.  A lead surrogate
+        // that ends the message has no successor and is flushed as is.
+        const int kEncodedSurrogateLength =
+            unibrow::Utf16::kUtf8BytesToCodeASurrogate;
+        ASSERT(buffer_position >= kEncodedSurrogateLength);
+        const int flushed_length = buffer_position - kEncodedSurrogateLength;
+        conn->Send(buffer, flushed_length);
+        // Move the held-back bytes from the end of the buffer to its start.
+        for (int j = 0; j < kEncodedSurrogateLength; j++) {
+          buffer[j] = buffer[flushed_length + j];
+        }
+        buffer_position = kEncodedSurrogateLength;
+      } else {
+        conn->Send(buffer, buffer_position);
+        buffer_position = 0;
+      }
     }
+    previous = character;
   }
 
   return true;
index e53cc81d6c2e58249b91bdb319e8dafa6961a8dd..25d4ffe89bd98ec68d070cd00269941b41609f17 100644 (file)
@@ -267,8 +267,9 @@ const int kBinary32ExponentShift = 23;
 // other bits set.
 const uint64_t kQuietNaNMask = static_cast<uint64_t>(0xfff) << 51;
 
-// ASCII/UC16 constants
+// ASCII/UTF-16 constants
 // Code-point values in Unicode 4.0 are 21 bits wide.
+// Code units in UTF-16 are 16 bits wide.
 typedef uint16_t uc16;
 typedef int32_t uc32;
 const int kASCIISize    = kCharSize;
index 1bb258e475155d5200ce70c84e9ab8fa30ea1436..416ecbd211c57c15f9219a7e909478d3a8963a0f 100644 (file)
@@ -800,4 +800,162 @@ Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,
 }
 
 
+// This method determines the type of string involved and then gets the UTF8
+// length of the string.  It doesn't flatten the string and has log(n) recursion
+// for a string of length n.  If the failure flag gets set, then we have to
+// flatten the string and retry.  Failures are caused by surrogate pairs in deep
+// cons strings.
+
+// Single surrogate characters that are encountered in the UTF-16 character
+// sequence of the input string get counted as 3 UTF-8 bytes, because that
+// is the way that WriteUtf8 will encode them.  Surrogate pairs are counted and
+// encoded as one 4-byte UTF-8 sequence.
+
+// This function conceptually uses recursion on the two halves of cons strings.
+// However, in order to avoid the recursion going too deep it recurses on the
+// second string of the cons, but iterates on the first substring (by manually
+// eliminating it as a tail recursion).  This means it counts the UTF-8 length
+// from the end to the start, which makes no difference to the total.
+
+// Surrogate pairs are recognized even if they are split across two sides of a
+// cons, which complicates the implementation somewhat.  Therefore, too deep
+// recursion cannot always be avoided.  This case is detected, and the failure
+// flag is set, a signal to the caller that the string should be flattened and
+// the operation retried.
+int Utf8LengthHelper(String* input,
+                     int from,  // First code unit to count (inclusive).
+                     int to,  // End of the counted range (exclusive).
+                     bool followed_by_surrogate,  // Trail surrogate follows.
+                     int max_recursion,  // Remaining recursion budget.
+                     bool* failure,  // Out: set when the budget runs out.
+                     bool* starts_with_surrogate) {  // Out: begins with trail.
+  if (from == to) return 0;
+  int total = 0;
+  bool dummy;  // Sink for the out flag once the start of the range is known.
+  while (true) {
+    if (input->IsAsciiRepresentation()) {
+      *starts_with_surrogate = false;
+      return total + to - from;
+    }
+    switch (StringShape(input).representation_tag()) {
+      case kConsStringTag: {
+        ConsString* str = ConsString::cast(input);
+        String* first = str->first();
+        String* second = str->second();
+        int first_length = first->length();
+        if (first_length - from > to - first_length) {
+          if (first_length < to) {
+            // Right hand side is shorter.  No need to check the recursion depth
+            // since this can only happen log(n) times.
+            bool right_starts_with_surrogate = false;
+            total += Utf8LengthHelper(second,
+                                      0,
+                                      to - first_length,
+                                      followed_by_surrogate,
+                                      max_recursion - 1,
+                                      failure,
+                                      &right_starts_with_surrogate);
+            if (*failure) return 0;
+            followed_by_surrogate = right_starts_with_surrogate;
+            input = first;
+            to = first_length;
+          } else {
+            // We only need the left hand side.
+            input = first;
+          }
+        } else {
+          if (first_length > from) {
+            // Left hand side is shorter.
+            if (first->IsAsciiRepresentation()) {
+              total += first_length - from;
+              *starts_with_surrogate = false;
+              starts_with_surrogate = &dummy;
+              input = second;
+              from = 0;
+              to -= first_length;
+            } else if (second->IsAsciiRepresentation()) {
+              followed_by_surrogate = false;
+              total += to - first_length;
+              input = first;
+              to = first_length;
+            } else if (max_recursion > 0) {
+              bool right_starts_with_surrogate = false;
+              // Recursing on the long one.  This may fail.
+              total += Utf8LengthHelper(second,
+                                        0,
+                                        to - first_length,
+                                        followed_by_surrogate,
+                                        max_recursion - 1,
+                                        failure,
+                                        &right_starts_with_surrogate);
+              if (*failure) return 0;
+              input = first;
+              to = first_length;
+              followed_by_surrogate = right_starts_with_surrogate;
+            } else {
+              *failure = true;
+              return 0;
+            }
+          } else {
+            // We only need the right hand side.
+            input = second;
+            from -= first_length;  // Rebase the start index onto second.
+            to -= first_length;
+          }
+        }
+        continue;
+      }
+      case kExternalStringTag:
+      case kSeqStringTag: {
+        Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
+        const uc16* p = vector.start();
+        int previous = unibrow::Utf16::kNoPreviousCharacter;
+        for (int i = from; i < to; i++) {
+          uc16 c = p[i];
+          total += unibrow::Utf8::Length(c, previous);
+          previous = c;
+        }
+        if (to - from > 0) {
+          if (unibrow::Utf16::IsLeadSurrogate(previous) &&
+              followed_by_surrogate) {
+            total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+          }
+          if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
+            *starts_with_surrogate = true;
+          }
+        }
+        return total;
+      }
+      case kSlicedStringTag: {
+        SlicedString* str = SlicedString::cast(input);
+        int offset = str->offset();
+        input = str->parent();
+        from += offset;
+        to += offset;
+        continue;
+      }
+      default:
+        break;
+    }
+    UNREACHABLE();
+    return 0;
+  }
+  return 0;
+}
+
+
+// Returns the UTF-8 length of str; flattens and retries if the helper fails.
+int Utf8Length(Handle<String> str) {
+  const int kRecursionBudget = 100;
+  for (;;) {
+    bool failed = false;
+    bool ignored_starts_with_surrogate;
+    const int utf8_length = Utf8LengthHelper(*str, 0, str->length(), false,
+                                             kRecursionBudget, &failed,
+                                             &ignored_starts_with_surrogate);
+    if (!failed) return utf8_length;
+    FlattenString(str);
+  }
+}
+
 } }  // namespace v8::internal
index 42089134e4b816a2f2829407f86a303b9ca0cd39..960696b5fb81f705f2128b5586fef44ca0c8e7fb 100644 (file)
@@ -174,6 +174,8 @@ void FlattenString(Handle<String> str);
 // string.
 Handle<String> FlattenGetString(Handle<String> str);
 
+int Utf8Length(Handle<String> str);
+
 Handle<Object> SetProperty(Handle<Object> object,
                            Handle<Object> key,
                            Handle<Object> value,
index da98239db1ead1a20c1bedc4dabdb82bf705858a..65763bb8dc0e3c001fc2cca6f3dd1492f3bf083a 100644 (file)
@@ -4186,8 +4186,6 @@ MaybeObject* Heap::AllocateStringFromAscii(Vector<const char> string,
 
 MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
                                               PretenureFlag pretenure) {
-  // V8 only supports characters in the Basic Multilingual Plane.
-  const uc32 kMaxSupportedChar = 0xFFFF;
   // Count the number of characters in the UTF-8 string and check if
   // it is an ASCII string.
   Access<UnicodeCache::Utf8Decoder>
@@ -4195,8 +4193,12 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
   decoder->Reset(string.start(), string.length());
   int chars = 0;
   while (decoder->has_more()) {
-    decoder->GetNext();
-    chars++;
+    uint32_t r = decoder->GetNext();
+    if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      chars++;
+    } else {
+      chars += 2;
+    }
   }
 
   Object* result;
@@ -4207,10 +4209,15 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
   // Convert and copy the characters into the new object.
   String* string_result = String::cast(result);
   decoder->Reset(string.start(), string.length());
-  for (int i = 0; i < chars; i++) {
-    uc32 r = decoder->GetNext();
-    if (r > kMaxSupportedChar) { r = unibrow::Utf8::kBadChar; }
-    string_result->Set(i, r);
+  int i = 0;
+  while (i < chars) {
+    uint32_t r = decoder->GetNext();
+    if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      string_result->Set(i++, unibrow::Utf16::LeadSurrogate(r));
+      string_result->Set(i++, unibrow::Utf16::TrailSurrogate(r));
+    } else {
+      string_result->Set(i++, r);
+    }
   }
   return result;
 }
@@ -4267,7 +4274,7 @@ MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer,
                                           uint32_t hash_field) {
   ASSERT(chars >= 0);
   // Ensure the chars matches the number of characters in the buffer.
-  ASSERT(static_cast<unsigned>(chars) == buffer->Length());
+  ASSERT(static_cast<unsigned>(chars) == buffer->Utf16Length());
   // Determine whether the string is ASCII.
   bool is_ascii = true;
   while (buffer->has_more()) {
@@ -4313,8 +4320,15 @@ MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer,
   ASSERT_EQ(size, answer->Size());
 
   // Fill in the characters.
-  for (int i = 0; i < chars; i++) {
-    answer->Set(i, buffer->GetNext());
+  int i = 0;
+  while (i < chars) {
+    uint32_t character = buffer->GetNext();
+    if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      answer->Set(i++, unibrow::Utf16::LeadSurrogate(character));
+      answer->Set(i++, unibrow::Utf16::TrailSurrogate(character));
+    } else {
+      answer->Set(i++, character);
+    }
   }
   return answer;
 }
index 5733e51affcc643d1377c1012848ecf9f5761b1e..893c7081a8800d1850e9c322cb0a4487d7f2d389 100644 (file)
@@ -4284,7 +4284,7 @@ class HStringCharCodeAt: public HTemplateInstruction<3> {
   virtual bool DataEquals(HValue* other) { return true; }
 
   virtual Range* InferRange(Zone* zone) {
-    return new(zone) Range(0, String::kMaxUC16CharCode);
+    return new(zone) Range(0, String::kMaxUtf16CodeUnit);
   }
 };
 
index 2c9b60c8687723efea580b2cc042805821023340..04d6b62c80f390480be591035a5c7cf9ba042993 100644 (file)
@@ -523,7 +523,7 @@ void RegExpMacroAssemblerIA32::CheckNotCharacterAfterMinusAnd(
     uc16 minus,
     uc16 mask,
     Label* on_not_equal) {
-  ASSERT(minus < String::kMaxUC16CharCode);
+  ASSERT(minus < String::kMaxUtf16CodeUnit);
   __ lea(eax, Operand(current_character(), -minus));
   __ and_(eax, mask);
   __ cmp(eax, c);
index 7e695bb2214da259eec0e9c8a29d921556b448fd..8ccbae49ce4d3c8a5d90f4734ce17945b7c5f70b 100644 (file)
@@ -1444,7 +1444,7 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
   if (ascii) {
     char_mask = String::kMaxAsciiCharCode;
   } else {
-    char_mask = String::kMaxUC16CharCode;
+    char_mask = String::kMaxUtf16CodeUnit;
   }
   uc16 exor = c1 ^ c2;
   // Check whether exor has only one bit set.
@@ -1546,7 +1546,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
   if (ascii) {
     max_char = String::kMaxAsciiCharCode;
   } else {
-    max_char = String::kMaxUC16CharCode;
+    max_char = String::kMaxUtf16CodeUnit;
   }
 
   Label success;
@@ -1642,7 +1642,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
         macro_assembler->CheckCharacterLT(from, on_failure);
       }
     }
-    if (to != String::kMaxUC16CharCode) {
+    if (to != String::kMaxUtf16CodeUnit) {
       if (cc->is_negated()) {
         macro_assembler->CheckCharacterLT(to + 1, on_failure);
       } else {
@@ -1835,7 +1835,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {
   if (asc) {
     char_mask = String::kMaxAsciiCharCode;
   } else {
-    char_mask = String::kMaxUC16CharCode;
+    char_mask = String::kMaxUtf16CodeUnit;
   }
   mask_ = 0;
   value_ = 0;
@@ -1887,7 +1887,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
     if (compiler->ascii()) {
       char_mask = String::kMaxAsciiCharCode;
     } else {
-      char_mask = String::kMaxUC16CharCode;
+      char_mask = String::kMaxUtf16CodeUnit;
     }
     if ((mask & char_mask) == char_mask) need_mask = false;
     mask &= char_mask;
@@ -1939,7 +1939,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
   if (compiler->ascii()) {
     char_mask = String::kMaxAsciiCharCode;
   } else {
-    char_mask = String::kMaxUC16CharCode;
+    char_mask = String::kMaxUtf16CodeUnit;
   }
   for (int k = 0; k < elms_->length(); k++) {
     TextElement elm = elms_->at(k);
@@ -4079,7 +4079,7 @@ static void AddClassNegated(const uc16 *elmv,
                             int elmc,
                             ZoneList<CharacterRange>* ranges) {
   ASSERT(elmv[0] != 0x0000);
-  ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
+  ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
   uc16 last = 0x0000;
   for (int i = 0; i < elmc; i += 2) {
     ASSERT(last <= elmv[i] - 1);
@@ -4087,7 +4087,7 @@ static void AddClassNegated(const uc16 *elmv,
     ranges->Add(CharacterRange(last, elmv[i] - 1));
     last = elmv[i + 1] + 1;
   }
-  ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
+  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
 }
 
 
@@ -4633,8 +4633,8 @@ void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
     from = range.to();
     i++;
   }
-  if (from < String::kMaxUC16CharCode) {
-    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
+  if (from < String::kMaxUtf16CodeUnit) {
+    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
   }
 }
 
@@ -4797,7 +4797,7 @@ void DispatchTable::AddRange(CharacterRange full_range, int value) {
       entry->AddValue(value);
       // Bail out if the last interval ended at 0xFFFF since otherwise
       // adding 1 will wrap around to 0.
-      if (entry->to() == String::kMaxUC16CharCode)
+      if (entry->to() == String::kMaxUtf16CodeUnit)
         break;
       ASSERT(entry->to() + 1 > current.from());
       current.set_from(entry->to() + 1);
@@ -5117,7 +5117,7 @@ int TextNode::ComputeFirstCharacterSet(int budget) {
         int new_length = length + 1;
         if (length > 0) {
           if (ranges->at(0).from() == 0) new_length--;
-          if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
+          if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
             new_length--;
           }
         }
@@ -5207,14 +5207,14 @@ void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
     if (last < range.from())
       AddRange(CharacterRange(last, range.from() - 1));
     if (range.to() >= last) {
-      if (range.to() == String::kMaxUC16CharCode) {
+      if (range.to() == String::kMaxUtf16CodeUnit) {
         return;
       } else {
         last = range.to() + 1;
       }
     }
   }
-  AddRange(CharacterRange(last, String::kMaxUC16CharCode));
+  AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
 }
 
 
index 39797190712456ceea38f25c9f4ef121f6687107..e023d8864420d9d1aeb872958cdbf28245b513a3 100644 (file)
@@ -461,18 +461,20 @@ class Logger::NameBuffer {
       utf8_pos_ += utf8_length;
       return;
     }
-    int uc16_length = Min(str->length(), kUc16BufferSize);
-    String::WriteToFlat(str, uc16_buffer_, 0, uc16_length);
+    int uc16_length = Min(str->length(), kUtf16BufferSize);
+    String::WriteToFlat(str, utf16_buffer, 0, uc16_length);
+    int previous = unibrow::Utf16::kNoPreviousCharacter;
     for (int i = 0; i < uc16_length && utf8_pos_ < kUtf8BufferSize; ++i) {
-      uc16 c = uc16_buffer_[i];
+      uc16 c = utf16_buffer[i];
       if (c <= String::kMaxAsciiCharCodeU) {
         utf8_buffer_[utf8_pos_++] = static_cast<char>(c);
       } else {
-        int char_length = unibrow::Utf8::Length(c);
+        int char_length = unibrow::Utf8::Length(c, previous);
         if (utf8_pos_ + char_length > kUtf8BufferSize) break;
-        unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c);
+        unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c, previous);
         utf8_pos_ += char_length;
       }
+      previous = c;
     }
   }
 
@@ -504,11 +506,11 @@ class Logger::NameBuffer {
 
  private:
   static const int kUtf8BufferSize = 512;
-  static const int kUc16BufferSize = 128;
+  static const int kUtf16BufferSize = 128;
 
   int utf8_pos_;
   char utf8_buffer_[kUtf8BufferSize];
-  uc16 uc16_buffer_[kUc16BufferSize];
+  uc16 utf16_buffer[kUtf16BufferSize];
 };
 
 
index d0e9bf82bacb50161dcf9f98a63f8e3757ee7eed..ed0c19f3caca0a14bada4ee9f52c96e29fed30f5 100644 (file)
@@ -4463,7 +4463,11 @@ bool StringHasher::has_trivial_hash() {
 }
 
 
-void StringHasher::AddCharacter(uc32 c) {
+void StringHasher::AddCharacter(uint32_t c) {
+  if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+    AddSurrogatePair(c);  // Not inlined.
+    return;
+  }
   // Use the Jenkins one-at-a-time hash function to update the hash
   // for the given character.
   raw_running_hash_ += c;
@@ -4492,8 +4496,12 @@ void StringHasher::AddCharacter(uc32 c) {
 }
 
 
-void StringHasher::AddCharacterNoIndex(uc32 c) {
+void StringHasher::AddCharacterNoIndex(uint32_t c) {
   ASSERT(!is_array_index());
+  if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+    AddSurrogatePairNoIndex(c);  // Not inlined.
+    return;
+  }
   raw_running_hash_ += c;
   raw_running_hash_ += (raw_running_hash_ << 10);
   raw_running_hash_ ^= (raw_running_hash_ >> 6);
index e0a95372b48c7ebfb5755a32243d7bc9999adc7f..7865ac0715aba77b0f0ff84641095c6d341af492 100644 (file)
@@ -6051,9 +6051,11 @@ SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
   buffer->Reset(offset, this);
   int character_position = offset;
   int utf8_bytes = 0;
+  int last = unibrow::Utf16::kNoPreviousCharacter;
   while (buffer->has_more() && character_position++ < offset + length) {
     uint16_t character = buffer->GetNext();
-    utf8_bytes += unibrow::Utf8::Length(character);
+    utf8_bytes += unibrow::Utf8::Length(character, last);
+    last = character;
   }
 
   if (length_return) {
@@ -6067,13 +6069,15 @@ SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
   buffer->Seek(offset);
   character_position = offset;
   int utf8_byte_position = 0;
+  last = unibrow::Utf16::kNoPreviousCharacter;
   while (buffer->has_more() && character_position++ < offset + length) {
     uint16_t character = buffer->GetNext();
     if (allow_nulls == DISALLOW_NULLS && character == 0) {
       character = ' ';
     }
     utf8_byte_position +=
-        unibrow::Utf8::Encode(result + utf8_byte_position, character);
+        unibrow::Utf8::Encode(result + utf8_byte_position, character, last);
+    last = character;
   }
   result[utf8_byte_position] = 0;
   return SmartArrayPointer<char>(result);
@@ -6387,73 +6391,6 @@ const unibrow::byte* String::ReadBlock(String* input,
 }
 
 
-// This method determines the type of string involved and then gets the UTF8
-// length of the string.  It doesn't flatten the string and has log(n) recursion
-// for a string of length n.
-int String::Utf8Length(String* input, int from, int to) {
-  if (from == to) return 0;
-  int total = 0;
-  while (true) {
-    if (input->IsAsciiRepresentation()) return total + to - from;
-    switch (StringShape(input).representation_tag()) {
-      case kConsStringTag: {
-        ConsString* str = ConsString::cast(input);
-        String* first = str->first();
-        String* second = str->second();
-        int first_length = first->length();
-        if (first_length - from < to - first_length) {
-          if (first_length > from) {
-            // Left hand side is shorter.
-            total += Utf8Length(first, from, first_length);
-            input = second;
-            from = 0;
-            to -= first_length;
-          } else {
-            // We only need the right hand side.
-            input = second;
-            from -= first_length;
-            to -= first_length;
-          }
-        } else {
-          if (first_length <= to) {
-            // Right hand side is shorter.
-            total += Utf8Length(second, 0, to - first_length);
-            input = first;
-            to = first_length;
-          } else {
-            // We only need the left hand side.
-            input = first;
-          }
-        }
-        continue;
-      }
-      case kExternalStringTag:
-      case kSeqStringTag: {
-        Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
-        const uc16* p = vector.start();
-        for (int i = from; i < to; i++) {
-          total += unibrow::Utf8::Length(p[i]);
-        }
-        return total;
-      }
-      case kSlicedStringTag: {
-        SlicedString* str = SlicedString::cast(input);
-        int offset = str->offset();
-        input = str->parent();
-        from += offset;
-        to += offset;
-        continue;
-      }
-      default:
-        break;
-    }
-    UNREACHABLE();
-    return 0;
-  }
-  return 0;
-}
-
-
 void Relocatable::PostGarbageCollectionProcessing() {
   Isolate* isolate = Isolate::Current();
   Relocatable* current = isolate->relocatable_top();
@@ -6847,8 +6784,10 @@ static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {
   // General slow case check.  We know that the ia and ib iterators
   // have the same length.
   while (ia->has_more()) {
-    uc32 ca = ia->GetNext();
-    uc32 cb = ib->GetNext();
+    uint32_t ca = ia->GetNext();
+    uint32_t cb = ib->GetNext();
+    ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);
+    ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);
     if (ca != cb)
       return false;
   }
@@ -7031,8 +6970,14 @@ bool String::IsEqualTo(Vector<const char> str) {
   decoder->Reset(str.start(), str.length());
   int i;
   for (i = 0; i < slen && decoder->has_more(); i++) {
-    uc32 r = decoder->GetNext();
-    if (Get(i) != r) return false;
+    uint32_t r = decoder->GetNext();
+    if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      if (i > slen - 1) return false;
+      if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
+      if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
+    } else {
+      if (Get(i) != r) return false;
+    }
   }
   return i == slen && !decoder->has_more();
 }
@@ -7162,6 +7107,22 @@ uint32_t StringHasher::MakeArrayIndexHash(uint32_t value, int length) {
 }
 
 
+void StringHasher::AddSurrogatePair(uc32 c) {
+  // Characters above 0xffff are hashed as the two UTF-16 code units that
+  // encode them: first the lead surrogate, then the trail surrogate.
+  AddCharacter(static_cast<uint16_t>(unibrow::Utf16::LeadSurrogate(c)));
+  AddCharacter(static_cast<uint16_t>(unibrow::Utf16::TrailSurrogate(c)));
+}
+
+
+void StringHasher::AddSurrogatePairNoIndex(uc32 c) {
+  // Same as AddSurrogatePair, but feeds the lead and trail code units to the
+  // variant that does not update the array index calculation.
+  AddCharacterNoIndex(static_cast<uint16_t>(unibrow::Utf16::LeadSurrogate(c)));
+  AddCharacterNoIndex(static_cast<uint16_t>(unibrow::Utf16::TrailSurrogate(c)));
+}
+
+
 uint32_t StringHasher::GetHashField() {
   ASSERT(is_valid());
   if (length_ <= String::kMaxHashCalcLength) {
@@ -10655,7 +10616,7 @@ class Utf8SymbolKey : public HashTableKey {
     if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
     unibrow::Utf8InputBuffer<> buffer(string_.start(),
                                       static_cast<unsigned>(string_.length()));
-    chars_ = buffer.Length();
+    chars_ = buffer.Utf16Length();
     hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);
     uint32_t result = hash_field_ >> String::kHashShift;
     ASSERT(result != 0);  // Ensure that the hash value of 0 is never computed.
index 7906d14fa31dbb928a9242861aa17a1f2ff4f113..548dbcf48b0d6387c069bf6f29cf7eb6779abbb8 100644 (file)
@@ -6616,12 +6616,17 @@ class StringHasher {
   inline bool has_trivial_hash();
 
   // Add a character to the hash and update the array index calculation.
-  inline void AddCharacter(uc32 c);
+  inline void AddCharacter(uint32_t c);
 
   // Adds a character to the hash but does not update the array index
   // calculation.  This can only be called when it has been verified
   // that the input is not an array index.
-  inline void AddCharacterNoIndex(uc32 c);
+  inline void AddCharacterNoIndex(uint32_t c);
+
+  // Add a character above 0xffff as a surrogate pair.  These can get into
+  // the hasher through the routines that take a UTF-8 string and make a symbol.
+  void AddSurrogatePair(uc32 c);
+  void AddSurrogatePairNoIndex(uc32 c);
 
   // Returns the value to store in the hash field of a string with
   // the given length and contents.
@@ -6871,9 +6876,6 @@ class String: public HeapObject {
       RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
       int* length_output = 0);
 
-  inline int Utf8Length() { return Utf8Length(this, 0, length()); }
-  static int Utf8Length(String* input, int from, int to);
-
   // Return a 16 bit Unicode representation of the string.
   // The string should be nearly flat, otherwise the performance of
   // of this method may be very bad.  Setting robustness_flag to
@@ -6939,7 +6941,7 @@ class String: public HeapObject {
   // Max ASCII char code.
   static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar;
   static const unsigned kMaxAsciiCharCodeU = unibrow::Utf8::kMaxOneByteChar;
-  static const int kMaxUC16CharCode = 0xffff;
+  static const int kMaxUtf16CodeUnit = 0xffff;
 
   // Mask constant for checking if a string has a computed hash code
   // and if it is an array index.  The least significant bit indicates
index ca8cbb9029c5b0f6f930453efcf6ac0827152aad..90dd6a7c43ab728613008dd4c9af5014aad2c69b 100644 (file)
@@ -258,7 +258,7 @@ Handle<String> Parser::LookupSymbol(int symbol_id) {
           scanner().literal_ascii_string());
     } else {
       return isolate()->factory()->LookupTwoByteSymbol(
-          scanner().literal_uc16_string());
+          scanner().literal_utf16_string());
     }
   }
   return LookupCachedSymbol(symbol_id);
@@ -279,7 +279,7 @@ Handle<String> Parser::LookupCachedSymbol(int symbol_id) {
           scanner().literal_ascii_string());
     } else {
       result = isolate()->factory()->LookupTwoByteSymbol(
-          scanner().literal_uc16_string());
+          scanner().literal_utf16_string());
     }
     symbol_cache_.at(symbol_id) = result;
     return result;
@@ -576,12 +576,12 @@ FunctionLiteral* Parser::ParseProgram(CompilationInfo* info) {
     // Notice that the stream is destroyed at the end of the branch block.
     // The last line of the blocks can't be moved outside, even though they're
     // identical calls.
-    ExternalTwoByteStringUC16CharacterStream stream(
+    ExternalTwoByteStringUtf16CharacterStream stream(
         Handle<ExternalTwoByteString>::cast(source), 0, source->length());
     scanner_.Initialize(&stream);
     return DoParseProgram(info, source, &zone_scope);
   } else {
-    GenericStringUC16CharacterStream stream(source, 0, source->length());
+    GenericStringUtf16CharacterStream stream(source, 0, source->length());
     scanner_.Initialize(&stream);
     return DoParseProgram(info, source, &zone_scope);
   }
@@ -665,16 +665,16 @@ FunctionLiteral* Parser::ParseLazy(CompilationInfo* info) {
   // Initialize parser state.
   source->TryFlatten();
   if (source->IsExternalTwoByteString()) {
-    ExternalTwoByteStringUC16CharacterStream stream(
+    ExternalTwoByteStringUtf16CharacterStream stream(
         Handle<ExternalTwoByteString>::cast(source),
         shared_info->start_position(),
         shared_info->end_position());
     FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
     return result;
   } else {
-    GenericStringUC16CharacterStream stream(source,
-                                            shared_info->start_position(),
-                                            shared_info->end_position());
+    GenericStringUtf16CharacterStream stream(source,
+                                             shared_info->start_position(),
+                                             shared_info->end_position());
     FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
     return result;
   }
@@ -682,7 +682,7 @@ FunctionLiteral* Parser::ParseLazy(CompilationInfo* info) {
 
 
 FunctionLiteral* Parser::ParseLazy(CompilationInfo* info,
-                                   UC16CharacterStream* source,
+                                   Utf16CharacterStream* source,
                                    ZoneScope* zone_scope) {
   Handle<SharedFunctionInfo> shared_info = info->shared_info();
   scanner_.Initialize(source);
@@ -4285,7 +4285,7 @@ class SingletonLogger : public ParserRecorder {
 
   // Logs a symbol creation of a literal or identifier.
   virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
 
   // Logs an error message and marks the log as containing an error.
   // Further logging will be ignored, and ExtractData will return a vector
@@ -5874,7 +5874,7 @@ int ScriptDataImpl::ReadNumber(byte** source) {
 
 
 // Create a Scanner for the preparser to use as input, and preparse the source.
-static ScriptDataImpl* DoPreParse(UC16CharacterStream* source,
+static ScriptDataImpl* DoPreParse(Utf16CharacterStream* source,
                                   int flags,
                                   ParserRecorder* recorder) {
   Isolate* isolate = Isolate::Current();
@@ -5915,17 +5915,17 @@ ScriptDataImpl* ParserApi::PartialPreParse(Handle<String> source,
   PartialParserRecorder recorder;
   int source_length = source->length();
   if (source->IsExternalTwoByteString()) {
-    ExternalTwoByteStringUC16CharacterStream stream(
+    ExternalTwoByteStringUtf16CharacterStream stream(
         Handle<ExternalTwoByteString>::cast(source), 0, source_length);
     return DoPreParse(&stream, flags, &recorder);
   } else {
-    GenericStringUC16CharacterStream stream(source, 0, source_length);
+    GenericStringUtf16CharacterStream stream(source, 0, source_length);
     return DoPreParse(&stream, flags, &recorder);
   }
 }
 
 
-ScriptDataImpl* ParserApi::PreParse(UC16CharacterStream* source,
+ScriptDataImpl* ParserApi::PreParse(Utf16CharacterStream* source,
                                     v8::Extension* extension,
                                     int flags) {
   Handle<Script> no_script;
index 90ef39983e19410a5e7e697c68f440d030ba08c9..227344f32019aa89d3631baa0aba97bdc44678e5 100644 (file)
@@ -172,7 +172,7 @@ class ParserApi {
   static bool Parse(CompilationInfo* info, int flags);
 
   // Generic preparser generating full preparse data.
-  static ScriptDataImpl* PreParse(UC16CharacterStream* source,
+  static ScriptDataImpl* PreParse(Utf16CharacterStream* source,
                                   v8::Extension* extension,
                                   int flags);
 
@@ -542,7 +542,7 @@ class Parser {
 
 
   FunctionLiteral* ParseLazy(CompilationInfo* info,
-                             UC16CharacterStream* source,
+                             Utf16CharacterStream* source,
                              ZoneScope* zone_scope);
 
   Isolate* isolate() { return isolate_; }
@@ -712,7 +712,7 @@ class Parser {
           scanner().literal_ascii_string(), tenured);
     } else {
       return isolate_->factory()->NewStringFromTwoByte(
-            scanner().literal_uc16_string(), tenured);
+            scanner().literal_utf16_string(), tenured);
     }
   }
 
@@ -722,7 +722,7 @@ class Parser {
           scanner().next_literal_ascii_string(), tenured);
     } else {
       return isolate_->factory()->NewStringFromTwoByte(
-          scanner().next_literal_uc16_string(), tenured);
+          scanner().next_literal_utf16_string(), tenured);
     }
   }
 
index c77a47a10cb8ce3cfa985ad290a3c452f12cbb3d..f347430208ca14239798fc1e5c0814df1aa6a0b4 100644 (file)
@@ -53,7 +53,7 @@ class ParserRecorder {
 
   // Logs a symbol creation of a literal or identifier.
   virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
 
   // Logs an error message and marks the log as containing an error.
   // Further logging will be ignored, and ExtractData will return a vector
@@ -149,7 +149,7 @@ class PartialParserRecorder : public FunctionLoggingParserRecorder {
  public:
   PartialParserRecorder() : FunctionLoggingParserRecorder() { }
   virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
   virtual ~PartialParserRecorder() { }
   virtual Vector<unsigned> ExtractData();
   virtual int symbol_position() { return 0; }
@@ -171,7 +171,7 @@ class CompleteParserRecorder: public FunctionLoggingParserRecorder {
     LogSymbol(start, hash, true, Vector<const byte>::cast(literal));
   }
 
-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) {
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) {
     if (!is_recording_) return;
     int hash = vector_hash(literal);
     LogSymbol(start, hash, false, Vector<const byte>::cast(literal));
index 1bca9a3333227058ca897ffb533bbd4e9df8ef01..6e8556aa14b4b9309e2a091d7540f872e1e012ba 100644 (file)
@@ -46,10 +46,10 @@ namespace v8 {
 namespace internal {
 
 // UTF16Buffer based on a v8::UnicodeInputStream.
-class InputStreamUTF16Buffer : public UC16CharacterStream {
+class InputStreamUtf16Buffer : public Utf16CharacterStream {
  public:
-  /* The InputStreamUTF16Buffer maintains an internal buffer
-   * that is filled in chunks from the UC16CharacterStream.
+  /* The InputStreamUtf16Buffer maintains an internal buffer
+   * that is filled in chunks from the Utf16CharacterStream.
    * It also maintains unlimited pushback capability, but optimized
    * for small pushbacks.
    * The pushback_buffer_ pointer points to the limit of pushbacks
@@ -60,8 +60,8 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
    * new buffer. When this buffer is read to the end again, the cursor is
    * switched back to the internal buffer
    */
-  explicit InputStreamUTF16Buffer(v8::UnicodeInputStream* stream)
-      : UC16CharacterStream(),
+  explicit InputStreamUtf16Buffer(v8::UnicodeInputStream* stream)
+      : Utf16CharacterStream(),
         stream_(stream),
         pushback_buffer_(buffer_),
         pushback_buffer_end_cache_(NULL),
@@ -70,7 +70,7 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
     buffer_cursor_ = buffer_end_ = buffer_ + kPushBackSize;
   }
 
-  virtual ~InputStreamUTF16Buffer() {
+  virtual ~InputStreamUtf16Buffer() {
     if (pushback_buffer_backing_ != NULL) {
       DeleteArray(pushback_buffer_backing_);
     }
@@ -127,12 +127,18 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
     uc16* buffer_start = buffer_ + kPushBackSize;
     buffer_cursor_ = buffer_end_ = buffer_start;
     while ((value = stream_->Next()) >= 0) {
-      if (value > static_cast<int32_t>(unibrow::Utf8::kMaxThreeByteChar)) {
-        value = unibrow::Utf8::kBadChar;
+      if (value >
+          static_cast<int32_t>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
+        buffer_start[buffer_end_++ - buffer_start] =
+            unibrow::Utf16::LeadSurrogate(value);
+        buffer_start[buffer_end_++ - buffer_start] =
+            unibrow::Utf16::TrailSurrogate(value);
+      } else {
+        // buffer_end_ is a const pointer, but buffer_ is writable.
+        buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
       }
-      // buffer_end_ is a const pointer, but buffer_ is writable.
-      buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
-      if (buffer_end_ == buffer_ + kPushBackSize + kBufferSize) break;
+      // Stop when fewer than two slots remain in case we get a surrogate pair.
+      if (buffer_end_ >= buffer_ + kPushBackSize + kBufferSize - 1) break;
     }
     return buffer_end_ > buffer_start;
   }
@@ -179,7 +185,7 @@ UnicodeInputStream::~UnicodeInputStream() { }
 
 
 PreParserData Preparse(UnicodeInputStream* input, size_t max_stack) {
-  internal::InputStreamUTF16Buffer buffer(input);
+  internal::InputStreamUtf16Buffer buffer(input);
   uintptr_t stack_limit = reinterpret_cast<uintptr_t>(&buffer) - max_stack;
   internal::UnicodeCache unicode_cache;
   internal::Scanner scanner(&unicode_cache);
index b36f4faca423e7254d5373d1c7f0d15e73ad6280..20d3b9c59c40916ea5eabad11fc8dcd8bf5fae65 100644 (file)
@@ -1214,7 +1214,7 @@ void PreParser::CheckDuplicate(DuplicateFinder* finder,
     old_type = finder->AddAsciiSymbol(scanner_->literal_ascii_string(),
                                       type);
   } else {
-    old_type = finder->AddUC16Symbol(scanner_->literal_uc16_string(), type);
+    old_type = finder->AddUtf16Symbol(scanner_->literal_utf16_string(), type);
   }
   if (HasConflict(old_type, type)) {
     if (IsDataDataConflict(old_type, type)) {
@@ -1387,7 +1387,7 @@ PreParser::Expression PreParser::ParseFunctionLiteral(bool* ok) {
           duplicate_finder.AddAsciiSymbol(scanner_->literal_ascii_string(), 1);
     } else {
       prev_value =
-          duplicate_finder.AddUC16Symbol(scanner_->literal_uc16_string(), 1);
+          duplicate_finder.AddUtf16Symbol(scanner_->literal_utf16_string(), 1);
     }
 
     if (prev_value != 0) {
@@ -1485,7 +1485,7 @@ void PreParser::LogSymbol() {
   if (scanner_->is_literal_ascii()) {
     log_->LogAsciiSymbol(identifier_pos, scanner_->literal_ascii_string());
   } else {
-    log_->LogUC16Symbol(identifier_pos, scanner_->literal_uc16_string());
+    log_->LogUtf16Symbol(identifier_pos, scanner_->literal_utf16_string());
   }
 }
 
@@ -1657,7 +1657,7 @@ int DuplicateFinder::AddAsciiSymbol(i::Vector<const char> key, int value) {
   return AddSymbol(i::Vector<const byte>::cast(key), true, value);
 }
 
-int DuplicateFinder::AddUC16Symbol(i::Vector<const uint16_t> key, int value) {
+int DuplicateFinder::AddUtf16Symbol(i::Vector<const uint16_t> key, int value) {
   return AddSymbol(i::Vector<const byte>::cast(key), false, value);
 }
 
index 1455561bbdde5d40e7e0e88048207f11d2bcfd58..f3a43475dfe96bb3d3fd78abfecbf42862883b3c 100644 (file)
@@ -65,7 +65,7 @@ class DuplicateFinder {
         map_(&Match) { }
 
   int AddAsciiSymbol(i::Vector<const char> key, int value);
-  int AddUC16Symbol(i::Vector<const uint16_t> key, int value);
+  int AddUtf16Symbol(i::Vector<const uint16_t> key, int value);
   // Add a a number literal by converting it (if necessary)
   // to the string that ToString(ToNumber(literal)) would generate.
   // and then adding that string with AddAsciiSymbol.
index ee10703c9ef82d20b835ea182f9b755d3295797f..56b9f03aa59f67d4c2bcc67a574f711070f0fb2e 100644 (file)
@@ -36,19 +36,19 @@ namespace v8 {
 namespace internal {
 
 // ----------------------------------------------------------------------------
-// BufferedUC16CharacterStreams
+// BufferedUtf16CharacterStreams
 
-BufferedUC16CharacterStream::BufferedUC16CharacterStream()
-    : UC16CharacterStream(),
+BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
+    : Utf16CharacterStream(),
       pushback_limit_(NULL) {
   // Initialize buffer as being empty. First read will fill the buffer.
   buffer_cursor_ = buffer_;
   buffer_end_ = buffer_;
 }
 
-BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
+BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
 
-void BufferedUC16CharacterStream::PushBack(uc32 character) {
+void BufferedUtf16CharacterStream::PushBack(uc32 character) {
   if (character == kEndOfInput) {
     pos_--;
     return;
@@ -63,7 +63,7 @@ void BufferedUC16CharacterStream::PushBack(uc32 character) {
 }
 
 
-void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
+void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
   // In pushback mode, the end of the buffer contains pushback,
   // and the start of the buffer (from buffer start to pushback_limit_)
   // contains valid data that comes just after the pushback.
@@ -89,7 +89,7 @@ void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
 }
 
 
-bool BufferedUC16CharacterStream::ReadBlock() {
+bool BufferedUtf16CharacterStream::ReadBlock() {
   buffer_cursor_ = buffer_;
   if (pushback_limit_ != NULL) {
     // Leave pushback mode.
@@ -106,7 +106,7 @@ bool BufferedUC16CharacterStream::ReadBlock() {
 }
 
 
-unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
+unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
   // Leave pushback mode (i.e., ignore that there might be valid data
   // in the buffer before the pushback_limit_ point).
   pushback_limit_ = NULL;
@@ -114,10 +114,10 @@ unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
 }
 
 // ----------------------------------------------------------------------------
-// GenericStringUC16CharacterStream
+// GenericStringUtf16CharacterStream
 
 
-GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
+GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
     Handle<String> data,
     unsigned start_position,
     unsigned end_position)
@@ -130,10 +130,10 @@ GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
 }
 
 
-GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
+GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
 
 
-unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
+unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
   unsigned old_pos = pos_;
   pos_ = Min(pos_ + delta, length_);
   ReadBlock();
@@ -141,7 +141,7 @@ unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
 }
 
 
-unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
+unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
                                                       unsigned length) {
   if (from_pos >= length_) return 0;
   if (from_pos + length > length_) {
@@ -153,10 +153,10 @@ unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
 
 
 // ----------------------------------------------------------------------------
-// Utf8ToUC16CharacterStream
-Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
-                                                     unsigned length)
-    : BufferedUC16CharacterStream(),
+// Utf8ToUtf16CharacterStream
+Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
+                                                       unsigned length)
+    : BufferedUtf16CharacterStream(),
       raw_data_(data),
       raw_data_length_(length),
       raw_data_pos_(0),
@@ -165,10 +165,10 @@ Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
 }
 
 
-Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
+Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
 
 
-unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
+unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
   unsigned old_pos = pos_;
   unsigned target_pos = pos_ + delta;
   SetRawPosition(target_pos);
@@ -178,9 +178,9 @@ unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
 }
 
 
-unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
-                                               unsigned length) {
-  static const unibrow::uchar kMaxUC16Character = 0xffff;
+unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
+                                                unsigned length) {
+  static const unibrow::uchar kMaxUtf16Character = 0xffff;
   SetRawPosition(char_position);
   if (raw_character_position_ != char_position) {
     // char_position was not a valid position in the stream (hit the end
@@ -188,7 +188,7 @@ unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
     return 0u;
   }
   unsigned i = 0;
-  while (i < length) {
+  while (i + 1 < length) {  // Room for a surrogate pair; no wrap if length == 0.
     if (raw_data_pos_ == raw_data_length_) break;
     unibrow::uchar c = raw_data_[raw_data_pos_];
     if (c <= unibrow::Utf8::kMaxOneByteChar) {
@@ -197,12 +197,13 @@ unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
       c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
                                          raw_data_length_ - raw_data_pos_,
                                          &raw_data_pos_);
-      // Don't allow characters outside of the BMP.
-      if (c > kMaxUC16Character) {
-        c = unibrow::Utf8::kBadChar;
-      }
     }
-    buffer_[i++] = static_cast<uc16>(c);
+    if (c > kMaxUtf16Character) {
+      buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
+      buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
+    } else {
+      buffer_[i++] = static_cast<uc16>(c);
+    }
   }
   raw_character_position_ = char_position + i;
   return i;
@@ -266,37 +267,52 @@ static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
 }
 
 
-void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
+// This can't set a raw position between the two halves of a surrogate pair,
+// since no position in the UTF-8 stream corresponds to that.  This assumes
+// that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
+// it is illegally coded as two 3 byte sequences then there is no problem here.
+void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
   if (raw_character_position_ > target_position) {
     // Spool backwards in utf8 buffer.
     do {
+      int old_pos = raw_data_pos_;
       Utf8CharacterBack(raw_data_, &raw_data_pos_);
       raw_character_position_--;
+      ASSERT(old_pos - raw_data_pos_ <= 4);
+      // Step back over both code units for surrogate pairs.
+      if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
     } while (raw_character_position_ > target_position);
+    // No surrogate pair splitting.
+    ASSERT(raw_character_position_ == target_position);
     return;
   }
   // Spool forwards in the utf8 buffer.
   while (raw_character_position_ < target_position) {
     if (raw_data_pos_ == raw_data_length_) return;
+    int old_pos = raw_data_pos_;
     Utf8CharacterForward(raw_data_, &raw_data_pos_);
     raw_character_position_++;
+    ASSERT(raw_data_pos_ - old_pos <= 4);
+    if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
   }
+  // No surrogate pair splitting.
+  ASSERT(raw_character_position_ == target_position);
 }
 
 
 // ----------------------------------------------------------------------------
-// ExternalTwoByteStringUC16CharacterStream
+// ExternalTwoByteStringUtf16CharacterStream
 
-ExternalTwoByteStringUC16CharacterStream::
-    ~ExternalTwoByteStringUC16CharacterStream() { }
+ExternalTwoByteStringUtf16CharacterStream::
+    ~ExternalTwoByteStringUtf16CharacterStream() { }
 
 
-ExternalTwoByteStringUC16CharacterStream
-    ::ExternalTwoByteStringUC16CharacterStream(
+ExternalTwoByteStringUtf16CharacterStream
+    ::ExternalTwoByteStringUtf16CharacterStream(
         Handle<ExternalTwoByteString> data,
         int start_position,
         int end_position)
-    : UC16CharacterStream(),
+    : Utf16CharacterStream(),
       source_(data),
       raw_data_(data->GetTwoByteData(start_position)) {
   buffer_cursor_ = raw_data_,
index 5c4ea2ca36d603f6e6d6688dcba223adf301d262..319ee8fc1c587f82487b8e402250914adb2dcee2 100644 (file)
@@ -36,10 +36,10 @@ namespace internal {
 // A buffered character stream based on a random access character
 // source (ReadBlock can be called with pos_ pointing to any position,
 // even positions before the current).
-class BufferedUC16CharacterStream: public UC16CharacterStream {
+class BufferedUtf16CharacterStream: public Utf16CharacterStream {
  public:
-  BufferedUC16CharacterStream();
-  virtual ~BufferedUC16CharacterStream();
+  BufferedUtf16CharacterStream();
+  virtual ~BufferedUtf16CharacterStream();
 
   virtual void PushBack(uc32 character);
 
@@ -60,12 +60,12 @@ class BufferedUC16CharacterStream: public UC16CharacterStream {
 
 
 // Generic string stream.
-class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
+class GenericStringUtf16CharacterStream: public BufferedUtf16CharacterStream {
  public:
-  GenericStringUC16CharacterStream(Handle<String> data,
-                                   unsigned start_position,
-                                   unsigned end_position);
-  virtual ~GenericStringUC16CharacterStream();
+  GenericStringUtf16CharacterStream(Handle<String> data,
+                                    unsigned start_position,
+                                    unsigned end_position);
+  virtual ~GenericStringUtf16CharacterStream();
 
  protected:
   virtual unsigned BufferSeekForward(unsigned delta);
@@ -77,11 +77,11 @@ class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
 };
 
 
-// UC16 stream based on a literal UTF-8 string.
-class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
+// Utf16 stream based on a literal UTF-8 string.
+class Utf8ToUtf16CharacterStream: public BufferedUtf16CharacterStream {
  public:
-  Utf8ToUC16CharacterStream(const byte* data, unsigned length);
-  virtual ~Utf8ToUC16CharacterStream();
+  Utf8ToUtf16CharacterStream(const byte* data, unsigned length);
+  virtual ~Utf8ToUtf16CharacterStream();
 
  protected:
   virtual unsigned BufferSeekForward(unsigned delta);
@@ -98,12 +98,12 @@ class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
 
 
 // UTF16 buffer to read characters from an external string.
-class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
+class ExternalTwoByteStringUtf16CharacterStream: public Utf16CharacterStream {
  public:
-  ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
-                                           int start_position,
-                                           int end_position);
-  virtual ~ExternalTwoByteStringUC16CharacterStream();
+  ExternalTwoByteStringUtf16CharacterStream(Handle<ExternalTwoByteString> data,
+                                            int start_position,
+                                            int end_position);
+  virtual ~ExternalTwoByteStringUtf16CharacterStream();
 
   virtual void PushBack(uc32 character) {
     ASSERT(buffer_cursor_ > raw_data_);
index 72768b381b4b085e5a0281093b44b76b92a38b31..7901b5d8264b85c07f6a1e8db724b851110c1085 100755 (executable)
@@ -45,7 +45,7 @@ Scanner::Scanner(UnicodeCache* unicode_cache)
       harmony_modules_(false) { }
 
 
-void Scanner::Initialize(UC16CharacterStream* source) {
+void Scanner::Initialize(Utf16CharacterStream* source) {
   source_ = source;
   // Need to capture identifiers in order to recognize "get" and "set"
   // in object literals.
index e892fe0c1fbfdcb13139ec708ef7b5072cb1925f..045e7d27a6c647ee963a6c859c47b72548f1edf7 100644 (file)
@@ -73,15 +73,17 @@ inline int HexValue(uc32 c) {
 
 
 // ---------------------------------------------------------------------
-// Buffered stream of characters, using an internal UC16 buffer.
+// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
+// A code unit is a 16 bit value representing either a 16 bit code point
+// or one part of a surrogate pair that make a single 21 bit code point.
 
-class UC16CharacterStream {
+class Utf16CharacterStream {
  public:
-  UC16CharacterStream() : pos_(0) { }
-  virtual ~UC16CharacterStream() { }
+  Utf16CharacterStream() : pos_(0) { }
+  virtual ~Utf16CharacterStream() { }
 
-  // Returns and advances past the next UC16 character in the input
-  // stream. If there are no more characters, it returns a negative
+  // Returns and advances past the next UTF-16 code unit in the input
+  // stream. If there are no more code units, it returns a negative
   // value.
   inline uc32 Advance() {
     if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
@@ -90,47 +92,47 @@ class UC16CharacterStream {
     }
     // Note: currently the following increment is necessary to avoid a
     // parser problem! The scanner treats the final kEndOfInput as
-    // a character with a position, and does math relative to that
+    // a code unit with a position, and does math relative to that
     // position.
     pos_++;
 
     return kEndOfInput;
   }
 
-  // Return the current position in the character stream.
+  // Return the current position in the code unit stream.
   // Starts at zero.
   inline unsigned pos() const { return pos_; }
 
-  // Skips forward past the next character_count UC16 characters
+  // Skips forward past the next code_unit_count UTF-16 code units
   // in the input, or until the end of input if that comes sooner.
-  // Returns the number of characters actually skipped. If less
-  // than character_count,
-  inline unsigned SeekForward(unsigned character_count) {
+  // Returns the number of code units actually skipped, which is less than
+  // code_unit_count only if the end of input was reached first.
+  inline unsigned SeekForward(unsigned code_unit_count) {
     unsigned buffered_chars =
         static_cast<unsigned>(buffer_end_ - buffer_cursor_);
-    if (character_count <= buffered_chars) {
-      buffer_cursor_ += character_count;
-      pos_ += character_count;
-      return character_count;
+    if (code_unit_count <= buffered_chars) {
+      buffer_cursor_ += code_unit_count;
+      pos_ += code_unit_count;
+      return code_unit_count;
     }
-    return SlowSeekForward(character_count);
+    return SlowSeekForward(code_unit_count);
   }
 
-  // Pushes back the most recently read UC16 character (or negative
+  // Pushes back the most recently read UTF-16 code unit (or negative
   // value if at end of input), i.e., the value returned by the most recent
   // call to Advance.
   // Must not be used right after calling SeekForward.
-  virtual void PushBack(int32_t character) = 0;
+  virtual void PushBack(int32_t code_unit) = 0;
 
  protected:
   static const uc32 kEndOfInput = -1;
 
-  // Ensures that the buffer_cursor_ points to the character at
+  // Ensures that the buffer_cursor_ points to the code unit at
   // position pos_ of the input, if possible. If the position
   // is at or after the end of the input, return false. If there
+  // are more code units available, return true.
+  // are more code_units available, return true.
   virtual bool ReadBlock() = 0;
-  virtual unsigned SlowSeekForward(unsigned character_count) = 0;
+  virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
 
   const uc16* buffer_cursor_;
   const uc16* buffer_end_;
@@ -178,23 +180,24 @@ class LiteralBuffer {
     }
   }
 
-  INLINE(void AddChar(uc16 character)) {
+  INLINE(void AddChar(uint32_t code_unit)) {
     if (position_ >= backing_store_.length()) ExpandBuffer();
     if (is_ascii_) {
-      if (character < kMaxAsciiCharCodeU) {
-        backing_store_[position_] = static_cast<byte>(character);
+      if (code_unit < kMaxAsciiCharCodeU) {
+        backing_store_[position_] = static_cast<byte>(code_unit);
         position_ += kASCIISize;
         return;
       }
-      ConvertToUC16();
+      ConvertToUtf16();
     }
-    *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
+    ASSERT(code_unit < 0x10000u);
+    *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
     position_ += kUC16Size;
   }
 
   bool is_ascii() { return is_ascii_; }
 
-  Vector<const uc16> uc16_literal() {
+  Vector<const uc16> utf16_literal() {
     ASSERT(!is_ascii_);
     ASSERT((position_ & 0x1) == 0);
     return Vector<const uc16>(
@@ -236,13 +239,13 @@ class LiteralBuffer {
     backing_store_ = new_store;
   }
 
-  void ConvertToUC16() {
+  void ConvertToUtf16() {
     ASSERT(is_ascii_);
     Vector<byte> new_store;
     int new_content_size = position_ * kUC16Size;
     if (new_content_size >= backing_store_.length()) {
-      // Ensure room for all currently read characters as UC16 as well
-      // as the character about to be stored.
+      // Ensure room for all currently read code units as UC16 as well
+      // as the code unit about to be stored.
       new_store = Vector<byte>::New(NewCapacity(new_content_size));
     } else {
       new_store = backing_store_;
@@ -316,7 +319,7 @@ class Scanner {
 
   explicit Scanner(UnicodeCache* scanner_contants);
 
-  void Initialize(UC16CharacterStream* source);
+  void Initialize(Utf16CharacterStream* source);
 
   // Returns the next token and advances input.
   Token::Value Next();
@@ -335,9 +338,9 @@ class Scanner {
     ASSERT_NOT_NULL(current_.literal_chars);
     return current_.literal_chars->ascii_literal();
   }
-  Vector<const uc16> literal_uc16_string() {
+  Vector<const uc16> literal_utf16_string() {
     ASSERT_NOT_NULL(current_.literal_chars);
-    return current_.literal_chars->uc16_literal();
+    return current_.literal_chars->utf16_literal();
   }
   bool is_literal_ascii() {
     ASSERT_NOT_NULL(current_.literal_chars);
@@ -371,9 +374,9 @@ class Scanner {
     ASSERT_NOT_NULL(next_.literal_chars);
     return next_.literal_chars->ascii_literal();
   }
-  Vector<const uc16> next_literal_uc16_string() {
+  Vector<const uc16> next_literal_utf16_string() {
     ASSERT_NOT_NULL(next_.literal_chars);
-    return next_.literal_chars->uc16_literal();
+    return next_.literal_chars->utf16_literal();
   }
   bool is_next_literal_ascii() {
     ASSERT_NOT_NULL(next_.literal_chars);
@@ -542,8 +545,8 @@ class Scanner {
   TokenDesc current_;  // desc for current token (as returned by Next())
   TokenDesc next_;     // desc for next token (one token look-ahead)
 
-  // Input stream. Must be initialized to an UC16CharacterStream.
-  UC16CharacterStream* source_;
+  // Input stream. Must be initialized to a Utf16CharacterStream.
+  Utf16CharacterStream* source_;
 
 
   // Start position of the octal literal last scanned.
index c0649d74fb051390f8633e836326517545830ccb..9c0ebf9e1bc4c274ffefb4171acd44183a6981e5 100644 (file)
@@ -78,7 +78,7 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
 }
 
 
-unsigned Utf8::Encode(char* str, uchar c) {
+unsigned Utf8::Encode(char* str, uchar c, int previous) {
   static const int kMask = ~(1 << 6);
   if (c <= kMaxOneByteChar) {
     str[0] = c;
@@ -88,6 +88,13 @@ unsigned Utf8::Encode(char* str, uchar c) {
     str[1] = 0x80 | (c & kMask);
     return 2;
   } else if (c <= kMaxThreeByteChar) {
+    if (Utf16::IsTrailSurrogate(c) &&
+        Utf16::IsLeadSurrogate(previous)) {
+      const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
+      return Encode(str - kUnmatchedSize,
+                    Utf16::CombineSurrogatePair(previous, c),
+                    Utf16::kNoPreviousCharacter) - kUnmatchedSize;
+    }
     str[0] = 0xE0 | (c >> 12);
     str[1] = 0x80 | ((c >> 6) & kMask);
     str[2] = 0x80 | (c & kMask);
@@ -113,12 +120,16 @@ uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
   return CalculateValue(bytes, length, cursor);
 }
 
-unsigned Utf8::Length(uchar c) {
+unsigned Utf8::Length(uchar c, int previous) {
   if (c <= kMaxOneByteChar) {
     return 1;
   } else if (c <= kMaxTwoByteChar) {
     return 2;
   } else if (c <= kMaxThreeByteChar) {
+    if (Utf16::IsTrailSurrogate(c) &&
+        Utf16::IsLeadSurrogate(previous)) {
+      return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
+    }
     return 3;
   } else {
     return 4;
index 61c649f5e4bb1906e384c5a3d1a473d187efcb72..14f380642a7a8dcd8286f93a3dbd176b59a68b99 100644 (file)
@@ -276,6 +276,7 @@ uchar Utf8::CalculateValue(const byte* str,
   return kBadChar;
 }
 
+
 const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
     unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
   unsigned offset = *offset_ptr;
@@ -338,6 +339,16 @@ unsigned CharacterStream::Length() {
   return result;
 }
 
+// Drains the stream with GetNext(), counting 2 for every character above
+// Utf16::kMaxNonSurrogateCharCode and 1 for every other character, then
+// rewinds the stream and returns the total.
+unsigned CharacterStream::Utf16Length() {
+  unsigned result = 0;
+  while (has_more()) {
+    uchar c = GetNext();
+    // Characters above the non-surrogate range count as two units.
+    result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1;
+  }
+  Rewind();
+  return result;
+}
+
 void CharacterStream::Seek(unsigned position) {
   Rewind();
   for (unsigned i = 0; i < position; i++) {
index fb9e6339e1279dd2e4c5ce6dd08333f805b03c19..b9c4234e87ed4a1a9445c3529b01055a44bdf1e5 100644 (file)
@@ -100,7 +100,7 @@ class UnicodeData {
   static const uchar kMaxCodePoint;
 };
 
-// --- U t f   8 ---
+// --- U t f   8   a n d   16 ---
 
 template <typename Data>
 class Buffer {
@@ -114,10 +114,46 @@ class Buffer {
   unsigned length_;
 };
 
+
+// Static helpers and constants for working with UTF-16 surrogate code units.
+class Utf16 {
+ public:
+  // True when code is in 0xd800-0xdbff (masked with 0xfc00).  The sentinel
+  // kNoPreviousCharacter is never a lead surrogate.
+  static inline bool IsLeadSurrogate(int32_t code) {
+    if (code == kNoPreviousCharacter) return false;
+    return (code & 0xfc00) == 0xd800;
+  }
+  // True when code is in 0xdc00-0xdfff (masked with 0xfc00).  The sentinel
+  // kNoPreviousCharacter is never a trail surrogate.
+  static inline bool IsTrailSurrogate(int32_t code) {
+    if (code == kNoPreviousCharacter) return false;
+    return (code & 0xfc00) == 0xdc00;
+  }
+
+  // Combines the low 10 bits of lead and trail into a value offset by
+  // 0x10000.  The arguments are not checked for being surrogates.
+  static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) {
+    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
+  }
+  // Sentinel passed as the "previous" code unit when there is none.
+  static const int32_t kNoPreviousCharacter = -1;
+  static const uchar kMaxNonSurrogateCharCode = 0xffff;
+  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
+  // of UTF-8 data.  The special case where the unit is a surrogate
+  // trail produces 1 byte net, because the encoding of the pair is
+  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
+  // can be reclaimed.
+  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
+  // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
+  // The illegality stems from the surrogate not being part of a pair.
+  static const int kUtf8BytesToCodeASurrogate = 3;
+  // Returns 0xd800 plus bits 10-19 of (char_code - 0x10000).
+  static inline uchar LeadSurrogate(int32_t char_code) {
+    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
+  }
+  // Returns 0xdc00 plus the low 10 bits of char_code.
+  static inline uchar TrailSurrogate(int32_t char_code) {
+    return 0xdc00 + (char_code & 0x3ff);
+  }
+};
+
+
 class Utf8 {
  public:
-  static inline uchar Length(uchar chr);
-  static inline unsigned Encode(char* out, uchar c);
+  static inline uchar Length(uchar chr, int previous);
+  static inline unsigned Encode(
+      char* out, uchar c, int previous);
   static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
       unsigned capacity, unsigned* chars_read, unsigned* offset);
   static uchar CalculateValue(const byte* str,
@@ -130,6 +166,11 @@ class Utf8 {
   static const unsigned kMaxThreeByteChar = 0xffff;
   static const unsigned kMaxFourByteChar  = 0x1fffff;
 
+  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
+  // that match are coded as a 4 byte UTF-8 sequence.
+  static const unsigned kBytesSavedByCombiningSurrogates = 2;
+  static const unsigned kSizeOfUnmatchedSurrogate = 3;
+
  private:
   template <unsigned s> friend class Utf8InputBuffer;
   friend class Test;
@@ -147,6 +188,7 @@ class CharacterStream {
   // Note that default implementation is not efficient.
   virtual void Seek(unsigned);
   unsigned Length();
+  unsigned Utf16Length();
   virtual ~CharacterStream() { }
   static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
       unsigned& offset);
@@ -156,6 +198,7 @@ class CharacterStream {
       unsigned capacity, unsigned& offset);
   static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
   virtual void Rewind() = 0;
+
  protected:
   virtual void FillBuffer() = 0;
   // The number of characters left in the current buffer
index 773fc4c16cce529ff9eb2b41f124ac60c5a77e6c..837c2543c32289705ebeffde9fd794a9a5bdc0dd 100644 (file)
@@ -564,7 +564,7 @@ void RegExpMacroAssemblerX64::CheckNotCharacterAfterMinusAnd(
     uc16 minus,
     uc16 mask,
     Label* on_not_equal) {
-  ASSERT(minus < String::kMaxUC16CharCode);
+  ASSERT(minus < String::kMaxUtf16CodeUnit);
   __ lea(rax, Operand(current_character(), -minus));
   __ and_(rax, Immediate(mask));
   __ cmpl(rax, Immediate(c));
index 5137c6563764d63e6d1fb5ab172ef7cb94a878e7..79818acfada099bb7d05d5460955d0f7b4b33259 100644 (file)
@@ -5526,6 +5526,17 @@ static int StrNCmp16(uint16_t* a, uint16_t* b, int n) {
 }
 
 
+// Returns str->Utf8Length().  If the first call reports a negative length,
+// the underlying internal string is flattened and the length is queried
+// again.
+int GetUtf8Length(Handle<String> str) {
+  int len = str->Utf8Length();
+  if (len < 0) {
+    // Retry once after flattening the internal string.
+    i::Handle<i::String> istr(v8::Utils::OpenHandle(*str));
+    i::FlattenString(istr);
+    len = str->Utf8Length();
+  }
+  return len;
+}
+
+
 THREADED_TEST(StringWrite) {
   LocalContext context;
   v8::HandleScope scope;
@@ -5606,7 +5617,7 @@ THREADED_TEST(StringWrite) {
   CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));
 
   memset(utf8buf, 0x1, sizeof(utf8buf));
-  len = left_tree->Utf8Length();
+  len = GetUtf8Length(left_tree);
   int utf8_expected =
       (0x80 + (0x800 - 0x80) * 2 + (0xd800 - 0x800) * 3) / kStride;
   CHECK_EQ(utf8_expected, len);
@@ -5620,7 +5631,7 @@ THREADED_TEST(StringWrite) {
   CHECK_EQ(1, utf8buf[utf8_expected]);
 
   memset(utf8buf, 0x1, sizeof(utf8buf));
-  len = right_tree->Utf8Length();
+  len = GetUtf8Length(right_tree);
   CHECK_EQ(utf8_expected, len);
   len = right_tree->WriteUtf8(utf8buf, utf8_expected, &charlen);
   CHECK_EQ(utf8_expected, len);
@@ -5745,6 +5756,217 @@ THREADED_TEST(StringWrite) {
 }
 
 
+// Reads the global arrays called `name` (strings) and `lengths_name`
+// (numbers) from the context and, for each of the first `len` indices,
+// checks that GetUtf8Length of the string equals the number stored at the
+// same index.
+static void Utf16Helper(
+    LocalContext& context,
+    const char* name,
+    const char* lengths_name,
+    int len) {
+  Local<v8::Array> a =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
+  Local<v8::Array> alens =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
+  for (int i = 0; i < len; i++) {
+    Local<v8::String> string =
+      Local<v8::String>::Cast(a->Get(i));
+    Local<v8::Number> expected_len =
+      Local<v8::Number>::Cast(alens->Get(i));
+    int length = GetUtf8Length(string);
+    CHECK_EQ(static_cast<int>(expected_len->Value()), length);
+  }
+}
+
+
+// Opens the internal string handle behind `str` and returns the result of
+// its Get(index) as a uint16_t.
+static uint16_t StringGet(Handle<String> str, int index) {
+  i::Handle<i::String> istring =
+      v8::Utils::OpenHandle(String::Cast(*str));
+  return istring->Get(index);
+}
+
+
+static void WriteUtf8Helper(
+    LocalContext& context,
+    const char* name,
+    const char* lengths_name,
+    int len) {
+  Local<v8::Array> b =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
+  Local<v8::Array> alens =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
+  char buffer[1000];
+  char buffer2[1000];
+  for (int i = 0; i < len; i++) {
+    Local<v8::String> string =
+      Local<v8::String>::Cast(b->Get(i));
+    Local<v8::Number> expected_len =
+      Local<v8::Number>::Cast(alens->Get(i));
+    int utf8_length = static_cast<int>(expected_len->Value());
+    for (int j = utf8_length + 1; j >= 0; j--) {
+      memset(reinterpret_cast<void*>(&buffer), 42, sizeof(buffer));
+      memset(reinterpret_cast<void*>(&buffer2), 42, sizeof(buffer2));
+      int nchars;
+      int utf8_written =
+          string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
+      int utf8_written2 =
+          string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
+      CHECK_GE(utf8_length + 1, utf8_written);
+      CHECK_GE(utf8_length, utf8_written2);
+      for (int k = 0; k < utf8_written2; k++) {
+        CHECK_EQ(buffer[k], buffer2[k]);
+      }
+      CHECK(nchars * 3 >= utf8_written - 1);
+      CHECK(nchars <= utf8_written);
+      if (j == utf8_length + 1) {
+        CHECK_EQ(utf8_written2, utf8_length);
+        CHECK_EQ(utf8_written2 + 1, utf8_written);
+      }
+      CHECK_EQ(buffer[utf8_written], 42);
+      if (j > utf8_length) {
+        if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
+        if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
+        Handle<String> roundtrip = v8_str(buffer);
+        CHECK(roundtrip->Equals(string));
+      } else {
+        if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
+      }
+      if (utf8_written2 != 0) CHECK_NE(buffer[utf8_written - 1], 42);
+      if (nchars >= 2) {
+        uint16_t trail = StringGet(string, nchars - 1);
+        uint16_t lead = StringGet(string, nchars - 2);
+        if (((lead & 0xfc00) == 0xd800) &&
+            ((trail & 0xfc00) == 0xdc00)) {
+          unsigned char u1 = buffer2[utf8_written2 - 4];
+          unsigned char u2 = buffer2[utf8_written2 - 3];
+          unsigned char u3 = buffer2[utf8_written2 - 2];
+          unsigned char u4 = buffer2[utf8_written2 - 1];
+          CHECK_EQ((u1 & 0xf8), 0xf0);
+          CHECK_EQ((u2 & 0xc0), 0x80);
+          CHECK_EQ((u3 & 0xc0), 0x80);
+          CHECK_EQ((u4 & 0xc0), 0x80);
+          uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
+          CHECK_EQ((u4 & 0x3f), (c & 0x3f));
+          CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
+          CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
+          CHECK_EQ((u1 & 0x3), c >> 18);
+        }
+      }
+    }
+  }
+}
+
+
+// Builds, in script, arrays of strings made from a 20 character ASCII pad,
+// lead surrogates and trail surrogates (a/b: 9 entries, a2/b2: 81 pairwise
+// concatenations) together with arrays of their expected UTF-8 lengths, then
+// checks them with Utf16Helper and WriteUtf8Helper.
+THREADED_TEST(Utf16) {
+  LocalContext context;
+  v8::HandleScope scope;
+  CompileRun(
+      "var pad = '01234567890123456789';"
+      "var p = [];"
+      "var plens = [20, 3, 3];"
+      "p.push('01234567890123456789');"
+      "var lead = 0xd800;"
+      "var trail = 0xdc00;"
+      "p.push(String.fromCharCode(0xd800));"
+      "p.push(String.fromCharCode(0xdc00));"
+      "var a = [];"
+      "var b = [];"
+      "var alens = [];"
+      "for (var i = 0; i < 3; i++) {"
+      "  p[1] = String.fromCharCode(lead++);"
+      "  for (var j = 0; j < 3; j++) {"
+      "    p[2] = String.fromCharCode(trail++);"
+      "    a.push(p[i] + p[j]);"
+      "    b.push(p[i] + p[j]);"
+      "    alens.push(plens[i] + plens[j]);"
+      "  }"
+      "}"
+      "alens[5] -= 2;"  // Here the surrogate pairs match up.
+      "var a2 = [];"
+      "var b2 = [];"
+      "var a2lens = [];"
+      "for (var m = 0; m < 9; m++) {"
+      "  for (var n = 0; n < 9; n++) {"
+      "    a2.push(a[m] + a[n]);"
+      "    b2.push(b[m] + b[n]);"
+      "    var utf = alens[m] + alens[n];"  // And here.
+           // The 'n's that start with 0xdc.. are 6-8
+           // The 'm's that end with 0xd8.. are 1, 4 and 7
+      "    if ((m % 3) == 1 && n >= 6) utf -= 2;"
+      "    a2lens.push(utf);"
+      "  }"
+      "}");
+  // Check the reported UTF-8 lengths first, then the bytes WriteUtf8 emits.
+  Utf16Helper(context, "a", "alens", 9);
+  Utf16Helper(context, "a2", "a2lens", 81);
+  WriteUtf8Helper(context, "b", "alens", 9);
+  WriteUtf8Helper(context, "b2", "a2lens", 81);
+}
+
+
+// Returns true when both API handles open to the same internal string
+// object (pointer identity, not content comparison).
+static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
+  i::Handle<i::String> is1(v8::Utils::OpenHandle(*s1));
+  i::Handle<i::String> is2(v8::Utils::OpenHandle(*s2));
+  return *is1 == *is2;
+}
+
+
+// Creates a symbol from each of the two byte strings with NewSymbol and
+// checks that SameSymbol reports them as the same object.
+static void SameSymbolHelper(const char* a, const char* b) {
+  Handle<String> symbol1 = v8::String::NewSymbol(a);
+  Handle<String> symbol2 = v8::String::NewSymbol(b);
+  CHECK(SameSymbol(symbol1, symbol2));
+}
+
+
+THREADED_TEST(Utf16Symbol) {
+  LocalContext context;
+  v8::HandleScope scope;
+
+  Handle<String> symbol1 = v8::String::NewSymbol("abc");
+  Handle<String> symbol2 = v8::String::NewSymbol("abc");
+  CHECK(SameSymbol(symbol1, symbol2));
+
+  SameSymbolHelper("\360\220\220\205",  // 4 byte encoding.
+                   "\355\240\201\355\260\205");  // 2 3-byte surrogates.
+  SameSymbolHelper("\355\240\201\355\260\206",  // 2 3-byte surrogates.
+                   "\360\220\220\206");  // 4 byte encoding.
+  SameSymbolHelper("x\360\220\220\205",  // 4 byte encoding.
+                   "x\355\240\201\355\260\205");  // 2 3-byte surrogates.
+  SameSymbolHelper("x\355\240\201\355\260\206",  // 2 3-byte surrogates.
+                   "x\360\220\220\206");  // 4 byte encoding.
+  CompileRun(
+      "var sym0 = 'benedictus';"
+      "var sym0b = 'S\303\270ren';"
+      "var sym1 = '\355\240\201\355\260\207';"
+      "var sym2 = '\360\220\220\210';"
+      "var sym3 = 'x\355\240\201\355\260\207';"
+      "var sym4 = 'x\360\220\220\210';"
+      "if (sym1.length != 2) throw sym1;"
+      "if (sym1.charCodeAt(1) != 0xdc07) throw sym1.charCodeAt(1);"
+      "if (sym2.length != 2) throw sym2;"
+      "if (sym2.charCodeAt(1) != 0xdc08) throw sym2.charCodeAt(2);"
+      "if (sym3.length != 3) throw sym3;"
+      "if (sym3.charCodeAt(2) != 0xdc07) throw sym1.charCodeAt(2);"
+      "if (sym4.length != 3) throw sym4;"
+      "if (sym4.charCodeAt(2) != 0xdc08) throw sym2.charCodeAt(2);");
+  Handle<String> sym0 = v8::String::NewSymbol("benedictus");
+  Handle<String> sym0b = v8::String::NewSymbol("S\303\270ren");
+  Handle<String> sym1 = v8::String::NewSymbol("\355\240\201\355\260\207");
+  Handle<String> sym2 = v8::String::NewSymbol("\360\220\220\210");
+  Handle<String> sym3 = v8::String::NewSymbol("x\355\240\201\355\260\207");
+  Handle<String> sym4 = v8::String::NewSymbol("x\360\220\220\210");
+  v8::Local<v8::Object> global = context->Global();
+  Local<Value> s0 = global->Get(v8_str("sym0"));
+  Local<Value> s0b = global->Get(v8_str("sym0b"));
+  Local<Value> s1 = global->Get(v8_str("sym1"));
+  Local<Value> s2 = global->Get(v8_str("sym2"));
+  Local<Value> s3 = global->Get(v8_str("sym3"));
+  Local<Value> s4 = global->Get(v8_str("sym4"));
+  CHECK(SameSymbol(sym0, Handle<String>(String::Cast(*s0))));
+  CHECK(SameSymbol(sym0b, Handle<String>(String::Cast(*s0b))));
+  CHECK(SameSymbol(sym1, Handle<String>(String::Cast(*s1))));
+  CHECK(SameSymbol(sym2, Handle<String>(String::Cast(*s2))));
+  CHECK(SameSymbol(sym3, Handle<String>(String::Cast(*s3))));
+  CHECK(SameSymbol(sym4, Handle<String>(String::Cast(*s4))));
+}
+
+
 THREADED_TEST(ToArrayIndex) {
   v8::HandleScope scope;
   LocalContext context;
index cd8a6aff382934ae11c8ce45212e967f6c1fedd7..a3ac3028af284ad731d108a76d9fdd788eafa5ae 100755 (executable)
@@ -63,7 +63,7 @@ TEST(ScanKeywords) {
     int length = i::StrLength(key_token.keyword);
     CHECK(static_cast<int>(sizeof(buffer)) >= length);
     {
-      i::Utf8ToUC16CharacterStream stream(keyword, length);
+      i::Utf8ToUtf16CharacterStream stream(keyword, length);
       i::Scanner scanner(&unicode_cache);
       // The scanner should parse Harmony keywords for this test.
       scanner.SetHarmonyScoping(true);
@@ -74,7 +74,7 @@ TEST(ScanKeywords) {
     }
     // Removing characters will make keyword matching fail.
     {
-      i::Utf8ToUC16CharacterStream stream(keyword, length - 1);
+      i::Utf8ToUtf16CharacterStream stream(keyword, length - 1);
       i::Scanner scanner(&unicode_cache);
       scanner.Initialize(&stream);
       CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@@ -85,7 +85,7 @@ TEST(ScanKeywords) {
     for (int j = 0; j < static_cast<int>(ARRAY_SIZE(chars_to_append)); ++j) {
       memmove(buffer, keyword, length);
       buffer[length] = chars_to_append[j];
-      i::Utf8ToUC16CharacterStream stream(buffer, length + 1);
+      i::Utf8ToUtf16CharacterStream stream(buffer, length + 1);
       i::Scanner scanner(&unicode_cache);
       scanner.Initialize(&stream);
       CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@@ -95,7 +95,7 @@ TEST(ScanKeywords) {
     {
       memmove(buffer, keyword, length);
       buffer[length - 1] = '_';
-      i::Utf8ToUC16CharacterStream stream(buffer, length);
+      i::Utf8ToUtf16CharacterStream stream(buffer, length);
       i::Scanner scanner(&unicode_cache);
       scanner.Initialize(&stream);
       CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@@ -255,7 +255,7 @@ TEST(StandAlonePreParser) {
   uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
   for (int i = 0; programs[i]; i++) {
     const char* program = programs[i];
-    i::Utf8ToUC16CharacterStream stream(
+    i::Utf8ToUtf16CharacterStream stream(
         reinterpret_cast<const i::byte*>(program),
         static_cast<unsigned>(strlen(program)));
     i::CompleteParserRecorder log;
@@ -291,7 +291,7 @@ TEST(StandAlonePreParserNoNatives) {
   uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
   for (int i = 0; programs[i]; i++) {
     const char* program = programs[i];
-    i::Utf8ToUC16CharacterStream stream(
+    i::Utf8ToUtf16CharacterStream stream(
         reinterpret_cast<const i::byte*>(program),
         static_cast<unsigned>(strlen(program)));
     i::CompleteParserRecorder log;
@@ -326,8 +326,9 @@ TEST(RegressChromium62639) {
   // and then used the invalid currently scanned literal. This always
   // failed in debug mode, and sometimes crashed in release mode.
 
-  i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(program),
-                                      static_cast<unsigned>(strlen(program)));
+  i::Utf8ToUtf16CharacterStream stream(
+      reinterpret_cast<const i::byte*>(program),
+      static_cast<unsigned>(strlen(program)));
   i::ScriptDataImpl* data =
       i::ParserApi::PreParse(&stream, NULL, false);
   CHECK(data->HasError());
@@ -392,7 +393,7 @@ TEST(PreParseOverflow) {
 
   uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
 
-  i::Utf8ToUC16CharacterStream stream(
+  i::Utf8ToUtf16CharacterStream stream(
       reinterpret_cast<const i::byte*>(*program),
       static_cast<unsigned>(kProgramSize));
   i::CompleteParserRecorder log;
@@ -449,10 +450,10 @@ void TestCharacterStream(const char* ascii_source,
   i::Handle<i::String> uc16_string(
       FACTORY->NewExternalStringFromTwoByte(&resource));
 
-  i::ExternalTwoByteStringUC16CharacterStream uc16_stream(
+  i::ExternalTwoByteStringUtf16CharacterStream uc16_stream(
       i::Handle<i::ExternalTwoByteString>::cast(uc16_string), start, end);
-  i::GenericStringUC16CharacterStream string_stream(ascii_string, start, end);
-  i::Utf8ToUC16CharacterStream utf8_stream(
+  i::GenericStringUtf16CharacterStream string_stream(ascii_string, start, end);
+  i::Utf8ToUtf16CharacterStream utf8_stream(
       reinterpret_cast<const i::byte*>(ascii_source), end);
   utf8_stream.SeekForward(start);
 
@@ -575,12 +576,14 @@ TEST(Utf8CharacterStream) {
   char buffer[kAllUtf8CharsSizeU];
   unsigned cursor = 0;
   for (int i = 0; i <= kMaxUC16Char; i++) {
-    cursor += unibrow::Utf8::Encode(buffer + cursor, i);
+    cursor += unibrow::Utf8::Encode(buffer + cursor,
+                                    i,
+                                    unibrow::Utf16::kNoPreviousCharacter);
   }
   ASSERT(cursor == kAllUtf8CharsSizeU);
 
-  i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
-                                      kAllUtf8CharsSizeU);
+  i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
+                                       kAllUtf8CharsSizeU);
   for (int i = 0; i <= kMaxUC16Char; i++) {
     CHECK_EQU(i, stream.pos());
     int32_t c = stream.Advance();
@@ -610,7 +613,7 @@ TEST(Utf8CharacterStream) {
 
 #undef CHECK_EQU
 
-void TestStreamScanner(i::UC16CharacterStream* stream,
+void TestStreamScanner(i::Utf16CharacterStream* stream,
                        i::Token::Value* expected_tokens,
                        int skip_pos = 0,  // Zero means not skipping.
                        int skip_to = 0) {
@@ -633,8 +636,8 @@ TEST(StreamScanner) {
   v8::V8::Initialize();
 
   const char* str1 = "{ foo get for : */ <- \n\n /*foo*/ bib";
-  i::Utf8ToUC16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
-                                       static_cast<unsigned>(strlen(str1)));
+  i::Utf8ToUtf16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
+                                        static_cast<unsigned>(strlen(str1)));
   i::Token::Value expectations1[] = {
       i::Token::LBRACE,
       i::Token::IDENTIFIER,
@@ -652,8 +655,8 @@ TEST(StreamScanner) {
   TestStreamScanner(&stream1, expectations1, 0, 0);
 
   const char* str2 = "case default const {THIS\nPART\nSKIPPED} do";
-  i::Utf8ToUC16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
-                                       static_cast<unsigned>(strlen(str2)));
+  i::Utf8ToUtf16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
+                                        static_cast<unsigned>(strlen(str2)));
   i::Token::Value expectations2[] = {
       i::Token::CASE,
       i::Token::DEFAULT,
@@ -683,7 +686,7 @@ TEST(StreamScanner) {
   for (int i = 0; i <= 4; i++) {
      expectations3[6 - i] = i::Token::ILLEGAL;
      expectations3[5 - i] = i::Token::EOS;
-     i::Utf8ToUC16CharacterStream stream3(
+     i::Utf8ToUtf16CharacterStream stream3(
          reinterpret_cast<const i::byte*>(str3),
          static_cast<unsigned>(strlen(str3)));
      TestStreamScanner(&stream3, expectations3, 1, 1 + i);
@@ -692,7 +695,7 @@ TEST(StreamScanner) {
 
 
 void TestScanRegExp(const char* re_source, const char* expected) {
-  i::Utf8ToUC16CharacterStream stream(
+  i::Utf8ToUtf16CharacterStream stream(
        reinterpret_cast<const i::byte*>(re_source),
        static_cast<unsigned>(strlen(re_source)));
   i::Scanner scanner(i::Isolate::Current()->unicode_cache());
@@ -748,6 +751,67 @@ TEST(RegExpScanning) {
 }
 
 
+// Computes the number of UTF-16 code units that the NUL-terminated UTF-8
+// byte string `s` is expected to decode to.  Starts from the byte length and
+// subtracts the bytes that well-formed multi-byte sequences save; malformed,
+// overlong and 5/6 byte lead bytes are left counting as one unit each (a
+// kBadChar per the comments below).
+static int Utf8LengthHelper(const char* s) {
+  int len = strlen(s);
+  int character_length = len;
+  for (int i = 0; i < len; i++) {
+    unsigned char c = s[i];
+    // Number of continuation bytes expected after the lead byte.
+    int input_offset = 0;
+    // Amount to subtract from the count if the sequence turns out valid.
+    int output_adjust = 0;
+    if (c > 0x7f) {
+      // A stray continuation byte counts as one unit on its own.
+      if (c < 0xc0) continue;
+      if (c >= 0xf0) {
+        if (c >= 0xf8) {
+          // 5 and 6 byte UTF-8 sequences turn into a kBadChar for each UTF-8
+          // byte.
+          continue;  // Handle first UTF-8 byte.
+        }
+        if ((c & 7) == 0 && ((s[i + 1] & 0x30) == 0)) {
+          // This 4 byte sequence could have been coded as a 3 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 3;
+        // 4 bytes of UTF-8 turn into 2 UTF-16 code units.
+        // NOTE(review): unlike the 2 and 3 byte cases this subtraction is
+        // applied before the continuation bytes are validated below.
+        character_length -= 2;
+      } else if (c >= 0xe0) {
+        if ((c & 0xf) == 0 && ((s[i + 1] & 0x20) == 0)) {
+          // This 3 byte sequence could have been coded as a 2 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 2;
+        // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 2;
+      } else {
+        if ((c & 0x1e) == 0) {
+          // This 2 byte sequence could have been coded as a 1 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 1;
+        // 2 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 1;
+      }
+      bool bad = false;
+      // The terminating NUL is not a continuation byte, so this scan stops
+      // at the end of the string at the latest.
+      for (int j = 1; j <= input_offset; j++) {
+        if ((s[i + j] & 0xc0) != 0x80) {
+          // Bad UTF-8 sequence turns the first in the sequence into kBadChar,
+          // which is a single UTF-16 code unit.
+          bad = true;
+          break;
+        }
+      }
+      if (!bad) {
+        // Skip the continuation bytes and apply the saved-byte adjustment.
+        i += input_offset;
+        character_length -= output_adjust;
+      }
+    }
+  }
+  return character_length;
+}
+
+
 TEST(ScopePositions) {
   // Test the parser for correctly setting the start and end positions
   // of a scope. We check the scope positions of exactly one scope
@@ -835,6 +899,91 @@ TEST(ScopePositions) {
     { "  for ", "(let x in {})\n"
       "    statement;", "\n"
       "  more;", i::BLOCK_SCOPE, i::EXTENDED_MODE },
+    // Check that 6-byte and 4-byte encodings of UTF-8 strings do not throw
+    // the preparser off in terms of byte offsets.
+    // 6 byte encoding.
+    { "  'foo\355\240\201\355\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // 4 byte encoding.
+    { "  'foo\360\220\220\212';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // 3 byte encoding of \u0fff.
+    { "  'foo\340\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 6 byte encoding with missing last byte.
+    { "  'foo\355\240\201\355\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u0fff with missing last byte.
+    { "  'foo\340\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u0fff with missing 2 last bytes.
+    { "  'foo\340';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u00ff should be a 2 byte encoding.
+    { "  'foo\340\203\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u007f should be a 1 byte encoding.
+    { "  'foo\340\201\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired lead surrogate.
+    { "  'foo\355\240\201';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired lead surrogate where following code point is a 3 byte sequence.
+    { "  'foo\355\240\201\340\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired lead surrogate where following code point is a 4 byte encoding
+    // of a trail surrogate.
+    { "  'foo\355\240\201\360\215\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired trail surrogate.
+    { "  'foo\355\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // 2 byte encoding of \u00ff.
+    { "  'foo\303\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 2 byte encoding of \u00ff with missing last byte.
+    { "  'foo\303';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 2 byte encoding of \u007f should be a 1 byte encoding.
+    { "  'foo\301\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 5 byte encoding.
+    { "  'foo\370\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 6 byte encoding.
+    { "  'foo\374\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 0xfe byte
+    { "  'foo\376\277\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 0xff byte
+    { "  'foo\377\277\277\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    { "  'foo';\n"
+      "  (function fun", "(a,b) { 'bar\355\240\201\355\260\213'; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    { "  'foo';\n"
+      "  (function fun", "(a,b) { 'bar\360\220\220\214'; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
     { NULL, NULL, NULL, i::EVAL_SCOPE, i::CLASSIC_MODE }
   };
 
@@ -848,20 +997,24 @@ TEST(ScopePositions) {
   i::FLAG_harmony_scoping = true;
 
   for (int i = 0; source_data[i].outer_prefix; i++) {
-    int kPrefixLen = i::StrLength(source_data[i].outer_prefix);
-    int kInnerLen = i::StrLength(source_data[i].inner_source);
-    int kSuffixLen = i::StrLength(source_data[i].outer_suffix);
+    int kPrefixLen = Utf8LengthHelper(source_data[i].outer_prefix);
+    int kInnerLen = Utf8LengthHelper(source_data[i].inner_source);
+    int kSuffixLen = Utf8LengthHelper(source_data[i].outer_suffix);
+    int kPrefixByteLen = i::StrLength(source_data[i].outer_prefix);
+    int kInnerByteLen = i::StrLength(source_data[i].inner_source);
+    int kSuffixByteLen = i::StrLength(source_data[i].outer_suffix);
     int kProgramSize = kPrefixLen + kInnerLen + kSuffixLen;
-    i::Vector<char> program = i::Vector<char>::New(kProgramSize + 1);
-    int length = i::OS::SNPrintF(program, "%s%s%s",
-                                 source_data[i].outer_prefix,
-                                 source_data[i].inner_source,
-                                 source_data[i].outer_suffix);
-    CHECK(length == kProgramSize);
+    int kProgramByteSize = kPrefixByteLen + kInnerByteLen + kSuffixByteLen;
+    i::Vector<char> program = i::Vector<char>::New(kProgramByteSize + 1);
+    i::OS::SNPrintF(program, "%s%s%s",
+                             source_data[i].outer_prefix,
+                             source_data[i].inner_source,
+                             source_data[i].outer_suffix);
 
     // Parse program source.
     i::Handle<i::String> source(
-        FACTORY->NewStringFromAscii(i::CStrVector(program.start())));
+        FACTORY->NewStringFromUtf8(i::CStrVector(program.start())));
+    CHECK_EQ(source->length(), kProgramSize);
     i::Handle<i::Script> script = FACTORY->NewScript(source);
     i::Parser parser(script, i::kAllowLazy | i::EXTENDED_MODE, NULL, NULL);
     i::CompilationInfo info(script);
@@ -894,7 +1047,7 @@ void TestParserSync(i::Handle<i::String> source, int flags) {
   // Preparse the data.
   i::CompleteParserRecorder log;
   i::Scanner scanner(i::Isolate::Current()->unicode_cache());
-  i::GenericStringUC16CharacterStream stream(source, 0, source->length());
+  i::GenericStringUtf16CharacterStream stream(source, 0, source->length());
   scanner.SetHarmonyScoping(harmony_scoping);
   scanner.Initialize(&stream);
   v8::preparser::PreParser::PreParseResult result =