From 9569b20db28fdb80e5f7b889dce4f45a6089e27f Mon Sep 17 00:00:00 2001 From: "yangguo@chromium.org" Date: Wed, 19 Dec 2012 13:27:20 +0000 Subject: [PATCH] Replace the use CharacterStreams in Heap::AllocateSymbolInternal and String::ComputeHash R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11593007 Patch from Dan Carney . git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13242 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/heap-inl.h | 28 ++++- src/heap.cc | 126 ++++++++++++++++------ src/heap.h | 14 ++- src/json-parser.h | 12 ++- src/objects-inl.h | 160 +++++++++++++-------------- src/objects.cc | 257 +++++++++++++++++++++++--------------------- src/objects.h | 123 +++++++++------------ src/profile-generator.cc | 21 ++-- src/unicode.h | 4 - test/cctest/test-strings.cc | 45 +++----- 10 files changed, 431 insertions(+), 359 deletions(-) diff --git a/src/heap-inl.h b/src/heap-inl.h index e5f2900..6d2e958 100644 --- a/src/heap-inl.h +++ b/src/heap-inl.h @@ -98,12 +98,34 @@ MaybeObject* Heap::AllocateStringFromUtf8(Vector str, } +template<> +bool inline Heap::IsOneByte(Vector str, int chars) { + // TODO(dcarney): incorporate Latin-1 check when Latin-1 is supported? + // ASCII only check. + return chars == str.length(); +} + + +template<> +bool inline Heap::IsOneByte(String* str, int chars) { + return str->IsOneByteRepresentation(); +} + + MaybeObject* Heap::AllocateSymbol(Vector str, int chars, uint32_t hash_field) { - unibrow::Utf8InputBuffer<> buffer(str.start(), - static_cast(str.length())); - return AllocateInternalSymbol(&buffer, chars, hash_field); + if (IsOneByte(str, chars)) return AllocateAsciiSymbol(str, hash_field); + return AllocateInternalSymbol(str, chars, hash_field); +} + + +template +MaybeObject* Heap::AllocateInternalSymbol(T t, int chars, uint32_t hash_field) { + if (IsOneByte(t, chars)) { + return AllocateInternalSymbol(t, chars, hash_field); + } + return AllocateInternalSymbol(t, chars, hash_field); } diff --git a/src/heap.cc b/src/heap.cc index a37aa5a..8f96f4b 100644 --- a/src/heap.cc +++ b/src/heap.cc @@ -3302,8 +3302,8 @@ static inline bool Between(uint32_t character, uint32_t from, uint32_t to) { MUST_USE_RESULT static inline MaybeObject* MakeOrFindTwoCharacterString( Heap* heap, - uint32_t c1, - uint32_t c2) { + uint16_t c1, + uint16_t c2) { String* symbol; // Numeric strings have a different hash algorithm not known by // LookupTwoCharsSymbolIfExists, so we skip this step for such strings. @@ -3352,8 +3352,8 @@ MaybeObject* Heap::AllocateConsString(String* first, String* second) { // dictionary. Check whether we already have the string in the symbol // table to prevent creation of many unneccesary strings. if (length == 2) { - unsigned c1 = first->Get(0); - unsigned c2 = second->Get(0); + uint16_t c1 = first->Get(0); + uint16_t c2 = second->Get(0); return MakeOrFindTwoCharacterString(this, c1, c2); } @@ -3467,8 +3467,8 @@ MaybeObject* Heap::AllocateSubString(String* buffer, // Optimization for 2-byte strings often used as keys in a decompression // dictionary. Check whether we already have the string in the symbol // table to prevent creation of many unneccesary strings. - unsigned c1 = buffer->Get(start); - unsigned c2 = buffer->Get(start + 1); + uint16_t c1 = buffer->Get(start); + uint16_t c2 = buffer->Get(start + 1); return MakeOrFindTwoCharacterString(this, c1, c2); } @@ -4624,27 +4624,88 @@ Map* Heap::SymbolMapForString(String* string) { } -MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer, - int chars, - uint32_t hash_field) { - ASSERT(chars >= 0); - // Ensure the chars matches the number of characters in the buffer. - ASSERT(static_cast(chars) == buffer->Utf16Length()); - // Determine whether the string is ASCII. - bool is_ascii = true; - while (buffer->has_more()) { - if (buffer->GetNext() > unibrow::Utf8::kMaxOneByteChar) { - is_ascii = false; - break; +template +class AllocateInternalSymbolHelper { + public: + static void WriteOneByteData(T t, char* chars, int len); + static void WriteTwoByteData(T t, uint16_t* chars, int len); + private: + DISALLOW_COPY_AND_ASSIGN(AllocateInternalSymbolHelper); +}; + + +template<> +class AllocateInternalSymbolHelper< Vector > { + public: + static inline void WriteOneByteData(Vector vector, + char* chars, + int len) { + // Only works for ascii. + ASSERT(vector.length() == len); + memcpy(chars, vector.start(), len); + } + + static inline void WriteTwoByteData(Vector vector, + uint16_t* chars, + int len) { + const uint8_t* stream = reinterpret_cast(vector.start()); + unsigned stream_length = vector.length(); + while (stream_length != 0) { + unsigned consumed = 0; + uint32_t c = unibrow::Utf8::ValueOf(stream, stream_length, &consumed); + ASSERT(c != unibrow::Utf8::kBadChar); + ASSERT(consumed <= stream_length); + stream_length -= consumed; + stream += consumed; + if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { + len -= 2; + if (len < 0) break; + *chars++ = unibrow::Utf16::LeadSurrogate(c); + *chars++ = unibrow::Utf16::TrailSurrogate(c); + } else { + len -= 1; + if (len < 0) break; + *chars++ = c; + } } + ASSERT(stream_length == 0); + ASSERT(len == 0); } - buffer->Rewind(); + private: + DISALLOW_COPY_AND_ASSIGN(AllocateInternalSymbolHelper); +}; + + +template<> +class AllocateInternalSymbolHelper { + public: + static inline void WriteOneByteData(String* s, char* chars, int len) { + ASSERT(s->length() == len); + String::WriteToFlat(s, chars, 0, len); + } + + static inline void WriteTwoByteData(String* s, uint16_t* chars, int len) { + ASSERT(s->length() == len); + String::WriteToFlat(s, chars, 0, len); + } + + private: + DISALLOW_COPY_AND_ASSIGN(AllocateInternalSymbolHelper); +}; + + +template +MaybeObject* Heap::AllocateInternalSymbol(T t, + int chars, + uint32_t hash_field) { + typedef AllocateInternalSymbolHelper H; + ASSERT(chars >= 0); // Compute map and object size. int size; Map* map; - if (is_ascii) { + if (is_one_byte) { if (chars > SeqOneByteString::kMaxLength) { return Failure::OutOfMemoryException(); } @@ -4674,21 +4735,26 @@ MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer, ASSERT_EQ(size, answer->Size()); - // Fill in the characters. - int i = 0; - while (i < chars) { - uint32_t character = buffer->GetNext(); - if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) { - answer->Set(i++, unibrow::Utf16::LeadSurrogate(character)); - answer->Set(i++, unibrow::Utf16::TrailSurrogate(character)); - } else { - answer->Set(i++, character); - } + if (is_one_byte) { + H::WriteOneByteData(t, SeqOneByteString::cast(answer)->GetChars(), chars); + } else { + H::WriteTwoByteData(t, SeqTwoByteString::cast(answer)->GetChars(), chars); } return answer; } +// Need explicit instantiations. +template +MaybeObject* Heap::AllocateInternalSymbol(String*, int, uint32_t); +template +MaybeObject* Heap::AllocateInternalSymbol(String*, int, uint32_t); +template +MaybeObject* Heap::AllocateInternalSymbol(Vector, + int, + uint32_t); + + MaybeObject* Heap::AllocateRawOneByteString(int length, PretenureFlag pretenure) { if (length < 0 || length > SeqOneByteString::kMaxLength) { diff --git a/src/heap.h b/src/heap.h index 6d06f68..c9a17ab 100644 --- a/src/heap.h +++ b/src/heap.h @@ -764,12 +764,16 @@ class Heap { Vector str, uint32_t hash_field); - MUST_USE_RESULT MaybeObject* AllocateInternalSymbol( - unibrow::CharacterStream* buffer, int chars, uint32_t hash_field); + template + static inline bool IsOneByte(T t, int chars); - MUST_USE_RESULT MaybeObject* AllocateExternalSymbol( - Vector str, - int chars); + template + MUST_USE_RESULT inline MaybeObject* AllocateInternalSymbol( + T t, int chars, uint32_t hash_field); + + template + MUST_USE_RESULT MaybeObject* AllocateInternalSymbol( + T t, int chars, uint32_t hash_field); // Allocates and partially initializes a String. There are two String // encodings: ASCII and two byte. These functions allocate a string of the diff --git a/src/json-parser.h b/src/json-parser.h index 6b38fdc..f6140f1 100644 --- a/src/json-parser.h +++ b/src/json-parser.h @@ -631,7 +631,17 @@ Handle JsonParser::ScanJsonString() { position_); } if (c0 < 0x20) return Handle::null(); - running_hash = StringHasher::AddCharacterCore(running_hash, c0); + if (static_cast(c0) > + unibrow::Utf16::kMaxNonSurrogateCharCode) { + running_hash = + StringHasher::AddCharacterCore(running_hash, + unibrow::Utf16::LeadSurrogate(c0)); + running_hash = + StringHasher::AddCharacterCore(running_hash, + unibrow::Utf16::TrailSurrogate(c0)); + } else { + running_hash = StringHasher::AddCharacterCore(running_hash, c0); + } position++; if (position >= source_length_) return Handle::null(); c0 = seq_source_->SeqOneByteStringGet(position); diff --git a/src/objects-inl.h b/src/objects-inl.h index 7493887..bb56f3f 100644 --- a/src/objects-inl.h +++ b/src/objects-inl.h @@ -2584,8 +2584,7 @@ void String::Visit( case kConsStringTag | kOneByteStringTag: case kConsStringTag | kTwoByteStringTag: - string = cons_op.Operate(ConsString::cast(string), &offset, &type, - &length); + string = cons_op.Operate(string, &offset, &type, &length); if (string == NULL) return; slice_offset = offset; ASSERT(length == static_cast(string->length())); @@ -2777,6 +2776,11 @@ const uint16_t* ExternalTwoByteString::ExternalTwoByteStringGetData( } +String* ConsStringNullOp::Operate(String*, unsigned*, int32_t*, unsigned*) { + return NULL; +} + + unsigned ConsStringIteratorOp::OffsetForDepth(unsigned depth) { return depth & kDepthMask; } @@ -2805,42 +2809,38 @@ void ConsStringIteratorOp::Pop() { } -void ConsStringIteratorOp::Reset() { - depth_ = 0; - maximum_depth_ = 0; +bool ConsStringIteratorOp::HasMore() { + return depth_ != 0; } -bool ConsStringIteratorOp::HasMore() { - return depth_ != 0; +void ConsStringIteratorOp::Reset() { + depth_ = 0; } -bool ConsStringIteratorOp::ContinueOperation(ContinueResponse* response) { +String* ConsStringIteratorOp::ContinueOperation(int32_t* type_out, + unsigned* length_out) { bool blew_stack; - int32_t type; - unsigned length; - String* string = NextLeaf(&blew_stack, &type, &length); + String* string = NextLeaf(&blew_stack, type_out, length_out); // String found. if (string != NULL) { - consumed_ += length; - response->string_ = string; - response->offset_ = 0; - response->length_ = length; - response->type_ = type; - return true; + // Verify output. + ASSERT(*length_out == static_cast(string->length())); + ASSERT(*type_out == string->map()->instance_type()); + return string; } // Traversal complete. - if (!blew_stack) return false; - // Restart search. - Reset(); - // TODO(dcarney) This is unnecessary. - // After a reset, we don't need a String::Visit - response->string_ = root_; - response->offset_ = consumed_; - response->length_ = root_length_; - response->type_ = root_type_; - return true; + if (!blew_stack) return NULL; + // Restart search from root. + unsigned offset_out; + string = Search(&offset_out, type_out, length_out); + // Verify output. + ASSERT(string == NULL || offset_out == 0); + ASSERT(string == NULL || + *length_out == static_cast(string->length())); + ASSERT(string == NULL || *type_out == string->map()->instance_type()); + return string; } @@ -2857,18 +2857,24 @@ StringCharacterStream::StringCharacterStream( end_(NULL), op_(op) { op->Reset(); - String::Visit(string, - offset, *this, *op, string->map()->instance_type(), string->length()); + int32_t type = string->map()->instance_type(); + unsigned length = string->length(); + String::Visit(string, offset, *this, *op, type, length); } bool StringCharacterStream::HasMore() { if (buffer8_ != end_) return true; if (!op_->HasMore()) return false; - ConsStringIteratorOp::ContinueResponse response; - if (!op_->ContinueOperation(&response)) return false; - String::Visit(response.string_, - response.offset_, *this, *op_, response.type_, response.length_); + unsigned length; + int32_t type; + String* string = op_->ContinueOperation(&type, &length); + if (string == NULL) return false; + ASSERT(!string->IsConsString()); + ASSERT(string->length() != 0); + ConsStringNullOp null_op; + String::Visit(string, 0, *this, null_op, type, length); + ASSERT(buffer8_ != end_); return true; } @@ -5138,7 +5144,7 @@ bool StringHasher::has_trivial_hash() { } -uint32_t StringHasher::AddCharacterCore(uint32_t running_hash, uint32_t c) { +uint32_t StringHasher::AddCharacterCore(uint32_t running_hash, uint16_t c) { running_hash += c; running_hash += (running_hash << 10); running_hash ^= (running_hash >> 6); @@ -5157,66 +5163,62 @@ uint32_t StringHasher::GetHashCore(uint32_t running_hash) { } -void StringHasher::AddCharacter(uint32_t c) { - if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { - AddSurrogatePair(c); // Not inlined. - return; - } +void StringHasher::AddCharacter(uint16_t c) { // Use the Jenkins one-at-a-time hash function to update the hash // for the given character. raw_running_hash_ = AddCharacterCore(raw_running_hash_, c); - // Incremental array index computation. - if (is_array_index_) { - if (c < '0' || c > '9') { - is_array_index_ = false; - } else { - int d = c - '0'; - if (is_first_char_) { - is_first_char_ = false; - if (c == '0' && length_ > 1) { - is_array_index_ = false; - return; - } - } - if (array_index_ > 429496729U - ((d + 2) >> 3)) { - is_array_index_ = false; - } else { - array_index_ = array_index_ * 10 + d; - } - } - } } -void StringHasher::AddCharacterNoIndex(uint32_t c) { - ASSERT(!is_array_index()); - if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { - AddSurrogatePairNoIndex(c); // Not inlined. - return; +bool StringHasher::UpdateIndex(uint16_t c) { + ASSERT(is_array_index_); + if (c < '0' || c > '9') { + is_array_index_ = false; + return false; } - raw_running_hash_ = AddCharacterCore(raw_running_hash_, c); + int d = c - '0'; + if (is_first_char_) { + is_first_char_ = false; + if (c == '0' && length_ > 1) { + is_array_index_ = false; + return false; + } + } + if (array_index_ > 429496729U - ((d + 2) >> 3)) { + is_array_index_ = false; + return false; + } + array_index_ = array_index_ * 10 + d; + return true; } -uint32_t StringHasher::GetHash() { - // Get the calculated raw hash value and do some more bit ops to distribute - // the hash further. Ensure that we never return zero as the hash value. - return GetHashCore(raw_running_hash_); +template +inline void StringHasher::AddCharacters(const Char* chars, int length) { + ASSERT(sizeof(Char) == 1 || sizeof(Char) == 2); + int i = 0; + if (is_array_index_) { + for (; i < length; i++) { + AddCharacter(chars[i]); + if (!UpdateIndex(chars[i])) { + i++; + break; + } + } + } + for (; i < length; i++) { + ASSERT(!is_array_index_); + AddCharacter(chars[i]); + } } template -uint32_t HashSequentialString(const schar* chars, int length, uint32_t seed) { +uint32_t StringHasher::HashSequentialString(const schar* chars, + int length, + uint32_t seed) { StringHasher hasher(length, seed); - if (!hasher.has_trivial_hash()) { - int i; - for (i = 0; hasher.is_array_index() && (i < length); i++) { - hasher.AddCharacter(chars[i]); - } - for (; i < length; i++) { - hasher.AddCharacterNoIndex(chars[i]); - } - } + if (!hasher.has_trivial_hash()) hasher.AddCharacters(chars, length); return hasher.GetHashField(); } diff --git a/src/objects.cc b/src/objects.cc index b2110a9..cea724f 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -7029,24 +7029,36 @@ void StringInputBuffer::Seek(unsigned pos) { } -String* ConsStringIteratorOp::Operate(ConsString* cons_string, - unsigned* offset_out, int32_t* type_out, unsigned* length_out) { - ASSERT(*length_out == (unsigned)cons_string->length()); - ASSERT(depth_ == 0); - // Push the root string. - PushLeft(cons_string); +String* ConsStringIteratorOp::Operate(String* string, + unsigned* offset_out, + int32_t* type_out, + unsigned* length_out) { + ASSERT(string->IsConsString()); + ConsString* cons_string = ConsString::cast(string); + // Set up search data. root_ = cons_string; - root_type_ = *type_out; - root_length_ = *length_out; consumed_ = *offset_out; - unsigned targetOffset = *offset_out; + // Now search. + return Search(offset_out, type_out, length_out); +} + + +String* ConsStringIteratorOp::Search(unsigned* offset_out, + int32_t* type_out, + unsigned* length_out) { + ConsString* cons_string = root_; + // Reset the stack, pushing the root string. + depth_ = 1; + maximum_depth_ = 1; + frames_[0] = cons_string; + const unsigned consumed = consumed_; unsigned offset = 0; while (true) { // Loop until the string is found which contains the target offset. String* string = cons_string->first(); unsigned length = string->length(); int32_t type; - if (targetOffset < offset + length) { + if (consumed < offset + length) { // Target offset is in the left branch. // Keep going if we're still in a ConString. type = string->map()->instance_type(); @@ -7075,6 +7087,7 @@ String* ConsStringIteratorOp::Operate(ConsString* cons_string, // Account for the possibility of an empty right leaf. // This happens only if we have asked for an offset outside the string. if (length == 0) { + // Reset depth so future operations will return null immediately. Reset(); return NULL; } @@ -7085,9 +7098,8 @@ String* ConsStringIteratorOp::Operate(ConsString* cons_string, } ASSERT(length != 0); // Adjust return values and exit. - unsigned innerOffset = targetOffset - offset; - consumed_ += length - innerOffset; - *offset_out = innerOffset; + consumed_ = offset + length; + *offset_out = consumed - offset; *type_out = type; *length_out = length; return string; @@ -7097,8 +7109,9 @@ String* ConsStringIteratorOp::Operate(ConsString* cons_string, } -String* ConsStringIteratorOp::NextLeaf( - bool* blew_stack, int32_t* type_out, unsigned* length_out) { +String* ConsStringIteratorOp::NextLeaf(bool* blew_stack, + int32_t* type_out, + unsigned* length_out) { while (true) { // Tree traversal complete. if (depth_ == 0) { @@ -7122,6 +7135,7 @@ String* ConsStringIteratorOp::NextLeaf( if (length == 0) continue; *length_out = length; *type_out = type; + consumed_ += length; return string; } cons_string = ConsString::cast(string); @@ -7134,8 +7148,11 @@ String* ConsStringIteratorOp::NextLeaf( type = string->map()->instance_type(); if ((type & kStringRepresentationMask) != kConsStringTag) { AdjustMaximumDepth(); + unsigned length = (unsigned) string->length(); + ASSERT(length != 0); + *length_out = length; *type_out = type; - *length_out = string->length(); + consumed_ += length; return string; } cons_string = ConsString::cast(string); @@ -7674,28 +7691,62 @@ bool String::IsTwoByteEqualTo(Vector str) { } +class IteratingStringHasher: public StringHasher { + public: + static inline uint32_t Hash(String* string, uint32_t seed) { + const unsigned len = static_cast(string->length()); + IteratingStringHasher hasher(len, seed); + if (hasher.has_trivial_hash()) { + return hasher.GetHashField(); + } + int32_t type = string->map()->instance_type(); + ConsStringNullOp null_op; + String::Visit(string, 0, hasher, null_op, type, len); + // Flat strings terminate immediately. + if (hasher.consumed_ == len) { + ASSERT(!string->IsConsString()); + return hasher.GetHashField(); + } + ASSERT(string->IsConsString()); + // This is a ConsString, iterate across it. + ConsStringIteratorOp op; + unsigned offset = 0; + unsigned leaf_length = len; + string = op.Operate(string, &offset, &type, &leaf_length); + while (true) { + ASSERT(hasher.consumed_ < len); + String::Visit(string, 0, hasher, null_op, type, leaf_length); + if (hasher.consumed_ == len) break; + string = op.ContinueOperation(&type, &leaf_length); + // This should be taken care of by the length check. + ASSERT(string != NULL); + } + return hasher.GetHashField(); + } + inline void VisitOneByteString(const uint8_t* chars, unsigned length) { + AddCharacters(chars, static_cast(length)); + consumed_ += length; + } + inline void VisitTwoByteString(const uint16_t* chars, unsigned length) { + AddCharacters(chars, static_cast(length)); + consumed_ += length; + } + + private: + inline IteratingStringHasher(int len, uint32_t seed) + : StringHasher(len, seed), + consumed_(0) {} + unsigned consumed_; + DISALLOW_COPY_AND_ASSIGN(IteratingStringHasher); +}; + + uint32_t String::ComputeAndSetHash() { // Should only be called if hash code has not yet been computed. ASSERT(!HasHashCode()); - const int len = length(); - - // Compute the hash code. - uint32_t field = 0; - if (StringShape(this).IsSequentialAscii()) { - field = HashSequentialString(SeqOneByteString::cast(this)->GetChars(), - len, - GetHeap()->HashSeed()); - } else if (StringShape(this).IsSequentialTwoByte()) { - field = HashSequentialString(SeqTwoByteString::cast(this)->GetChars(), - len, - GetHeap()->HashSeed()); - } else { - StringInputBuffer buffer(this); - field = ComputeHashField(&buffer, len, GetHeap()->HashSeed()); - } - // Store the hash code in the object. + uint32_t field = IteratingStringHasher::Hash(this, GetHeap()->HashSeed()); set_hash_field(field); // Check the hash code is there. @@ -7799,57 +7850,59 @@ uint32_t StringHasher::MakeArrayIndexHash(uint32_t value, int length) { } -void StringHasher::AddSurrogatePair(uc32 c) { - uint16_t lead = unibrow::Utf16::LeadSurrogate(c); - AddCharacter(lead); - uint16_t trail = unibrow::Utf16::TrailSurrogate(c); - AddCharacter(trail); -} - - -void StringHasher::AddSurrogatePairNoIndex(uc32 c) { - uint16_t lead = unibrow::Utf16::LeadSurrogate(c); - AddCharacterNoIndex(lead); - uint16_t trail = unibrow::Utf16::TrailSurrogate(c); - AddCharacterNoIndex(trail); -} - - uint32_t StringHasher::GetHashField() { if (length_ <= String::kMaxHashCalcLength) { - if (is_array_index()) { - return MakeArrayIndexHash(array_index(), length_); + if (is_array_index_) { + return MakeArrayIndexHash(array_index_, length_); } - return (GetHash() << String::kHashShift) | String::kIsNotArrayIndexMask; + return (GetHashCore(raw_running_hash_) << String::kHashShift) | + String::kIsNotArrayIndexMask; } else { return (length_ << String::kHashShift) | String::kIsNotArrayIndexMask; } } -uint32_t String::ComputeHashField(unibrow::CharacterStream* buffer, - int length, - uint32_t seed) { +uint32_t StringHasher::ComputeHashField(unibrow::CharacterStream* buffer, + int length, + uint32_t seed) { + typedef unibrow::Utf16 u; StringHasher hasher(length, seed); - // Very long strings have a trivial hash that doesn't inspect the // string contents. if (hasher.has_trivial_hash()) { return hasher.GetHashField(); } - // Do the iterative array index computation as long as there is a // chance this is an array index. - while (buffer->has_more() && hasher.is_array_index()) { - hasher.AddCharacter(buffer->GetNext()); + if (hasher.is_array_index_) { + while (buffer->has_more()) { + uint32_t c = buffer->GetNext(); + if (c > u::kMaxNonSurrogateCharCode) { + uint16_t c1 = u::LeadSurrogate(c); + uint16_t c2 = u::TrailSurrogate(c); + hasher.AddCharacter(c1); + hasher.AddCharacter(c2); + if (!hasher.UpdateIndex(c1)) break; + if (!hasher.UpdateIndex(c2)) break; + } else { + hasher.AddCharacter(c); + if (!hasher.UpdateIndex(c)) break; + } + } } - // Process the remaining characters without updating the array // index. while (buffer->has_more()) { - hasher.AddCharacterNoIndex(buffer->GetNext()); + ASSERT(!hasher.is_array_index_); + uint32_t c = buffer->GetNext(); + if (c > u::kMaxNonSurrogateCharCode) { + hasher.AddCharacter(u::LeadSurrogate(c)); + hasher.AddCharacter(u::TrailSurrogate(c)); + } else { + hasher.AddCharacter(c); + } } - return hasher.GetHashField(); } @@ -11667,7 +11720,7 @@ class Utf8SymbolKey : public HashTableKey { unibrow::Utf8InputBuffer<> buffer(string_.start(), static_cast(string_.length())); chars_ = buffer.Utf16Length(); - hash_field_ = String::ComputeHashField(&buffer, chars_, seed_); + hash_field_ = StringHasher::ComputeHashField(&buffer, chars_, seed_); uint32_t result = hash_field_ >> String::kHashShift; ASSERT(result != 0); // Ensure that the hash value of 0 is never computed. return result; @@ -11697,29 +11750,9 @@ class SequentialSymbolKey : public HashTableKey { : string_(string), hash_field_(0), seed_(seed) { } uint32_t Hash() { - StringHasher hasher(string_.length(), seed_); - - // Very long strings have a trivial hash that doesn't inspect the - // string contents. - if (hasher.has_trivial_hash()) { - hash_field_ = hasher.GetHashField(); - } else { - int i = 0; - // Do the iterative array index computation as long as there is a - // chance this is an array index. - while (i < string_.length() && hasher.is_array_index()) { - hasher.AddCharacter(static_cast(string_[i])); - i++; - } - - // Process the remaining characters without updating the array - // index. - while (i < string_.length()) { - hasher.AddCharacterNoIndex(static_cast(string_[i])); - i++; - } - hash_field_ = hasher.GetHashField(); - } + hash_field_ = StringHasher::HashSequentialString(string_.start(), + string_.length(), + seed_); uint32_t result = hash_field_ >> String::kHashShift; ASSERT(result != 0); // Ensure that the hash value of 0 is never computed. @@ -11764,32 +11797,9 @@ class SubStringAsciiSymbolKey : public HashTableKey { uint32_t Hash() { ASSERT(length_ >= 0); ASSERT(from_ + length_ <= string_->length()); - StringHasher hasher(length_, string_->GetHeap()->HashSeed()); - - // Very long strings have a trivial hash that doesn't inspect the - // string contents. - if (hasher.has_trivial_hash()) { - hash_field_ = hasher.GetHashField(); - } else { - int i = 0; - // Do the iterative array index computation as long as there is a - // chance this is an array index. - while (i < length_ && hasher.is_array_index()) { - hasher.AddCharacter(static_cast( - string_->SeqOneByteStringGet(i + from_))); - i++; - } - - // Process the remaining characters without updating the array - // index. - while (i < length_) { - hasher.AddCharacterNoIndex(static_cast( - string_->SeqOneByteStringGet(i + from_))); - i++; - } - hash_field_ = hasher.GetHashField(); - } - + char* chars = string_->GetChars() + from_; + hash_field_ = StringHasher::HashSequentialString( + chars, length_, string_->GetHeap()->HashSeed()); uint32_t result = hash_field_ >> String::kHashShift; ASSERT(result != 0); // Ensure that the hash value of 0 is never computed. return result; @@ -11864,8 +11874,7 @@ class SymbolKey : public HashTableKey { return string_; } // Otherwise allocate a new symbol. - StringInputBuffer buffer(string_); - return heap->AllocateInternalSymbol(&buffer, + return heap->AllocateInternalSymbol(string_, string_->length(), string_->hash_field()); } @@ -12635,7 +12644,7 @@ MaybeObject* SymbolTable::LookupString(String* string, Object** s) { // algorithm. class TwoCharHashTableKey : public HashTableKey { public: - TwoCharHashTableKey(uint32_t c1, uint32_t c2, uint32_t seed) + TwoCharHashTableKey(uint16_t c1, uint16_t c2, uint32_t seed) : c1_(c1), c2_(c2) { // Char 1. uint32_t hash = seed; @@ -12651,17 +12660,17 @@ class TwoCharHashTableKey : public HashTableKey { hash ^= hash >> 11; hash += hash << 15; if ((hash & String::kHashBitMask) == 0) hash = StringHasher::kZeroHash; + hash_ = hash; #ifdef DEBUG - StringHasher hasher(2, seed); - hasher.AddCharacter(c1); - hasher.AddCharacter(c2); // If this assert fails then we failed to reproduce the two-character // version of the string hashing algorithm above. One reason could be // that we were passed two digits as characters, since the hash // algorithm is different in that case. - ASSERT_EQ(static_cast(hasher.GetHash()), static_cast(hash)); + uint16_t chars[2] = {c1, c2}; + uint32_t check_hash = StringHasher::HashSequentialString(chars, 2, seed); + hash = (hash << String::kHashShift) | String::kIsNotArrayIndexMask; + ASSERT_EQ(static_cast(hash), static_cast(check_hash)); #endif - hash_ = hash; } bool IsMatch(Object* o) { @@ -12686,8 +12695,8 @@ class TwoCharHashTableKey : public HashTableKey { } private: - uint32_t c1_; - uint32_t c2_; + uint16_t c1_; + uint16_t c2_; uint32_t hash_; }; @@ -12706,8 +12715,8 @@ bool SymbolTable::LookupSymbolIfExists(String* string, String** symbol) { } -bool SymbolTable::LookupTwoCharsSymbolIfExists(uint32_t c1, - uint32_t c2, +bool SymbolTable::LookupTwoCharsSymbolIfExists(uint16_t c1, + uint16_t c2, String** symbol) { TwoCharHashTableKey key(c1, c2, GetHeap()->HashSeed()); int entry = FindEntry(&key); diff --git a/src/objects.h b/src/objects.h index ebacbff..bd8de17 100644 --- a/src/objects.h +++ b/src/objects.h @@ -3043,7 +3043,7 @@ class SymbolTable: public HashTable { // true if it is found, assigning the symbol to the given output // parameter. bool LookupSymbolIfExists(String* str, String** symbol); - bool LookupTwoCharsSymbolIfExists(uint32_t c1, uint32_t c2, String** symbol); + bool LookupTwoCharsSymbolIfExists(uint16_t c1, uint16_t c2, String** symbol); // Casting. static inline SymbolTable* cast(Object* obj); @@ -6929,30 +6929,14 @@ class StringHasher { public: explicit inline StringHasher(int length, uint32_t seed); - // Returns true if the hash of this string can be computed without - // looking at the contents. - inline bool has_trivial_hash(); - - // Add a character to the hash and update the array index calculation. - inline void AddCharacter(uint32_t c); + template + static inline uint32_t HashSequentialString(const schar* chars, + int length, + uint32_t seed); - // Adds a character to the hash but does not update the array index - // calculation. This can only be called when it has been verified - // that the input is not an array index. - inline void AddCharacterNoIndex(uint32_t c); - - // Add a character above 0xffff as a surrogate pair. These can get into - // the hasher through the routines that take a UTF-8 string and make a symbol. - void AddSurrogatePair(uc32 c); - void AddSurrogatePairNoIndex(uc32 c); - - // Returns the value to store in the hash field of a string with - // the given length and contents. - uint32_t GetHashField(); - - // Returns true if the characters seen so far make up a legal array - // index. - bool is_array_index() { return is_array_index_; } + static uint32_t ComputeHashField(unibrow::CharacterStream* buffer, + int length, + uint32_t seed); // Calculated hash value for a string consisting of 1 to // String::kMaxArrayIndexSize digits with no leading zeros (except "0"). @@ -6964,51 +6948,36 @@ class StringHasher { // use 27 instead. static const int kZeroHash = 27; - private: - uint32_t array_index() { - ASSERT(is_array_index()); - return array_index_; - } - - inline uint32_t GetHash(); - // Reusable parts of the hashing algorithm. - INLINE(static uint32_t AddCharacterCore(uint32_t running_hash, uint32_t c)); + INLINE(static uint32_t AddCharacterCore(uint32_t running_hash, uint16_t c)); INLINE(static uint32_t GetHashCore(uint32_t running_hash)); - int length_; - uint32_t raw_running_hash_; - uint32_t array_index_; - bool is_array_index_; - bool is_first_char_; - friend class TwoCharHashTableKey; - - template friend class JsonParser; -}; - - -class IncrementalAsciiStringHasher { - public: - explicit inline IncrementalAsciiStringHasher(uint32_t seed, char first_char); - inline void AddCharacter(uc32 c); - inline uint32_t GetHash(); + protected: + // Returns the value to store in the hash field of a string with + // the given length and contents. + uint32_t GetHashField(); + // Returns true if the hash of this string can be computed without + // looking at the contents. + inline bool has_trivial_hash(); + // Adds a block of characters to the hash. + template + inline void AddCharacters(const Char* chars, int len); private: + // Add a character to the hash. + inline void AddCharacter(uint16_t c); + // Update index. Returns true if string is still an index. + inline bool UpdateIndex(uint16_t c); + int length_; uint32_t raw_running_hash_; uint32_t array_index_; bool is_array_index_; - char first_char_; + bool is_first_char_; + DISALLOW_COPY_AND_ASSIGN(StringHasher); }; -// Calculates string hash. -template -inline uint32_t HashSequentialString(const schar* chars, - int length, - uint32_t seed); - - // The characteristics of a string are stored in its map. Retrieving these // few bits of information is moderately expensive, involving two memory // loads where the second is dependent on the first. To improve efficiency @@ -7227,10 +7196,6 @@ class String: public HeapObject { // Returns a hash value used for the property table inline uint32_t Hash(); - static uint32_t ComputeHashField(unibrow::CharacterStream* buffer, - int length, - uint32_t seed); - static bool ComputeArrayIndex(unibrow::CharacterStream* buffer, uint32_t* index, int length); @@ -7870,22 +7835,30 @@ class StringInputBuffer: public unibrow::InputBuffer { }; +// A ConsStringOp that returns null. +// Useful when the operation to apply on a ConsString +// requires an expensive data structure. +class ConsStringNullOp { + public: + inline ConsStringNullOp() {} + static inline String* Operate(String*, unsigned*, int32_t*, unsigned*); + private: + DISALLOW_COPY_AND_ASSIGN(ConsStringNullOp); +}; + + // This maintains an off-stack representation of the stack frames required // to traverse a ConsString, allowing an entirely iterative and restartable // traversal of the entire string // Note: this class is not GC-safe. class ConsStringIteratorOp { public: - struct ContinueResponse { - String* string_; - unsigned offset_; - unsigned length_; - int32_t type_; - }; inline ConsStringIteratorOp() {} - String* Operate(ConsString* cons_string, unsigned* offset_out, - int32_t* type_out, unsigned* length_out); - inline bool ContinueOperation(ContinueResponse* response); + String* Operate(String* string, + unsigned* offset_out, + int32_t* type_out, + unsigned* length_out); + inline String* ContinueOperation(int32_t* type_out, unsigned* length_out); inline void Reset(); inline bool HasMore(); @@ -7902,6 +7875,9 @@ class ConsStringIteratorOp { inline void AdjustMaximumDepth(); inline void Pop(); String* NextLeaf(bool* blew_stack, int32_t* type_out, unsigned* length_out); + String* Search(unsigned* offset_out, + int32_t* type_out, + unsigned* length_out); unsigned depth_; unsigned maximum_depth_; @@ -7910,8 +7886,6 @@ class ConsStringIteratorOp { ConsString* frames_[kStackSize]; unsigned consumed_; ConsString* root_; - int32_t root_type_; - unsigned root_length_; DISALLOW_COPY_AND_ASSIGN(ConsStringIteratorOp); }; @@ -7919,8 +7893,9 @@ class ConsStringIteratorOp { // Note: this class is not GC-safe. class StringCharacterStream { public: - inline StringCharacterStream( - String* string, unsigned offset, ConsStringIteratorOp* op); + inline StringCharacterStream(String* string, + unsigned offset, + ConsStringIteratorOp* op); inline uint16_t GetNext(); inline bool HasMore(); inline void Reset(String* string, unsigned offset, ConsStringIteratorOp* op); diff --git a/src/profile-generator.cc b/src/profile-generator.cc index 6601e51..e14e0f3 100644 --- a/src/profile-generator.cc +++ b/src/profile-generator.cc @@ -112,7 +112,7 @@ const char* StringsStorage::GetCopy(const char* src) { OS::StrNCpy(dst, src, len); dst[len] = '\0'; uint32_t hash = - HashSequentialString(dst.start(), len, HEAP->HashSeed()); + StringHasher::HashSequentialString(dst.start(), len, HEAP->HashSeed()); return AddOrDisposeString(dst.start(), hash); } @@ -145,7 +145,7 @@ const char* StringsStorage::GetVFormatted(const char* format, va_list args) { DeleteArray(str.start()); return format; } - uint32_t hash = HashSequentialString( + uint32_t hash = StringHasher::HashSequentialString( str.start(), len, HEAP->HashSeed()); return AddOrDisposeString(str.start(), hash); } @@ -156,8 +156,8 @@ const char* StringsStorage::GetName(String* name) { int length = Min(kMaxNameSize, name->length()); SmartArrayPointer data = name->ToCString(DISALLOW_NULLS, ROBUST_STRING_TRAVERSAL, 0, length); - uint32_t hash = - HashSequentialString(*data, length, name->GetHeap()->HashSeed()); + uint32_t hash = StringHasher::HashSequentialString( + *data, length, name->GetHeap()->HashSeed()); return AddOrDisposeString(data.Detach(), hash); } return ""; @@ -1451,9 +1451,9 @@ void HeapObjectsMap::RemoveDeadEntries() { SnapshotObjectId HeapObjectsMap::GenerateId(v8::RetainedObjectInfo* info) { SnapshotObjectId id = static_cast(info->GetHash()); const char* label = info->GetLabel(); - id ^= HashSequentialString(label, - static_cast(strlen(label)), - HEAP->HashSeed()); + id ^= StringHasher::HashSequentialString(label, + static_cast(strlen(label)), + HEAP->HashSeed()); intptr_t element_count = info->GetElementCount(); if (element_count != -1) id ^= ComputeIntegerHash(static_cast(element_count), @@ -2940,9 +2940,10 @@ class NativeGroupRetainedObjectInfo : public v8::RetainedObjectInfo { NativeGroupRetainedObjectInfo* NativeObjectsExplorer::FindOrAddGroupInfo( const char* label) { const char* label_copy = collection_->names()->GetCopy(label); - uint32_t hash = HashSequentialString(label_copy, - static_cast(strlen(label_copy)), - HEAP->HashSeed()); + uint32_t hash = StringHasher::HashSequentialString( + label_copy, + static_cast(strlen(label_copy)), + HEAP->HashSeed()); HashMap::Entry* entry = native_groups_.Lookup(const_cast(label_copy), hash, true); if (entry->value == NULL) { diff --git a/src/unicode.h b/src/unicode.h index 91b16c9..f2f257d 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -170,10 +170,6 @@ class Utf8 { // that match are coded as a 4 byte UTF-8 sequence. static const unsigned kBytesSavedByCombiningSurrogates = 2; static const unsigned kSizeOfUnmatchedSurrogate = 3; - - private: - template friend class Utf8InputBuffer; - friend class Test; static inline uchar ValueOf(const byte* str, unsigned length, unsigned* cursor); diff --git a/test/cctest/test-strings.cc b/test/cctest/test-strings.cc index 6f0f96d..652a60a 100644 --- a/test/cctest/test-strings.cc +++ b/test/cctest/test-strings.cc @@ -345,37 +345,24 @@ void AccumulateStats(Handle cons_string, ConsStringStats* stats) { void AccumulateStatsWithOperator( ConsString* cons_string, ConsStringStats* stats) { - // Init op. + unsigned offset = 0; + int32_t type = cons_string->map()->instance_type(); + unsigned length = static_cast(cons_string->length()); ConsStringIteratorOp op; - op.Reset(); - // Use response for initial search and on blown stack. - ConsStringIteratorOp::ContinueResponse response; - response.string_ = cons_string; - response.offset_ = 0; - response.type_ = cons_string->map()->instance_type(); - response.length_ = (uint32_t) cons_string->length(); + String* string = op.Operate(cons_string, &offset, &type, &length); + CHECK(string != NULL); while (true) { - String* string = op.Operate(ConsString::cast(response.string_), - &response.offset_, - &response.type_, - &response.length_); - CHECK(string != NULL); - while (true) { - // Accumulate stats. - stats->leaves_++; - stats->chars_ += string->length(); - // Check for completion. - bool keep_going_fast_check = op.HasMore(); - bool keep_going = op.ContinueOperation(&response); - if (!keep_going) return; - // Verify no false positives for fast check. - CHECK(keep_going_fast_check); - CHECK(response.string_ != NULL); - // Blew stack. Restart outer loop. - if (response.string_->IsConsString()) break; - string = response.string_; - } - }; + ASSERT(!string->IsConsString()); + // Accumulate stats. + stats->leaves_++; + stats->chars_ += string->length(); + // Check for completion. + bool keep_going_fast_check = op.HasMore(); + string = op.ContinueOperation(&type, &length); + if (string == NULL) return; + // Verify no false positives for fast check. + CHECK(keep_going_fast_check); + } } -- 2.7.4