// ----------------------------------------------------------------------------
// UTF8Buffer
-UTF8Buffer::UTF8Buffer() {
- static const int kInitialCapacity = 1 * KB;
- data_ = NewArray<char>(kInitialCapacity);
- limit_ = ComputeLimit(data_, kInitialCapacity);
- Reset();
- ASSERT(Capacity() == kInitialCapacity && pos() == 0);
-}
+UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { }
UTF8Buffer::~UTF8Buffer() {
- DeleteArray(data_);
+ if (data_ != NULL) DeleteArray(data_);
}
int old_capacity = Capacity();
int old_position = pos();
int new_capacity =
- Min(old_capacity * 2, old_capacity + kCapacityGrowthLimit);
+ Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit);
char* new_data = NewArray<char>(new_capacity);
memcpy(new_data, data_, old_position);
DeleteArray(data_);
position_ = position;
- // Reset literals buffer
- literals_.Reset();
-
// Set c0_ (one character ahead)
ASSERT(kCharacterLookaheadBufferSize == 1);
Advance();
+ // Initializer current_ to not refer to a literal buffer.
+ current_.literal_buffer = NULL;
// Skip initial whitespace allowing HTML comment ends just like
// after a newline and scan first token.
void Scanner::StartLiteral() {
- next_.literal_pos = literals_.pos();
+ // Use the first buffer unless it's currently in use by the current_ token.
+ // In most cases we won't have two literals/identifiers in a row, so
+ // the second buffer won't be used very often and is unlikely to grow much.
+ UTF8Buffer* free_buffer =
+ (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_
+ : &literal_buffer_2_;
+ next_.literal_buffer = free_buffer;
+ free_buffer->Reset();
}
void Scanner::AddChar(uc32 c) {
- literals_.AddChar(c);
+ next_.literal_buffer->AddChar(c);
}
void Scanner::TerminateLiteral() {
- next_.literal_end = literals_.pos();
AddChar(0);
}
void Scanner::Scan() {
+ next_.literal_buffer = NULL;
Token::Value token;
has_line_terminator_before_next_ = false;
do {
~UTF8Buffer();
void AddChar(uc32 c) {
+ ASSERT_NOT_NULL(data_);
if (cursor_ <= limit_ &&
static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
*cursor_++ = static_cast<char>(c);
}
}
- void Reset() { cursor_ = data_; }
- int pos() const { return cursor_ - data_; }
+ void Reset() {
+ if (data_ == NULL) {
+ data_ = NewArray<char>(kInitialCapacity);
+ limit_ = ComputeLimit(data_, kInitialCapacity);
+ }
+ cursor_ = data_;
+ }
+
+ int pos() const {
+ ASSERT_NOT_NULL(data_);
+ return cursor_ - data_;
+ }
+
char* data() const { return data_; }
private:
+ static const int kInitialCapacity = 256;
char* data_;
char* cursor_;
char* limit_;
int Capacity() const {
+ ASSERT_NOT_NULL(data_);
return (limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
}
// token returned by Next()). The string is 0-terminated and in
// UTF-8 format; they may contain 0-characters. Literal strings are
// collected for identifiers, strings, and numbers.
+ // These functions only give the correct result if the literal
+ // was scanned between calls to StartLiteral() and TerminateLiteral().
const char* literal_string() const {
- return &literals_.data()[current_.literal_pos];
+ return current_.literal_buffer->data();
}
int literal_length() const {
- return current_.literal_end - current_.literal_pos;
- }
-
- Vector<const char> next_literal() const {
- return Vector<const char>(next_literal_string(), next_literal_length());
+ // Excluding terminal '\0' added by TerminateLiteral().
+ return current_.literal_buffer->pos() - 1;
}
// Returns the literal string for the next token (the token that
// would be returned if Next() were called).
const char* next_literal_string() const {
- return &literals_.data()[next_.literal_pos];
+ return next_.literal_buffer->data();
}
// Returns the length of the next token (that would be returned if
// Next() were called).
int next_literal_length() const {
- return next_.literal_end - next_.literal_pos;
+ return next_.literal_buffer->pos() - 1;
+ }
+
+ Vector<const char> next_literal() const {
+ return Vector<const char>(next_literal_string(),
+ next_literal_length());
}
// Scans the input as a regular expression pattern, previous
// Buffer to hold literal values (identifiers, strings, numbers)
// using 0-terminated UTF-8 encoding.
- UTF8Buffer literals_;
+ UTF8Buffer literal_buffer_1_;
+ UTF8Buffer literal_buffer_2_;
bool stack_overflow_;
static StaticResource<Utf8Decoder> utf8_decoder_;
struct TokenDesc {
Token::Value token;
Location location;
- int literal_pos, literal_end;
+ UTF8Buffer* literal_buffer;
};
TokenDesc current_; // desc for current token (as returned by Next())