1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "tools/gn/tokenizer.h"
7 #include "base/logging.h"
8 #include "tools/gn/input_file.h"
// Returns true if |c| may be the first character of a two-character
// operator. The character that follows still has to satisfy
// CouldBeTwoCharOperatorEnd() for the pair to be consumed as one token.
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
// Returns true if |c| may be the second character of a two-character
// operator.
bool CouldBeTwoCharOperatorEnd(char c) {
  return c == '=' || c == '|' || c == '&';
}
// Returns true if |c| is one of the characters that may appear as a
// single-character operator.
bool CouldBeOneCharOperator(char c) {
  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
         c == ':' || c == '|' || c == '&' || c == '-';
}
26 bool CouldBeOperator(char c) {
27 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
// Returns true if |c| is an opening or closing parenthesis, square bracket
// or curly brace.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
// Maps the text of an operator token (|value|) to a specific Token::Type.
// The last visible statement returns Token::INVALID.
// NOTE(review): the conditions guarding each return below are not present in
// this copy of the file (the embedded listing numbers skip them), so which
// operator string selects which result cannot be confirmed from here. The
// skipped numbers near the start and end also suggest further cases that are
// not visible.
34 Token::Type GetSpecificOperatorType(base::StringPiece value) {
42 return Token::PLUS_EQUALS;
44 return Token::MINUS_EQUALS;
46 return Token::EQUAL_EQUAL;
48 return Token::NOT_EQUAL;
50 return Token::LESS_EQUAL;
52 return Token::GREATER_EQUAL;
54 return Token::LESS_THAN;
56 return Token::GREATER_THAN;
58 return Token::BOOLEAN_AND;
60 return Token::BOOLEAN_OR;
65 return Token::INVALID;
// Constructs a tokenizer over |input_file|. The visible initializers store
// the file pointer in input_file_ and initialize input_ from
// input_file->contents(), so |input_file| must not be null.
// NOTE(review): the remaining member initializers (including how |err| is
// stored) and the constructor body are not visible in this copy.
70 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
71 : input_file_(input_file),
72 input_(input_file->contents()),
// Destructor.
// NOTE(review): the body and closing brace are not visible in this copy.
79 Tokenizer::~Tokenizer() {
// Convenience entry point: constructs a local Tokenizer for |input_file|
// that reports through |err|.
// NOTE(review): the statement producing the return value is missing from
// this copy; presumably it returns the result of t.Run() — confirm.
83 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
84 Tokenizer t(input_file, err);
// Main tokenization routine. For each token it records the starting
// location, classifies the token, advances past it, refines the type
// (specific operator or keyword), and appends it to tokens_ unless it is a
// comment. Failures are reported through *err_.
// NOTE(review): the loop header, the statements that leave the loop on
// error, two keyword assignments and the final return are missing from this
// copy; comments below describe only the visible lines.
88 std::vector<Token> Tokenizer::Run() {
// Run() expects to start with an empty token list.
89 DCHECK(tokens_.empty());
// Remember where the token starts, for the Token itself and for errors.
94 Location location = GetCurrentLocation();
96 Token::Type type = ClassifyCurrent();
// Nothing recognizable starts here: record a descriptive error.
97 if (type == Token::INVALID) {
98 *err_ = GetErrorForInvalidToken(location);
// The token's text is the byte range [token_begin, token_end) of input_.
101 size_t token_begin = cur_;
102 AdvanceToEndOfToken(location, type);
105 size_t token_end = cur_;
// token_value is a view into input_, not a copy.
107 base::StringPiece token_value(&input_.data()[token_begin],
108 token_end - token_begin);
// Operators are classified generically first; now that the full text is
// known, resolve the specific operator type.
110 if (type == Token::UNCLASSIFIED_OPERATOR)
111 type = GetSpecificOperatorType(token_value);
// Identifiers spelling a keyword or boolean literal get their own type.
// NOTE(review): the assignments for "if" and "else" are not visible here.
112 if (type == Token::IDENTIFIER) {
113 if (token_value == "if")
115 else if (token_value == "else")
117 else if (token_value == "true")
118 type = Token::TRUE_TOKEN;
119 else if (token_value == "false")
120 type = Token::FALSE_TOKEN;
123 // TODO(brettw) This just strips comments from the token stream. This
124 // is probably wrong, they should be removed at a later stage so we can
125 // do things like rewrite the file. But this makes the parser simpler and
127 if (type != Token::COMMENT)
128 tokens_.push_back(Token(location, type, token_value));
// NOTE(review): the statement controlled by this check is not visible;
// presumably the tokens collected so far are discarded on error — confirm.
130 if (err_->has_error())
// Walks |buf| byte by byte, using IsNewline() to detect line breaks. The
// last visible statement returns static_cast<size_t>(-1).
// NOTE(review): the line-counting logic and the success return are missing
// from this copy. Presumably this returns the byte offset at which line |n|
// begins and the -1 sentinel when |buf| has fewer lines — confirm, including
// whether |n| is 0- or 1-based.
136 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
144 while (cur_byte < buf.size()) {
145 if (IsNewline(buf, cur_byte)) {
152 return static_cast<size_t>(-1);
156 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
157 DCHECK(offset < buffer.size());
158 // We may need more logic here to handle different line ending styles.
159 return buffer[offset] == '\n';
// Skips whitespace: loops while the cursor is not at the end of the input
// and the current character is whitespace.
// NOTE(review): the loop body is missing from this copy; presumably it calls
// Advance() — confirm.
163 void Tokenizer::AdvanceToNextToken() {
164 while (!at_end() && IsCurrentWhitespace())
// Decides what kind of token starts at the current character by inspecting
// that character (and, for '-', the one after it). Returns Token::INVALID
// when nothing matches. Operators are reported generically as
// Token::UNCLASSIFIED_OPERATOR; Run() resolves the specific operator once
// the full token text is known.
// NOTE(review): a few lines are absent from this copy (the results returned
// for '.' and ',' and the guard on the first return in the '-' branch), so
// those outcomes cannot be confirmed from here.
168 Token::Type Tokenizer::ClassifyCurrent() const {
170 char next_char = cur_char();
// A leading digit starts an integer literal.
171 if (IsAsciiDigit(next_char))
172 return Token::INTEGER;
// A double quote starts a string literal.
173 if (next_char == '"')
174 return Token::STRING;
176 // Note: '-' handled specially below.
177 if (next_char != '-' && CouldBeOperator(next_char))
178 return Token::UNCLASSIFIED_OPERATOR;
180 if (IsIdentifierFirstChar(next_char))
181 return Token::IDENTIFIER;
// Each scoping character has its own token type.
183 if (next_char == '[')
184 return Token::LEFT_BRACKET;
185 if (next_char == ']')
186 return Token::RIGHT_BRACKET;
187 if (next_char == '(')
188 return Token::LEFT_PAREN;
189 if (next_char == ')')
190 return Token::RIGHT_PAREN;
191 if (next_char == '{')
192 return Token::LEFT_BRACE;
193 if (next_char == '}')
194 return Token::RIGHT_BRACE;
// NOTE(review): the return statements for the next two checks are missing.
196 if (next_char == '.')
198 if (next_char == ',')
// '#' begins a comment.
201 if (next_char == '#')
202 return Token::COMMENT;
204 // For the case of '-' differentiate between a negative number and anything
206 if (next_char == '-') {
// NOTE(review): the condition guarding this return is not visible;
// presumably it checks that no further input follows the '-'.
208 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
// A '-' immediately followed by a digit is a negative integer literal;
// otherwise it is an operator.
210 char following_char = input_[cur_ + 1];
211 if (IsAsciiDigit(following_char))
212 return Token::INTEGER;
213 return Token::UNCLASSIFIED_OPERATOR;
216 return Token::INVALID;
// Moves the cursor past the token that begins at the current position,
// using the token's type to decide where it ends. |location| is the token's
// starting location and is used to build error ranges. Problems are reported
// by assigning to *err_.
// NOTE(review): this copy is missing the rest of the signature, the switch
// header, several case labels, loop constructs and break/return statements.
// The comments below describe only what is visible.
219 void Tokenizer::AdvanceToEndOfToken(const Location& location,
// Tail of a do/while that consumes consecutive ASCII digits.
225 } while (!at_end() && IsAsciiDigit(cur_char()));
227 // Require the char after a number to be some kind of space, scope,
// NOTE(review): the definition of |c| is not visible; presumably it is the
// character at the cursor — confirm.
230 if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
231 !IsScoperChar(c) && c != ',') {
232 *err_ = Err(GetCurrentLocation(),
233 "This is not a valid number.",
235 // Highlight the number.
236 err_->AppendRange(LocationRange(location, GetCurrentLocation()));
241 case Token::STRING: {
// Remember the opening quote so the matching terminator can be recognized.
242 char initial = cur_char();
243 Advance(); // Advance past initial "
// NOTE(review): the condition for this error is not visible; presumably
// the input ended before a terminator was found.
246 *err_ = Err(LocationRange(location, GetCurrentLocation()),
247 "Unterminated string literal.",
248 "Don't leave me hanging like this!");
// An unescaped closing quote ends the string; a raw newline inside the
// literal is an error.
251 if (IsCurrentStringTerminator(initial)) {
252 Advance(); // Skip past last "
254 } else if (cur_char() == '\n') {
255 *err_ = Err(LocationRange(location, GetCurrentLocation()),
256 "Newline in string constant.");
263 case Token::UNCLASSIFIED_OPERATOR:
264 // Some operators are two characters, some are one.
// CanIncrement() guards the read of input_[cur_ + 1].
265 if (CouldBeTwoCharOperatorBegin(cur_char())) {
266 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
272 case Token::IDENTIFIER:
// NOTE(review): loop body not visible; presumably Advance().
273 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
277 case Token::LEFT_BRACKET:
278 case Token::RIGHT_BRACKET:
279 case Token::LEFT_BRACE:
280 case Token::RIGHT_BRACE:
281 case Token::LEFT_PAREN:
282 case Token::RIGHT_PAREN:
285 Advance(); // All are one char.
// Runs until the end of the current line or the end of the input.
// NOTE(review): the case label and loop body are not visible; presumably
// this is the comment case — confirm.
290 while (!at_end() && !IsCurrentNewline())
// NOTE(review): the label for this error is not visible; presumably it is
// the fallback for token types not handled above.
296 *err_ = Err(location, "Everything is all messed up",
297 "Please insert system disk in drive A: and press any key.");
// Returns true if the byte at the cursor is LF (0x0A), VT (0x0B), FF (0x0C),
// CR (0x0D) or space (0x20). Tab is deliberately not treated as whitespace;
// GetErrorForInvalidToken() has a dedicated message for it.
// NOTE(review): one line between the signature and the first statement is
// missing from this copy; input_[cur_] is read without a visible bounds
// check.
303 bool Tokenizer::IsCurrentWhitespace() const {
305 char c = input_[cur_];
306 // Note that tab (0x09) is illegal.
307 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
// Decides whether the current character closes a string literal opened with
// |quote_char|. The visible logic compares the current character with
// |quote_char| and then counts the backslashes immediately before the
// cursor; an even count means the quote is not escaped.
// NOTE(review): the statement taken when the character differs from
// |quote_char| and the loop body are missing from this copy; presumably they
// return false and increment num_backslashes respectively — confirm.
310 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
312 if (cur_char() != quote_char)
315 // Check for escaping. \" is not a string terminator, but \\" is. Count
316 // the number of preceding backslashes.
317 int num_backslashes = 0;
// Scan backwards from the character just before the cursor while the
// characters are backslashes; i >= 0 stops at the start of the input.
318 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
321 // Even backslashes mean that they were escaping each other and don't count
322 // as escaping this quote.
323 return (num_backslashes % 2) == 0;
326 bool Tokenizer::IsCurrentNewline() const {
327 return IsNewline(input_, cur_);
// Moves the cursor forward. Must not be called at the end of the input
// (DCHECKed). A newline at the cursor is handled as a special case.
// NOTE(review): the rest of the body is missing from this copy; presumably
// it updates line_number_, char_in_line_ and cur_ — confirm.
330 void Tokenizer::Advance() {
331 DCHECK(cur_ < input_.size());
332 if (IsCurrentNewline()) {
341 Location Tokenizer::GetCurrentLocation() const {
342 return Location(input_file_, line_number_, char_in_line_);
// Builds the "Invalid token." error for the character at the cursor,
// attached to |location|. The help text is chosen from the offending
// character: a semicolon, a tab, the start of a C/C++-style comment, or a
// generic fallback.
// NOTE(review): the declaration of |help| and the header of the final
// fallback branch are missing from this copy.
345 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
347 if (cur_char() == ';') {
349 help = "Semicolons are not needed, delete this one.";
350 } else if (cur_char() == '\t') {
352 help = "You got a tab character in here. Tabs are evil. "
353 "Convert to spaces.";
// The bounds check keeps the read of input_[cur_ + 1] inside the input.
354 } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
355 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
356 // Different types of comments.
357 help = "Comments should start with # instead";
359 help = "I have no idea what this is.";
362 return Err(location, "Invalid token.", help);