--- /dev/null
+/*
+ * Copyright 2017 Samsung Electronics Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Lexer.hpp"
+#include "../UniversalSwitchLog.hpp"
+
+#include <sstream>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+#include <iomanip>
+
+/**
+ * Lexing support
+ *
 * Lexing is the process of splitting text (an array of characters) into atomic entities called tokens,
 * which later on will be used by the parser. Every token has text (which is part of the source code),
 * a location in the source code (in our case only the beginning) and a type. The token's type is important,
 * as it's the main decision-making tool for the parser.
+ *
+ * Note, that this lexer's tokens definitions are overgreedy. For example number is defined as
+ * all characters, starting with digit, then followed by digit or letter (a-z or A-Z) or underscore.
+ * This is mostly for safety (maybe integer followed by identifier is valid grammar in parser later on?)
+ * and to make errors slightly clearer.
+ *
+ * Token's type is determined by first character, except for dot. Single dot will be OPERATOR, unless
+ * it's followed by digit, in which case it will be a DOUBLE. Also INTEGER followed by dot will be
+ * promoted to DOUBLE.
+ *
+ * Lexing itself is happening in lex() method. For given source code position every type of
+ * token is tried until one matching is found or an error is reported (in that case lexing ends).
+ * There's no order restrictions, except that test for operator must be performed after test for number.
+ */
+
+/**
+ * @brief Returns textual representation of TokenType enumerator
+ */
+std::string toString(TokenType t)
+{
+ switch (t) {
+ case TokenType::OPERATOR:
+ return "OPERATOR";
+ case TokenType::IDENTIFIER:
+ return "IDENTIFIER";
+ case TokenType::KEYWORD:
+ return "KEYWORD";
+ case TokenType::INTEGER:
+ return "INTEGER";
+ case TokenType::DOUBLE:
+ return "DOUBLE";
+ case TokenType::STRING:
+ return "STRING";
+ case TokenType::END_OF_LINE:
+ return "END_OF_LINE";
+ }
+ ASSERT(0);
+ return "";
+}
+
/**
 * @brief Keywords
 *
 * While lexing an identifier, its text is looked up in this set.
 * If found, the resulting token will be KEYWORD rather than IDENTIFIER.
 */
static std::unordered_set<std::string> keywords {
	"false",
	"true",
};
+
/**
 * @brief Operators
 *
 * All acceptable operators, grouped and sorted by length (longest first).
 * Note, that when adding a new operator, you must put it in the collection with
 * the correct size (first element of the pair).
 */
static std::vector<std::pair<unsigned int, std::unordered_set<std::string>>> operatorsbyLength {
	{ 2, { "==", "!=", "<=", ">=" } },
	{
		1, {
			// FIX: "-" was listed twice here; the duplicate was silently dropped
			// by the set, so removing it changes nothing at runtime.
			"<", ">", "(", ")", ",", "=", "[", "]", "+", "-", ".", "*", "/", "{", "}",
			"!", ":",
		}
	},
};
+
+TokenLocation::TokenLocation(std::shared_ptr<LineInfo> line, unsigned int offset) :
+ line(std::move(line)), offset(offset) { }
+
+const std::string &TokenLocation::fileName() const
+{
+ return *line->fileName;
+}
+
+unsigned int TokenLocation::lineNum() const
+{
+ return line->lineNum;
+}
+
+unsigned int TokenLocation::offsetNum() const
+{
+ return offset;
+}
+
+const std::string &TokenLocation::lineContent() const
+{
+ return line->lineContent;
+}
+
+std::string TokenLocation::toString() const
+{
+ std::ostringstream ostr;
+ ostr << *line->fileName << ":" << line->lineNum << ":" << offset;
+ return ostr.str();
+}
+
// Constructor: takes ownership of the token's text and location by move.
Token::Token(std::string text, TokenLocation location_, TokenType type) :
	text_(std::move(text)), location_(std::move(location_)), type_(type)
{
}

// Text of the token; may be empty for some types (e.g. END_OF_LINE).
const std::string &Token::text() const
{
	return text_;
}

// Location of the first character of the token in the source.
const TokenLocation &Token::location() const
{
	return location_;
}

// Type of the token, the parser's main dispatch key.
TokenType Token::type() const
{
	return type_;
}
+
+/**
+ * @brief Implementation of lexing functionality
+ */
+class Lexer
+{
+ /**
+ * @brief Produced tokens
+ */
+ std::vector<Token> result;
+ /**
+ * @brief Placeholder for error message, if any. First error ends lexing.
+ */
+ std::string &errorMessage;
+
+ /**
+ * @brief Name of the file to lex, used only for reporting errors.
+ */
+ const std::string &fileName;
+
+ /**
+ * @brief File's contents to lex.
+ */
+ const std::string &source;
+
+ /**
+ * @brief Copy of the file name, as shared string. It will be shared among all tokens from this file.
+ */
+ std::shared_ptr<std::string> fileNamePtr;
+
+ /**
+ * @brief LineInfo object representing currently lexed line.
+ */
+ std::shared_ptr<TokenLocation::LineInfo> lineInfo;
+
+ /**
+ * @brief Index of the current line
+ */
+ unsigned int line = 0;
+
+ /**
+ * @brief Index in source of the current's line first character.
+ */
+ unsigned int lineStart = 0;
+
+ /**
+ * @brief Index in source of currently lexed character.
+ */
+ unsigned int sourceIndex = 0;
+
+ /**
+ * @brief Updates internal states, when new line was detected. Also called at initialization (when line == 0).
+ */
+ void newLine()
+ {
+ if (line != 0)
+ addToken(sourceIndex, TokenType::END_OF_LINE);
+ lineStart = sourceIndex + (line != 0 ? 1 : 0);
+ ++line;
+ auto nextLineIndex = lineStart;
+ for (; nextLineIndex < source.size() && source[nextLineIndex] != '\n'; ++nextLineIndex);
+ auto lineContent = source.substr(lineStart, nextLineIndex - lineStart);
+ lineInfo = std::make_shared<TokenLocation::LineInfo>(TokenLocation::LineInfo{
+ lineContent, fileNamePtr, line
+ });
+ }
+ /**
+ * @brief Returns true, if c is white space character. Currently any non-writeable character
+ * (with ascii code equal or less to 32) is considered whitespace (tab, vertical tab and so on).
+ */
+ static bool isWhitespace(char c)
+ {
+ return c <= ' ';
+ }
+ /**
+ * @brief Returns true, if c is a ascii letter
+ */
+ static bool isAlpha(char c)
+ {
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+ }
+ /**
+ * @brief Returns true, if c is a digit
+ */
+ static bool isDigit(char c)
+ {
+ return (c >= '0' && c <= '9');
+ }
+ /**
+ * @brief Returns true, if c is character, that can be part of identifier (letter, digit or underscore)
+ */
+ static bool isAlNum(char c)
+ {
+ return isAlpha(c) || isDigit(c) || c == '_';
+ }
+ /**
+ * @brief Creates new TokenLocation object for current character
+ */
+ TokenLocation location() const
+ {
+ return { lineInfo, 1 + sourceIndex - lineStart };
+ }
+ /**
+ * @brief Sets error message (only first error will be recorded, in case more than one happen before loop ended)
+ */
+ void setErrorMessage(const std::string &msg)
+ {
+ if (errorMessage.empty()) {
+ errorMessage = location().toString() + ": " + msg;
+ }
+ }
+ /**
+ * @brief Skips current character, calls newLine if end of line character was skipped
+ */
+ void skipCharacter()
+ {
+ if (source[sourceIndex] == '\n')
+ newLine();
+ ++sourceIndex;
+ }
+ /**
+ * @brief Creates and adds to result new token
+ *
+ * Token's text will span from currentIndex to untilCharIndex. After call currentIndex will be set
+ * to untilCharIndex.
+ */
+ void addToken(unsigned int untilCharIndex, TokenType type)
+ {
+ ASSERT(untilCharIndex >= sourceIndex);
+ auto text = source.substr(sourceIndex, untilCharIndex - sourceIndex);
+ if (type == TokenType::IDENTIFIER && keywords.find(text) != keywords.end()) type = TokenType::KEYWORD;
+ result.push_back(Token{ std::move(text), location(), type });
+ sourceIndex = untilCharIndex;
+ }
+ /**
+ * @brief Skips all white space characters, calling newLine as needed
+ */
+ bool skipWhiteSpaces()
+ {
+ if (!isWhitespace(source[sourceIndex])) return false;
+ while (sourceIndex < source.size() && isWhitespace(source[sourceIndex])) skipCharacter();
+ return true;
+ }
+ /**
+ * @brief Tries to parse number (INTEGER or DOUBLE)
+ *
+ * In regexp terms number will be one of following:
+ * \.[0-9][a-zA-Z_0-9]*
+ * [0-9][a-zA-Z_0-9]*(\.[a-zA-Z_0-9]*)?
+ * Returns true, if parsed, false if not.
+ */
+ bool tryNumber()
+ {
+ // NOTE: this does parse number, but doesn't support scientific (like 1.2e10) notation
+ size_t index = sourceIndex;
+ if (!isDigit(source[index])) {
+ if (source[index] == '.' && index + 1 < source.size() && isDigit(source[index])) {
+ index += 2;
+ while (index < source.size() && isAlNum(source[index])) ++index;
+ addToken(index, TokenType::DOUBLE);
+ return true;
+ }
+ return false;
+ }
+ while (index < source.size() && isAlNum(source[index])) ++index;
+ TokenType type = TokenType::INTEGER;
+ if (index < source.size() && source[index] == '.') {
+ ++index;
+ while (index < source.size() && isAlNum(source[index])) ++index;
+ type = TokenType::DOUBLE;
+ }
+ addToken(index, type);
+ return true;
+ }
+ /**
+ * @brief Tries to parse string
+ *
+ * String might be enclosed with ' or " at user choosing. String enclosed with ' might contain
+ * any number of ", the reverse is also true.
+ * It's an error to not finish string before end of line.
+ * Returns true, if parsed, false if not.
+ */
+ bool tryString()
+ {
+ if (source[sourceIndex] != '\'' && source[sourceIndex] != '"') return false;
+ const auto startChar = source[sourceIndex];
+ skipCharacter();
+ auto index = sourceIndex;
+ const auto startLine = line;
+ const auto startOffset = 1 + sourceIndex - lineStart;
+ while (true) {
+ if (index >= source.size()) {
+ setErrorMessage("unfinished string at line " + std::to_string(startLine) +
+ ", offset " + std::to_string(startOffset));
+ break;
+ }
+ auto c = source[index];
+ if (c == '\n') {
+ setErrorMessage("unexpected end of line, while parsing string starting at offset " + std::to_string(startOffset));
+ break;
+ }
+ if (c == startChar) {
+ addToken(index, TokenType::STRING);
+ skipCharacter();
+ break;
+ }
+ ++index;
+ }
+ return true;
+ }
+ /**
+ * @brief Tries to parse identifier or keyword
+ *
+ * Keyword is also an identifier, so first Identifier is parsed. Identifier is stream of letters, digits or underscores,
+ * not starting with digit. If identifier is found in keywords set, then it is promoted to KEYWORD.
+ * Returns true, if parsed, false if not.
+ */
+ bool tryIdentifierOrKeyword()
+ {
+ if (!isAlpha(source[sourceIndex])) return false;
+ auto index = sourceIndex;
+ while (index < source.size() && isAlNum(source[index])) ++index;
+ addToken(index, TokenType::IDENTIFIER);
+ return true;
+ }
+ /**
+ * @brief Tries to parse operator
+ *
+ * Operator parsing should be the last one (for example dot should be first checked, if it's not part of number).
+ * Operators are tried from longest to shortest.
+ * Returns true, if parsed, false if not.
+ */
+ bool tryOperator()
+ {
+ for (auto &oper : operatorsbyLength) {
+ auto len = std::get<0>(oper);
+ if (sourceIndex + len <= source.size()) {
+ auto &operSet = std::get<1>(oper);
+ auto operToCheck = source.substr(sourceIndex, len);
+ if (operSet.find(operToCheck) != operSet.end()) {
+ addToken(sourceIndex + len, TokenType::OPERATOR);
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+public:
+ /**
+ * @brief Constructor
+ */
+ Lexer(std::string &errorMessage, const std::string &fileName, const std::string &source) :
+ errorMessage(errorMessage), fileName(fileName), source(source)
+ {
+ }
+
+ /** @brief Main lexing function
+ *
+ * Every new token will consume at least one character. Function will try all token types,
+ * until either end of file will be reached, error happens when parsing or no token will
+ * be produced (in which case unrecognized character error will be created)
+ */
+ std::vector<Token> lex()
+ {
+ using namespace std::literals::string_literals;
+
+ fileNamePtr = std::make_shared<std::string>(fileName);
+ newLine();
+ while (errorMessage.empty() && sourceIndex < source.size()) {
+ if (skipWhiteSpaces() ||
+ tryIdentifierOrKeyword() ||
+ tryOperator() ||
+ tryString() ||
+ tryNumber())
+ continue;
+ std::ostringstream tmp;
+ tmp << "invalid character 0x" <<
+ std::setfill('0') << std::setw(2) << std::hex << (unsigned int)source[sourceIndex];
+ setErrorMessage(tmp.str());
+ return {};
+ }
+ if (!errorMessage.empty()) return {};
+ if (result.empty() || result.back().type() != TokenType::END_OF_LINE)
+ addToken(sourceIndex, TokenType::END_OF_LINE);
+ return move(result);
+ }
+};
+
+std::vector<Token> lexTest(std::string &errorMessage, const std::string &fileName, const std::string &testSource)
+{
+ Lexer p{ errorMessage, fileName, testSource };
+ return p.lex();
+}
--- /dev/null
+/*
+ * Copyright 2017 Samsung Electronics Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TEST_LEXER
+#define TEST_LEXER
+
+#include <vector>
+#include <string>
+#include <memory>
+
/**
 * @brief Type of the token
 *
 * @var OPERATOR one of predefined operators (for example + - and so on). Note, that : is also an operator.
 * @var IDENTIFIER identifier, consists of ascii letters (a-z or A-Z), digits or underscore, doesn't start with digit.
 * @var KEYWORD identifier, that has special meaning (for example true or false). Keyword can't be used to create new variable.
 * @var INTEGER number, consists of ascii letters (a-z or A-Z), digits or underscore, starts with digit.
 * @var DOUBLE the same as integer, but with single dot. Dot might be first character (.123), in middle (123.456) or last (123.)
 * @var STRING text enclosed with ' or ". The quotes themselves are not part of the token's text.
 * @var END_OF_LINE special token representing end of line; its text is empty.
 */
enum class TokenType {
	OPERATOR, IDENTIFIER, KEYWORD, INTEGER, DOUBLE, STRING, END_OF_LINE
};

/**
 * @brief Returns textual representation of TokenType enumerator (e.g. "OPERATOR")
 */
std::string toString(TokenType t);
+
/**
 * @brief Location of the beginning of the token in source file
 */
class TokenLocation
{
public:
	/**
	 * @brief Structure representing information about single line of source code
	 *
	 * @var lineContent text of the given line of source code
	 * @var fileName shared pointer to the fileName string
	 * @var lineNum line number, counting from one
	 */
	struct LineInfo {
		std::string lineContent;
		std::shared_ptr<std::string> fileName;
		unsigned int lineNum;
	};

	/**
	 * @brief constructor
	 *
	 * @param line shared LineInfo object, representing line. It's expected, that the same line
	 * will get the same LineInfo object, but it's not mandatory.
	 * @param offset offset from the beginning of the line, counting from one
	 */
	TokenLocation(std::shared_ptr<LineInfo> line, unsigned int offset);

	/**
	 * @brief Returns name of the file, to which token location points
	 */
	const std::string &fileName() const;

	/**
	 * @brief Returns line's index (counting from one), to which token location points
	 */
	unsigned int lineNum() const;

	/**
	 * @brief Returns offset from the beginning of the line (counting from one), to which token location points
	 */
	unsigned int offsetNum() const;

	/**
	 * @brief Returns whole line's content (text)
	 */
	const std::string &lineContent() const;

	/**
	 * @brief Returns token location as text, in format "fileName:lineNum:offsetNum"
	 */
	std::string toString() const;
private:
	/// \cond
	std::shared_ptr<LineInfo> line;
	unsigned int offset;
	/// \endcond
};
+
/**
 * @brief Token objects represent single token of code
 *
 * Every object has its location pointing to source, its full text (for some types might be empty)
 * and its type
 */
class Token
{
public:
	/**
	 * @brief Constructor
	 */
	Token(std::string text, TokenLocation location, TokenType type);

	/**
	 * @brief Returns text of the token, for some types (e.g. END_OF_LINE) might be empty
	 */
	const std::string &text() const;

	/**
	 * @brief Returns location of the beginning of the token's text
	 */
	const TokenLocation &location() const;

	/**
	 * @brief Returns token's type
	 */
	TokenType type() const;
private:
	/// \cond
	std::string text_;
	TokenLocation location_;
	TokenType type_;
	/// \endcond
};
+
/**
 * @brief Turns testSource into vector of tokens
 *
 * Turns every character of testSource into tokens. Whitespaces are ignored (not present as tokens).
 * A non-whitespace character not acceptable as part of any token will cause an error, in which case
 * errorMessage will be set and an empty vector will be returned. In case of success a non-empty vector
 * of tokens will be returned (at minimum it will contain a single END_OF_LINE token) and errorMessage
 * will be empty.
 * Note, that lexing is greedy, the longest matching text will be accepted, even if at later stages
 * it will make the token unparseable. For example 123abc will be lexed as integer 123abc, rather than
 * integer 123 followed by identifier abc. It's done this way to make further parsing errors easier to understand.
 */
std::vector<Token> lexTest(std::string &errorMessage, const std::string &fileName, const std::string &testSource);
+
+#endif
\ No newline at end of file
--- /dev/null
+/*
+ * Copyright 2017 Samsung Electronics Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include "batch/Lexer.hpp"
+
// End-to-end check of the lexer over a representative two-line input: identifiers,
// operators, an integer, both string quoting styles, a keyword and END_OF_LINE tokens.
TEST(TestExec, simpleLexer)
{
	std::string error;
	auto tokens = lexTest(error, "test",
						  // The ruler below marks 1-based column offsets in the first line.
						  // 1 3 5 7 911 3 5 7 921 3 5 7 931 3 5 7 9
						  "v = function(1, 'qwe', \"rty\", true)\n"
						  "v\n");
	ASSERT_TRUE(error.empty()) << error;

	// Expected tuples: (token text, 1-based line, 1-based offset, type).
	// Note: string tokens exclude their quotes, so 'qwe' starts at offset 18 (past the quote).
	const auto expectedTokens = std::vector<std::tuple<std::string, unsigned int, unsigned int, TokenType>> {
		{ "v", 1, 1, TokenType::IDENTIFIER},
		{ "=", 1, 3, TokenType::OPERATOR },
		{ "function", 1, 5, TokenType::IDENTIFIER },
		{ "(", 1, 13, TokenType::OPERATOR },
		{ "1", 1, 14, TokenType::INTEGER },
		{ ",", 1, 15, TokenType::OPERATOR },
		{ "qwe", 1, 18, TokenType::STRING },
		{ ",", 1, 22, TokenType::OPERATOR },
		{ "rty", 1, 25, TokenType::STRING },
		{ ",", 1, 29, TokenType::OPERATOR },
		{ "true", 1, 31, TokenType::KEYWORD },
		{ ")", 1, 35, TokenType::OPERATOR },
		{ "", 1, 36, TokenType::END_OF_LINE },

		{ "v", 2, 1, TokenType::IDENTIFIER },
		{ "", 2, 2, TokenType::END_OF_LINE },
	};

	ASSERT_EQ(tokens.size(), expectedTokens.size());

	// Compare every produced token field-by-field against the expectation.
	for (auto i = 0u; i < tokens.size(); ++i) {
		ASSERT_EQ(tokens[i].text(), std::get<0>(expectedTokens[i])) << "token " << i;
		ASSERT_EQ(tokens[i].location().lineNum(), std::get<1>(expectedTokens[i])) << "token " << i;
		ASSERT_EQ(tokens[i].location().offsetNum(), std::get<2>(expectedTokens[i])) << "token " << i;
		ASSERT_EQ(tokens[i].type(), std::get<3>(expectedTokens[i])) << "token " << i;
	}
}
+
// Checks that a bare activity name (no trailing newline) still lexes and
// is classified as an IDENTIFIER.
TEST(TestExec, activityLexing)
{
	std::string error;
	auto tokens = lexTest(error, "test",
						  "TAP");
	ASSERT_TRUE(error.empty()) << error;

	ASSERT_FALSE(tokens.empty());
	ASSERT_EQ(tokens[0].type(), TokenType::IDENTIFIER);
}
+
// Test runner entry point. The catch-all converts any escaping exception into
// a non-zero exit code, so the test binary never terminates via an unhandled throw.
int main(int argc, char *argv[])
{
	try {
		::testing::InitGoogleTest(&argc, argv);
		return RUN_ALL_TESTS();
	} catch (...) {
		return 1;
	}
}
+