2 * lex.c - ktap lexical analyzer
4 * This file is part of ktap by Jovi Zhangwei.
6 * Copyright (C) 2012-2013 Jovi Zhangwei <jovi.zhangwei@gmail.com>.
8 * Copyright (C) 1994-2013 Lua.org, PUC-Rio.
9 * - The part of code in this file is copied from lua initially.
10 * - lua's MIT license is compatible with GPL.
12 * ktap is free software; you can redistribute it and/or modify it
13 * under the terms and conditions of the GNU General Public License,
14 * version 2, as published by the Free Software Foundation.
16 * ktap is distributed in the hope it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License along with
22 * this program; if not, write to the Free Software Foundation, Inc.,
23 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
30 #include "../include/ktap_types.h"
31 #include "../include/ktap_opcodes.h"
34 #define next(ls) ((ls)->current = *(ls)->ptr++) /* load the character at ls->ptr into ls->current and advance ls->ptr; the argument is parenthesized so any pointer expression expands safely */
36 #define currIsNewline(ls) ((ls)->current == '\n' || (ls)->current == '\r') /* true when the current character is '\n' or '\r'; the argument is parenthesized so any pointer expression expands safely */
38 #define KTAP_MINBUFFER 32
41 static const char *const ktap_tokens [] = { /* printable token names; lex_token2str() indexes this table with (token - FIRST_RESERVED) and the first NUM_RESERVED entries are interned as reserved words */
42 "trace", "trace_end", "argevent", "argname", "cdef",
43 "arg1", "arg2", "arg3", "arg4", "arg5", "arg6", "arg7", "arg8", "arg9", /* one distinct entry per argN keyword, arg1 through arg9 */
44 "profile", "tick", "<<<",
45 "and", "break", "do", "else", "elseif",
46 "end", "false", "for", "function", "goto", "if",
47 "in", "local", "nil", "not", "or", "repeat",
48 "return", "then", "true", "until", "while",
49 "..", "...", "==", ">=", "<=", "!=", "+=", "::", "<eof>",
50 "<number>", "<name>", "<string>", "<symbol>"
53 #define save_and_next(ls) (save((ls), (ls)->current), next(ls)) /* append the current character to the token buffer via save(), then advance to the next input character; the argument is parenthesized so any pointer expression expands safely */
55 static void lexerror(ktap_lexstate *ls, const char *msg, int token);
57 static void save(ktap_lexstate *ls, int c)
59 ktap_mbuffer *b = ls->buff;
60 if (mbuff_len(b) + 1 > mbuff_size(b)) {
62 if (mbuff_size(b) >= MAX_SIZET / 2)
63 lexerror(ls, "lexical element too long", 0);
64 newsize = mbuff_size(b) * 2;
65 mbuff_resize(b, newsize);
67 b->buffer[mbuff_len(b)++] = (char)c;
73 for (i = 0; i < NUM_RESERVED; i++) {
74 ktap_string *ts = ktapc_ts_new(ktap_tokens[i]);
75 ts->tsv.extra = (u8)(i+1); /* reserved word */
79 const char *lex_token2str(ktap_lexstate *ls, int token)
81 if (token < FIRST_RESERVED) {
82 ktap_assert(token == (unsigned char)token);
83 return (isprint(token)) ? ktapc_sprintf(KTAP_QL("%c"), token) :
84 ktapc_sprintf("char(%d)", token);
86 const char *s = ktap_tokens[token - FIRST_RESERVED];
88 return ktapc_sprintf(KTAP_QS, s);
94 static const char *txtToken(ktap_lexstate *ls, int token)
101 return ktapc_sprintf(KTAP_QS, mbuff(ls->buff));
103 return lex_token2str(ls, token);
107 static void lexerror(ktap_lexstate *ls, const char *msg, int token)
109 char buff[KTAP_IDSIZE];
112 ktapc_chunkid(buff, getstr(ls->source), KTAP_IDSIZE);
113 newmsg = ktapc_sprintf("%s:%d: %s", buff, ls->linenumber, msg);
115 newmsg = ktapc_sprintf("%s near %s", newmsg, txtToken(ls, token));
116 printf("lexerror: %s\n", newmsg);
120 void lex_syntaxerror(ktap_lexstate *ls, const char *msg) /* report the error message msg at the lexer's current token */
122 lexerror(ls, msg, ls->t.token); /* forward msg to lexerror together with the current token, ls->t.token */
126 * creates a new string and anchors it in function's table so that
127 * it will not be collected until the end of the function's compilation
128 * (by that time it should be anchored in function's prototype)
130 ktap_string *lex_newstring(ktap_lexstate *ls, const char *str, size_t l)
132 const ktap_value *o; /* entry for `str' */
133 ktap_value val; /* entry for `str' */
135 ktap_string *ts = ktapc_ts_newlstr(str, l); /* create new string */
136 set_string(&tsv, ts);
137 o = ktapc_table_get(ls->fs->h, &tsv);
138 if (is_nil(o)) { /* not in use yet? (see 'addK') */
139 /* boolean value does not need GC barrier;
140 table has no metatable, so it does not need to invalidate cache */
141 set_boolean(&val, 1); /* t[string] = true */
142 ktapc_table_setvalue(ls->fs->h, &tsv, &val);
148 * increments the line number and skips the newline sequence (any of
149 * \n, \r, \n\r, or \r\n)
151 static void inclinenumber(ktap_lexstate *ls)
153 int old = ls->current; /* the newline character that starts the sequence */
154 ktap_assert(currIsNewline(ls)); /* caller must be positioned on '\n' or '\r' */
155 next(ls); /* skip `\n' or `\r' */
156 if (currIsNewline(ls) && ls->current != old) /* a second, different newline character follows */
157 next(ls); /* skip `\n\r' or `\r\n' */
158 if (++ls->linenumber >= MAX_INT) /* count the line; reaching MAX_INT is reported as an error */
159 lex_syntaxerror(ls, "chunk has too many lines");
162 void lex_setinput(ktap_lexstate *ls, char *ptr, ktap_string *source, int firstchar)
165 ls->current = firstchar;
166 ls->lookahead.token = TK_EOS; /* no look-ahead token */
172 ls->envn = ktapc_ts_new(KTAP_ENV); /* create env name */
173 mbuff_resize(ls->buff, KTAP_MINBUFFER); /* initialize buffer */
177 * =======================================================
179 * =======================================================
181 static int check_next(ktap_lexstate *ls, const char *set)
183 if (ls->current == '\0' || !strchr(set, ls->current))
190 * change all characters 'from' in buffer to 'to'
192 static void buffreplace(ktap_lexstate *ls, char from, char to)
194 size_t n = mbuff_len(ls->buff);
195 char *p = mbuff(ls->buff);
197 if (p[n] == from) p[n] = to;
200 #if !defined(getlocaledecpoint)
201 #define getlocaledecpoint() (localeconv()->decimal_point[0])
204 #define mbuff2d(b,e) ktapc_str2d(mbuff(b), mbuff_len(b) - 1, e)
207 * in case of format error, try to change decimal point separator to
208 * the one defined in the current locale and check again
210 static void trydecpoint(ktap_lexstate *ls, ktap_seminfo *seminfo)
212 char old = ls->decpoint; /* separator used in the buffer so far */
213 ls->decpoint = getlocaledecpoint(); /* adopt the separator reported by getlocaledecpoint() */
214 buffreplace(ls, old, ls->decpoint); /* try new decimal separator */
215 if (!mbuff2d(ls->buff, &seminfo->r)) { /* retry the conversion into seminfo->r */
216 /* format error with correct decimal point: no more options */
217 buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */
218 lexerror(ls, "malformed number", TK_NUMBER);
223 * this function is quite liberal in what it accepts, as 'ktapc_str2d'
224 * will reject ill-formed numerals.
226 static void read_numeral(ktap_lexstate *ls, ktap_seminfo *seminfo)
228 const char *expo = "Ee";
229 int first = ls->current;
231 ktap_assert(isdigit(ls->current));
233 if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */
236 if (check_next(ls, expo)) /* exponent part? */
237 check_next(ls, "+-"); /* optional exponent sign */
238 if (isxdigit(ls->current) || ls->current == '.')
244 buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */
245 if (!mbuff2d(ls->buff, &seminfo->r)) /* format error? */
246 trydecpoint(ls, seminfo); /* try to update decimal point separator */
250 * skip a sequence '[=*[' or ']=*]' and return its number of '='s or
251 * -1 if sequence is malformed
253 static int skip_sep(ktap_lexstate *ls)
258 ktap_assert(s == '[' || s == ']');
260 while (ls->current == '=') {
264 return (ls->current == s) ? count : (-count) - 1;
267 static void read_long_string(ktap_lexstate *ls, ktap_seminfo *seminfo, int sep)
269 save_and_next(ls); /* skip 2nd `[' */
270 if (currIsNewline(ls)) /* string starts with a newline? */
271 inclinenumber(ls); /* skip it */
273 switch (ls->current) {
275 lexerror(ls, (seminfo) ? "unfinished long string" :
276 "unfinished long comment", TK_EOS);
277 break; /* to avoid warnings */
279 if (skip_sep(ls) == sep) {
280 save_and_next(ls); /* skip 2nd `]' */
289 /* avoid wasting space */
291 mbuff_reset(ls->buff);
305 seminfo->ts = lex_newstring(ls, mbuff(ls->buff) + (2 + sep),
306 mbuff_len(ls->buff) - 2*(2 + sep));
309 static void escerror(ktap_lexstate *ls, int *c, int n, const char *msg)
312 mbuff_reset(ls->buff); /* prepare error message */
314 for (i = 0; i < n && c[i] != EOZ; i++)
316 lexerror(ls, msg, TK_STRING);
319 static int readhexaesc(ktap_lexstate *ls)
321 int c[3], i; /* keep input for error message */
322 int r = 0; /* result accumulator */
323 c[0] = 'x'; /* for error message */
324 for (i = 1; i < 3; i++) { /* read two hexa digits */
327 escerror(ls, c, i + 1, "hexadecimal digit expected");
328 r = (r << 4) + ktapc_hexavalue(c[i]);
333 static int readdecesc(ktap_lexstate *ls)
336 int r = 0; /* result accumulator */
337 for (i = 0; i < 3 && isdigit(ls->current); i++) { /* read up to 3 digits */
339 r = 10*r + c[i] - '0';
343 escerror(ls, c, i, "decimal escape too large");
347 static void read_string(ktap_lexstate *ls, int del, ktap_seminfo *seminfo)
349 save_and_next(ls); /* keep delimiter (for error messages) */
350 while (ls->current != del) {
351 switch (ls->current) {
353 lexerror(ls, "unfinished string", TK_EOS);
354 break; /* to avoid warnings */
357 lexerror(ls, "unfinished string", TK_STRING);
358 break; /* to avoid warnings */
359 case '\\': { /* escape sequences */
360 int c; /* final character to be saved */
361 next(ls); /* do not save the `\' */
362 switch (ls->current) {
363 case 'a': c = '\a'; goto read_save;
364 case 'b': c = '\b'; goto read_save;
365 case 'f': c = '\f'; goto read_save;
366 case 'n': c = '\n'; goto read_save;
367 case 'r': c = '\r'; goto read_save;
368 case 't': c = '\t'; goto read_save;
369 case 'v': c = '\v'; goto read_save;
370 case 'x': c = readhexaesc(ls); goto read_save;
371 case '\n': case '\r':
372 inclinenumber(ls); c = '\n'; goto only_save;
373 case '\\': case '\"': case '\'':
374 c = ls->current; goto read_save;
375 case EOZ: goto no_save; /* will raise an error next loop */
376 case 'z': { /* zap following span of spaces */
377 next(ls); /* skip the 'z' */
378 while (isspace(ls->current)) {
379 if (currIsNewline(ls))
387 if (!isdigit(ls->current))
388 escerror(ls, &ls->current, 1, "invalid escape sequence");
389 /* digital escape \ddd */
395 next(ls); /* read next character */
397 save(ls, c); /* save 'c' */
405 save_and_next(ls); /* skip delimiter */
406 seminfo->ts = lex_newstring(ls, mbuff(ls->buff) + 1, mbuff_len(ls->buff) - 2);
409 static int llex(ktap_lexstate *ls, ktap_seminfo *seminfo)
411 mbuff_reset(ls->buff);
414 switch (ls->current) {
415 case '\n': case '\r': { /* line breaks */
419 case ' ': case '\f': case '\t': case '\v': { /* spaces */
424 while (!currIsNewline(ls) && ls->current != EOZ)
425 next(ls); /* skip until end of line (or end of file) */
429 case '-': { /* '-' or '--' (comment) */
431 if (ls->current != '-')
433 /* else is a comment */
435 if (ls->current == '[') { /* long comment? */
436 int sep = skip_sep(ls);
437 mbuff_reset(ls->buff); /* `skip_sep' may dirty the buffer */
439 read_long_string(ls, NULL, sep); /* skip long comment */
440 mbuff_reset(ls->buff); /* previous call may dirty the buff. */
444 /* else short comment */
445 while (!currIsNewline(ls) && ls->current != EOZ)
446 next(ls); /* skip until end of line (or end of file) */
450 case '[': { /* long string or simply '[' */
451 int sep = skip_sep(ls);
453 read_long_string(ls, seminfo, sep);
459 lexerror(ls, "invalid long string delimiter", TK_STRING);
463 if (ls->current != '=')
472 if (ls->current != '=')
481 if (ls->current == '=')
483 else if (ls->current == '<') {
485 if (ls->current == '<') {
487 return TK_AGGR_ASSIGN;
495 if (ls->current != '=')
504 if (ls->current != '=')
513 if (ls->current != ':')
520 case '"': case '\'': { /* short literal strings */
521 read_string(ls, ls->current, seminfo);
524 case '`': { /* short literal kernel symbol */
525 read_string(ls, ls->current, seminfo);
528 case '.': { /* '.', '..', '...', or number */
530 if (check_next(ls, ".")) {
531 if (check_next(ls, "."))
532 return TK_DOTS; /* '...' */
534 return TK_CONCAT; /* '..' */
536 else if (!isdigit(ls->current))
538 /* else go through */
540 case '0': case '1': case '2': case '3': case '4':
541 case '5': case '6': case '7': case '8': case '9': {
542 read_numeral(ls, seminfo);
550 if (ls->current != '&')
559 if (ls->current != '|')
567 if (islalpha(ls->current)) {
568 /* identifier or reserved word? */
572 } while (islalnum(ls->current));
573 ts = lex_newstring(ls, mbuff(ls->buff),
574 mbuff_len(ls->buff));
576 if (isreserved(ts)) /* reserved word? */
577 return ts->tsv.extra - 1 +
582 } else { /* single-char tokens (+ - / ...) */
592 void lex_read_string_until(ktap_lexstate *ls, int c)
597 mbuff_reset(ls->buff);
599 while (ls->current == ' ')
604 } while (ls->current != c && ls->current != EOZ);
606 if (ls->current != c) {
607 sprintf(errmsg, "expect %c", c);
608 lexerror(ls, errmsg, 0);
611 ts = lex_newstring(ls, mbuff(ls->buff), mbuff_len(ls->buff));
612 ls->t.seminfo.ts = ts;
613 ls->t.token = TK_STRING;
616 void lex_next(ktap_lexstate *ls)
618 ls->lastline = ls->linenumber;
619 if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */
620 ls->t = ls->lookahead; /* use this one */
621 ls->lookahead.token = TK_EOS; /* and discharge it */
623 ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
626 int lex_lookahead(ktap_lexstate *ls) /* scan one token ahead into ls->lookahead and return its token code; ls->t is not assigned here */
628 ktap_assert(ls->lookahead.token == TK_EOS); /* the look-ahead slot must be empty (TK_EOS marks "no look-ahead token") */
629 ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); /* read the next token into the look-ahead slot */
630 return ls->lookahead.token;