15 # Action to buffer a character: stores the current character (fc) at array[pos], then advances pos. NOTE(review): no bounds check on pos is visible here - confirm array is large enough.
16 action bufChar { array[pos] = fc; pos = pos + 1; }
18 # Action to clear the buffer: resets the write position pos to 0 (the array contents are not touched).
19 action clearBuf { pos = 0; }
21 # Functions to dump tokens as they are matched.
77 # Alphanumeric characters or underscore.
80 # Alpha characters or underscore.
83 # Symbols: any punctuation character except underscore and the two quote characters. Upon entering,
84 # clear the buffer. On all transitions, buffer a character. Upon leaving, dump the symbol.
85 symbol = ( punct - [_'"] ) >clearBuf $bufChar %symbol;
87 # Identifier: one alphau character followed by any number of alnumu characters. Upon entering,
88 # clear the buffer. On all transitions, buffer a character. Upon leaving, dump the identifier.
89 ident = (alphau . alnumu*) >clearBuf $bufChar %ident;
91 # Match single characters inside literal strings. Or match
92 # an escape sequence. Buffers the character matched.
94 ( extend - ['\\] ) @bufChar |
95 ( '\\' . extend @bufChar );
97 ( extend - ["\\] ) @bufChar |
98 ( '\\' . extend @bufChar );
100 # Single-quote and double-quote literals. The opening quote clears the buffer; the quote
101 # characters themselves are not buffered by these definitions. Upon leaving, dump the literal.
102 sliteral = ('\'' @clearBuf . sliteralChar* . '\'' ) %literal;
103 dliteral = ('"' @clearBuf . dliteralChar* . '"' ) %literal;
104 literal = sliteral | dliteral;
106 # Whitespace is every character outside the range 33..126: standard ws, newlines and control codes.
107 whitespace = any - 33 .. 126;
109 # Describe both c style comments and c++ style comments. Note that the c style comment
110 # here is delimited by '/!' and '!/'. The priority bump on the terminator of the comments
111 # brings us out of the extend* which matches everything.
112 ccComment = '//' . extend* $0 . '\n' @1;
113 cComment = '/!' . extend* $0 . '!/' @1;
115 # Match an integer. We don't bother clearing the buf or filling it here.
116 # The float machine overlaps with integer and it will do the clearing and buffering.
117 integer = digit+ %integer;
119 # Match a float: digits, a '.', then digits. Upon entering the machine, clear the buf, buffer
120 # characters on every transition, and dump the float upon leaving.
121 float = ( digit+ . '.' . digit+ ) >clearBuf $bufChar %float;
123 # Match a hex. Upon entering the hex-digit part (after the '0x' prefix), clear the buf, buffer
124 # characters on every transition, and dump the hex on leaving transitions.
125 hex = '0x' . xdigit+ >clearBuf $bufChar %hex;
127 # Or together all the language elements.
138 # Star the language elements. It is critical in this type of application
139 # that we decrease the priority of out transitions before doing so. This
140 # is so that when we see 'aa' we stay in the fin machine to match an ident
141 # of length two and not wrap around to the front to match two idents of length one.
143 clang_main = ( fin $1 %0 )*;
145 # This machine matches everything, incrementing the line counter on each '\n'.
146 newline = ( any | '\n' @{ line = line + 1; } )*;
148 # The final fsm is the lexer intersected with the newline machine, which
149 # will count lines for us. Since the newline machine accepts everything,
150 # the set of accepted strings is governed by the clang_main machine, onto which
151 # the newline machine overlays line counting.
152 main := clang_main & newline;
155 "999 0xaAFF99 99.99 /!\n!/ 'lksdj' //\n\"\n\nliteral\n\n\n\"0x00aba foobardd.ddsf 0x0.9\n"
156 "wordwithnum00asdf\n000wordfollowsnum,makes new symbol\n\nfinishing early /! unfinished ...\n"
177 ident(1,17): wordwithnum00asdf
179 ident(2,14): wordfollowsnum
184 ident(4,9): finishing