16 # Action to buffer a character: store the current character (fc) at array[pos] and advance pos.
17 action bufChar { array[pos] = fc; pos = pos + 1; }
19 # Action to clear the buffer by resetting the write position pos to zero.
20 action clearBuf { pos = 0; }
22 # Functions to dump tokens as they are matched.
78 # Alphanumeric characters or underscore.
81 # Alpha characters or underscore.
84 # Symbols: any punctuation except underscore and the two quote characters.
85 # Upon entering clear the buffer, buffer a character on all transitions, and dump the symbol upon leaving.
86 symbol = ( punct - [_'"] ) >clearBuf $bufChar %symbol;
88 # Identifier: one alphau character followed by zero or more alnumu characters.
89 # Upon entering clear the buffer, buffer a character on all transitions, and dump the identifier upon leaving.
90 ident = (alphau . alnumu*) >clearBuf $bufChar %ident;
92 # Match single characters inside literal strings. Or match
93 # an escape sequence. Buffers the character matched.
95 ( extend - ['\\] ) @bufChar |
96 ( '\\' . extend @bufChar );
98 ( extend - ["\\] ) @bufChar |
99 ( '\\' . extend @bufChar );
101 # Single quote and double quote literals. On the opening quote clear
102 # the buffer. Upon leaving dump the literal.
103 sliteral = ('\'' @clearBuf . sliteralChar* . '\'' ) %literal;
104 dliteral = ('"' @clearBuf . dliteralChar* . '"' ) %literal;
105 literal = sliteral | dliteral;
107 # Whitespace is standard ws, newlines and control codes (any character outside 33 .. 126).
108 whitespace = any - 33 .. 126;
110 # Describe both c style comments (delimited here by '/!' and '!/') and c++
111 # style comments. The priority bump (@1) on the terminator of the comments
112 # brings us out of the extend* ($0) which matches everything.
113 ccComment = '//' . extend* $0 . '\n' @1;
114 cComment = '/!' . extend* $0 . '!/' @1;
116 # Match an integer. We don't bother clearing the buffer or filling it here.
117 # The float machine overlaps with integer and it will do it.
118 integer = digit+ %integer;
120 # Match a float. Upon entering the machine clear the buffer, buffer
121 # characters on every transition and dump the float upon leaving.
122 float = ( digit+ . '.' . digit+ ) >clearBuf $bufChar %float;
124 # Match a hex. Upon entering the hex part (the xdigit+ after '0x'), clear the buffer,
125 # buffer characters on every transition and dump the hex on leaving transitions.
126 hex = '0x' . xdigit+ >clearBuf $bufChar %hex;
128 # Or together all the language elements.
139 # Star the language elements. It is critical in this type of application
140 # that we decrease the priority of out transitions (%0 versus $1 inside fin) before doing so. This
141 # is so that when we see 'aa' we stay in the fin machine to match an ident
142 # of length two and not wrap around to the front to match two idents of
144 clang_main = ( fin $1 %0 )*;
146 # This machine matches everything, incrementing line on each newline character.
147 newline = ( any | '\n' @{ line = line + 1; } )*;
149 # The final fsm is the lexer intersected with the newline machine which
150 # will count lines for us. Since the newline machine accepts everything,
151 # the set of strings accepted is governed by the clang_main machine, onto which
152 # the newline machine overlays line counting.
153 main := clang_main & newline;
156 "999 0xaAFF99 99.99 /!\n!/ 'lksdj' //\n\"\n\nliteral\n\n\n\"0x00aba foobardd.ddsf 0x0.9\n"
157 "wordwithnum00asdf\n000wordfollowsnum,makes new symbol\n\nfinishing early /! unfinished ...\n"
178 ident(1,17): wordwithnum00asdf
180 ident(2,14): wordfollowsnum
185 ident(4,9): finishing