3 * A mini C-like language scanner.
7 #include <objc/Object.h>
10 #define IDENT_BUFLEN 256
12 @interface Clang : Object
15 /* State machine operation data. */
19 char identBuf[IDENT_BUFLEN+1];
25 - (void) executeWithData:(const char *)data len:(int)len;
33 # Function to buffer a character.
35 if ( identLen < IDENT_BUFLEN ) {
36 identBuf[identLen] = fc;
41 # Function to clear the buffer.
46 # Functions to dump tokens as they are matched.
48 identBuf[identLen] = 0;
49 printf("ident(%i): %s\n", curLine, identBuf);
52 identBuf[identLen] = 0;
53 printf("literal(%i): %s\n", curLine, identBuf);
56 identBuf[identLen] = 0;
57 printf("float(%i): %s\n", curLine, identBuf);
60 identBuf[identLen] = 0;
61 printf("int(%i): %s\n", curLine, identBuf);
64 identBuf[identLen] = 0;
65 printf("hex(%i): 0x%s\n", curLine, identBuf);
68 identBuf[identLen] = 0;
69 printf("symbol(%i): %s\n", curLine, identBuf);
72 # Alphanumeric characters or underscore.
75 # Alpha characters or underscore.
78 # Symbols. Upon entering clear the buffer. On all transitions
79 # buffer a character. Upon leaving dump the symbol.
80 symbol = ( punct - [_'"] ) >clearBuf $bufChar %symbol;
82 # Identifier. Upon entering clear the buffer. On all transitions
83 # buffer a character. Upon leaving, dump the identifier.
84 ident = (alphau . alnumu*) >clearBuf $bufChar %ident;
86 # Match single characters inside literal strings. Or match
87 # an escape sequence. Buffers the character matched.
89 ( extend - ['\\] ) @bufChar |
90 ( '\\' . extend @bufChar );
92 ( extend - ["\\] ) @bufChar |
93 ( '\\' . extend @bufChar );
95 # Single quote and double quote literals. At the start clear
96 # the buffer. Upon leaving dump the literal.
97 sliteral = ('\'' @clearBuf . sliteralChar* . '\'' ) %literal;
98 dliteral = ('"' @clearBuf . dliteralChar* . '"' ) %literal;
99 literal = sliteral | dliteral;
101 # Whitespace is standard ws, newlines and control codes.
102 whitespace = any - 0x21..0x7e;
104 # Describe both c style comments and c++ style comments. The
105 # priority bump on the terminator of the comments brings us
106 # out of the extend* which matches everything.
107 ccComment = '//' . extend* $0 . '\n' @1;
108 cComment = '/*' . extend* $0 . '*/' @1;
110 # Match an integer. We don't bother clearing the buf or filling it.
111 # The float machine overlaps with int and it will do it.
114 # Match a float. Upon entering the machine clear the buf, buffer
115 # characters on every trans and dump the float upon leaving.
116 float = ( digit+ . '.' . digit+ ) >clearBuf $bufChar %float;
118 # Match a hex. Upon entering the hex part, clear the buf, buffer characters
119 # on every trans and dump the hex on leaving transitions.
120 hex = '0x' . xdigit+ >clearBuf $bufChar %hex;
122 # Or together all the language elements.
133 # Star the language elements. It is critical in this type of application
134 # that we decrease the priority of out transitions before doing so. This
135 # is so that when we see 'aa' we stay in the fin machine to match an ident
136 # of length two and not wrap around to the front to match two idents of
138 clang_main = ( fin $1 %0 )*;
140 # This machine matches everything, taking note of newlines.
141 newline = ( any | '\n' @{ curLine += 1; } )*;
143 # The final fsm is the lexer intersected with the newline machine which
144 # will count lines for us. Since the newline machine accepts everything,
145 # the strings accepted are governed by the clang_main machine, onto which
146 # the newline machine overlays line counting.
147 main := clang_main & newline;
150 @implementation Clang
161 - (void) executeWithData:(const char *)data len:(int)len;
163 const char *p = data;
164 const char *pe = data + len;
165 const char *eof = pe;
172 if ( cs == Clang_error )
174 if ( cs >= Clang_first_final )
186 void test( char *buf )
188 int len = strlen(buf);
189 fsm = [[Clang alloc] init];
191 [fsm executeWithData:buf len:len];
192 if ( [fsm finish] > 0 )
201 "999 0xaAFF99 99.99 /*\n"
208 "\"0x00aba foobardd.ddsf 0x0.9\n" );
211 "wordwithnum00asdf\n"
212 "000wordfollowsnum,makes new symbol\n"
214 "finishing early /* unfinished ...\n" );
224 "#define _AAPL_RESIZE_H\n"
226 "#include <assert.h>\n"
228 "#ifdef AAPL_NAMESPACE\n"
231 "#define LIN_DEFAULT_STEP 256\n"
232 "#define EXPN_UP( existing, needed ) \\\n"
233 " need > eng ? (ned<<1) : eing\n"
238 "#ifdef AAPL_NAMESPACE\n"
239 "#endif /* _AAPL_RESIZE_H */\n" );
243 #ifdef _____OUTPUT_____
262 ident(1): wordwithnum00asdf
264 ident(2): wordfollowsnum
274 ident(8): _AAPL_RESIZE_H
284 ident(12): AAPL_NAMESPACE
292 ident(15): LIN_DEFAULT_STEP
320 ident(22): AAPL_NAMESPACE