4 * High efficiency lexical state parser
6 * Copyright (C)2011-2014 Andy Green <andy@warmcat.com>
10 * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h
12 * Run it twice to test parsing on the generated table on stderr
19 /* set of parsable strings -- ALL LOWER CASE */
29 "sec-websocket-draft:",
32 "sec-websocket-extensions:",
33 "sec-websocket-key1:",
34 "sec-websocket-key2:",
35 "sec-websocket-protocol:",
37 "sec-websocket-accept:",
38 "sec-websocket-nonce:",
43 "access-control-request-headers:",
58 "sec-websocket-version:",
59 "sec-websocket-origin:",
69 "access-control-allow-origin:",
72 "content-disposition:",
82 "if-unmodified-since:",
87 "proxy-authenticate:",
88 "proxy-authorization:",
93 "strict-transport-security:",
100 "", /* not matchable */
105 * b7 = 0 = 1-byte seq
108 * 0x00 - 0x07, then terminal as given in 2nd byte
110 * no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
112 * no match: die, match go fwd 1 byte
115 unsigned char lextable[] = {
116 #include "lextable.h"
130 struct state state[1000];
133 #define FAIL_CHAR 0x08
135 int lextable_decode(int pos, char c)
139 if (lextable[pos] & (1 << 7)) { /* 1-byte, fail on mismatch */
140 if ((lextable[pos] & 0x7f) != c)
144 if (lextable[pos] == FAIL_CHAR)
147 } else { /* b7 = 0, end or 3-byte */
148 if (lextable[pos] < FAIL_CHAR) /* terminal marker */
151 if (lextable[pos] == c) /* goto */
152 return pos + (lextable[pos + 1]) +
153 (lextable[pos + 2] << 8);
173 while (n < sizeof(set) / sizeof(set[0])) {
179 if (set[n][0] == '\0') {
187 for (y = 0; y < state[walk].count; y++)
188 if (state[walk].c[y] == set[n][m]) {
189 /* exists -- go forward */
190 walk = state[walk].state[y];
198 /* something we didn't see before */
200 state[walk].c[state[walk].count] = set[n][m];
202 state[walk].state[state[walk].count] = next;
209 state[walk].c[0] = n++;
210 state[walk].state[0] = 0; /* terminal marker */
211 state[walk].count = 1;
215 for (n = 0; n < next; n++) {
216 state[n].bytepos = walk;
217 walk += (2 * state[n].count);
220 /* compute everyone's position first */
224 for (n = 0; n < next; n++) {
226 state[n].real_pos = pos;
228 for (m = 0; m < state[n].count; m++) {
230 if (state[n].state[m] == 0)
231 pos += 2; /* terminal marker */
232 else { /* c is a character */
233 if ((state[state[n].state[m]].bytepos -
238 if (m == state[n].count - 1)
248 for (n = 0; n < next; n++) {
249 for (m = 0; m < state[n].count; m++) {
252 fprintf(stdout, "/* pos %04x: %3d */ ",
253 state[n].real_pos, n);
255 fprintf(stdout, " ");
258 saw = state[n].state[m];
260 if (saw == 0) { // c is a terminal then
263 fprintf(stderr, "terminal too big\n");
267 fprintf(stdout, " 0x%02X, 0x%02X "
269 "/* - terminal marker %2d - */,\n",
270 y >> 8, y & 0xff, y & 0x7f);
276 /* c is a character */
279 if (prev < 32 || prev > 126)
283 if ((state[saw].bytepos - walk) == 2) {
284 fprintf(stdout, " 0x%02X /* '%c' -> */,\n",
291 j = state[saw].real_pos - pos;
295 "Jump > 64K bytes ahead (%d to %d)\n",
296 state[n].real_pos, state[saw].real_pos);
299 fprintf(stdout, " 0x%02X /* '%c' */, 0x%02X, 0x%02X "
300 "/* (to 0x%04X state %3d) */,\n",
303 state[saw].real_pos, saw);
306 if (m == state[n].count - 1) {
308 " 0x%02X, /* fail */\n",
317 fprintf(stdout, "/* total size %d bytes */\n", pos);
320 * Try to parse every legal input string
323 for (n = 0; n < sizeof(set) / sizeof(set[0]); n++) {
328 if (set[n][0] == '\0')
331 fprintf(stderr, " trying '%s'\n", set[n]);
334 walk = lextable_decode(walk, set[n][m]);
336 fprintf(stderr, "failed\n");
340 if (lextable[walk] < FAIL_CHAR) {
341 y = (lextable[walk] << 8) + lextable[walk + 1];
348 fprintf(stderr, "decode failed %d\n", y);
353 fprintf(stderr, "All decode OK\n");