2 * Copyright (C) 2002 Andrew Tridgell
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; either version 3 of the License, or (at your option)
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 51
16 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * The idea is that changes that don't affect the resulting C code should not
23 * change the hash. This is achieved by folding white-space and other
24 * non-semantic fluff in the input into a single unified format.
26 * This unifier was designed to match the output of the unifier in compilercache,
27 * which is flex based. The major difference is that this unifier is much
28 * faster (about 2x) and more forgiving of syntactic errors. Continuing on
29 * syntactic errors is important to cope with C/C++ extensions in the local
30 * compiler (for example, inline assembly systems).
/*
 * Spellings of the operators and punctuators that the unifier treats as
 * tokens, including the digraphs "<%", "%>", "<:" and ":>".  The
 * table-building loop walks this array from index 0 until it reaches a null
 * entry.
 * NOTE(review): ":" is listed ahead of ":>"; if the matcher stops at the first
 * spelling that compares equal, ":>" can never be selected -- confirm whether
 * that is intended.
 */
35 static const char *const s_tokens[] = {
36 "...", ">>=", "<<=", "+=", "-=", "*=", "/=", "%=", "&=", "^=",
37 "|=", ">>", "<<", "++", "--", "->", "&&", "||", "<=", ">=",
38 "==", "!=", ";", "{", "<%", "}", "%>", ",", ":", "=",
39 "(", ")", "[", "<:", "]", ":>", ".", "&", "!", "~",
40 "-", "+", "*", "/", "%", "<", ">", "^", "|", "?",
55 unsigned char num_toks;
59 /* build up the table used by the unifier */
/* Start from an all-zero table so every flag and count is initially clear. */
70 memset(tokens, 0, sizeof(tokens));
/*
 * Classify character codes 0..127 with the <ctype.h> predicates; '_' is
 * treated as alphabetic.  Codes of 128 and above are not visited by this loop.
 */
71 for (c = 0; c < 128; c++) {
72 if (isalpha(c) || c == '_') tokens[c].type |= C_ALPHA;
73 if (isdigit(c)) tokens[c].type |= C_DIGIT;
74 if (isspace(c)) tokens[c].type |= C_SPACE;
75 if (isxdigit(c)) tokens[c].type |= C_HEX;
/* Both the single and the double quote character are flagged C_QUOTE. */
77 tokens['\''].type |= C_QUOTE;
78 tokens['"'].type |= C_QUOTE;
/*
 * Letters flagged C_FLOAT: l/L, f/F and u/U.  unify() consumes a run of these
 * after a number, i.e. they act as literal suffixes.
 */
79 tokens['l'].type |= C_FLOAT;
80 tokens['L'].type |= C_FLOAT;
81 tokens['f'].type |= C_FLOAT;
82 tokens['F'].type |= C_FLOAT;
83 tokens['U'].type |= C_FLOAT;
84 tokens['u'].type |= C_FLOAT;
/* '-' and '+' are flagged C_SIGN; unify() accepts them after an exponent marker. */
86 tokens['-'].type |= C_SIGN;
87 tokens['+'].type |= C_SIGN;
/*
 * Register each spelling from s_tokens under character c: flag c as C_TOKEN
 * and store the spelling in slot num_toks of that character's toks list.
 * NOTE(review): c presumably holds the spelling's first character here --
 * confirm.
 */
89 for (i = 0; s_tokens[i]; i++) {
91 tokens[c].type |= C_TOKEN;
92 tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
97 /* buffer up characters before hashing them */
/*
 * hash is passed through unchanged to hash_buffer(); c is the character to
 * buffer.
 */
99 pushchar(struct mdfour *hash, unsigned char c)
/*
 * Staging area of 64 bytes.  Being static, its contents persist from one call
 * to the next.
 */
101 static unsigned char buf[64];
/* Hand the first len staged bytes to the hash. */
106 hash_buffer(hash, (char *)buf, len);
/*
 * Call hash_buffer() with no data.
 * NOTE(review): presumably part of a flush request -- confirm the guarding
 * condition and what hash_buffer() does with a NULL, zero-length input.
 */
109 hash_buffer(hash, NULL, 0);
/* Hand the first len staged bytes to the hash. */
115 hash_buffer(hash, (char *)buf, len);
120 /* hash some C/C++ code after unifying */
/*
 * hash: hash state that receives the unified text through pushchar().
 * p:    input bytes.
 * size: number of bytes at p.
 *
 * Each recognised identifier, number, quoted literal or operator is pushed
 * followed by a '\n', so each one ends its own line in the hashed stream.
 */
122 unify(struct mdfour *hash, unsigned char *p, size_t size)
/* The for statement has no increment clause, so ofs must be advanced inside the body. */
130 for (ofs = 0; ofs < size;) {
/*
 * When more than two bytes remain, p[ofs+1] is a space and p[ofs+2] is a
 * digit, the input is scanned up to the next '\n' with no pushchar() call
 * visible in that loop.
 * NOTE(review): presumably these are the "# <number>" line markers emitted by
 * the preprocessor, dropped so that line numbers do not affect the hash --
 * confirm.
 */
132 if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
135 } while (ofs < size && p[ofs] != '\n');
/*
 * This loop pushes characters until the next '\n' or the end of input is
 * reached, then pushes a '\n'.
 */
139 pushchar(hash, p[ofs]);
141 } while (ofs < size && p[ofs] != '\n');
142 pushchar(hash, '\n');
/*
 * Identifier-like run: starts at a C_ALPHA character and continues through
 * C_ALPHA or C_DIGIT characters; terminated with '\n'.
 */
148 if (tokens[p[ofs]].type & C_ALPHA) {
150 pushchar(hash, p[ofs]);
152 } while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
153 pushchar(hash, '\n');
/* Number: starts at a digit and continues through digits and '.' characters. */
157 if (tokens[p[ofs]].type & C_DIGIT) {
159 pushchar(hash, p[ofs]);
161 } while (ofs < size &&
162 ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
/* An 'x' or 'X' at this point is followed by a run of C_HEX characters. */
163 if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
165 pushchar(hash, p[ofs]);
167 } while (ofs < size && (tokens[p[ofs]].type & C_HEX));
/* Exponent marker 'e' or 'E', followed by any mix of digits and signs. */
169 if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
170 pushchar(hash, p[ofs]);
172 while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
173 pushchar(hash, p[ofs]);
/* Trailing C_FLOAT letters (l, L, f, F, u, U) are kept as part of the number. */
177 while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
178 pushchar(hash, p[ofs]);
181 pushchar(hash, '\n');
/*
 * Whitespace run.  No pushchar() call is visible in this branch, so it
 * appears to contribute nothing to the hash -- confirm.
 */
185 if (tokens[p[ofs]].type & C_SPACE) {
188 } while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
/*
 * Quoted literal: the opening quote is pushed, then characters are pushed
 * until p[ofs] equals q.  A backslash is pushed together with the character
 * that follows it (the size-1 bound keeps p[ofs+1] in range), presumably so
 * that an escaped quote does not end the literal.
 * NOTE(review): q presumably holds the opening quote character -- confirm.
 */
192 if (tokens[p[ofs]].type & C_QUOTE) {
194 pushchar(hash, p[ofs]);
197 while (ofs < size-1 && p[ofs] == '\\') {
198 pushchar(hash, p[ofs]);
199 pushchar(hash, p[ofs+1]);
202 pushchar(hash, p[ofs]);
203 } while (ofs < size && p[ofs] != q);
204 pushchar(hash, '\n');
/*
 * Operator or punctuator: try each spelling stored for q, in stored order,
 * checking that enough input remains before comparing with memcmp().
 * NOTE(review): q presumably holds p[ofs] here -- confirm.
 */
209 if (tokens[p[ofs]].type & C_TOKEN) {
211 for (i = 0; i < tokens[q].num_toks; i++) {
212 unsigned char *s = (unsigned char *)tokens[q].toks[i];
213 int len = strlen((char *)s);
214 if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
/* Match: push the spelling followed by '\n'. */
216 for (j = 0; s[j]; j++) {
217 pushchar(hash, s[j]);
220 pushchar(hash, '\n');
/*
 * i is below num_toks.
 * NOTE(review): presumably this means the search loop was left early on a
 * match -- confirm.
 */
224 if (i < tokens[q].num_toks) {
/* Push the single character at ofs followed by '\n'. */
229 pushchar(hash, p[ofs]);
230 pushchar(hash, '\n');
237 /* hash a file that consists of preprocessor output, but remove any line
238 number information from the hash
241 unify_hash(struct mdfour *hash, const char *fname)
246 if (!read_file(fname, 0, &data, &size)) {
247 stats_update(STATS_PREPROCESSOR);
250 unify(hash, (unsigned char *)data, size);