1 // Copyright (C) 2002 Andrew Tridgell
2 // Copyright (C) 2009-2018 Joel Rosdahl
4 // This program is free software; you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation; either version 3 of the License, or (at your option)
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 // You should have received a copy of the GNU General Public License along with
15 // this program; if not, write to the Free Software Foundation, Inc., 51
16 // Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 // The idea is that changes that don't affect the resulting C code should not
21 // change the hash. This is achieved by folding white-space and other
22 // non-semantic fluff in the input into a single unified format.
24 // This unifier was designed to match the output of the unifier in compilercache,
25 // which is flex based. The major difference is that this unifier is much
26 // faster (about 2x) and more forgiving of syntactic errors. Continuing on
27 // syntactic errors is important to cope with C/C++ extensions in the local
28 // compiler (for example, inline assembly systems).
// File-scope flag, initially true; unify_hash() overwrites it with its `debug`
// argument before calling unify().
// NOTE(review): presumably gates the printf() echo of the buffered text in
// pushchar() -- the guarding condition is not visible here; confirm.
34 static bool print_unified = true;
36 static const char *const s_tokens[] = {
37 "...", ">>=", "<<=", "+=", "-=", "*=", "/=", "%=", "&=", "^=",
38 "|=", ">>", "<<", "++", "--", "->", "&&", "||", "<=", ">=",
39 "==", "!=", ";", "{", "<%", "}", "%>", ",", ":", "=",
40 "(", ")", "[", "<:", "]", ":>", ".", "&", "!", "~",
41 "-", "+", "*", "/", "%", "<", ">", "^", "|", "?",
56 unsigned char num_toks;
60 // Build up the table used by the unifier.
70 memset(tokens, 0, sizeof(tokens));
71 for (unsigned char c = 0; c < 128; c++) {
72 if (isalpha(c) || c == '_') {
73 tokens[c].type |= C_ALPHA;
76 tokens[c].type |= C_DIGIT;
79 tokens[c].type |= C_SPACE;
82 tokens[c].type |= C_HEX;
85 tokens['\''].type |= C_QUOTE;
86 tokens['"'].type |= C_QUOTE;
87 tokens['l'].type |= C_FLOAT;
88 tokens['L'].type |= C_FLOAT;
89 tokens['f'].type |= C_FLOAT;
90 tokens['F'].type |= C_FLOAT;
91 tokens['U'].type |= C_FLOAT;
92 tokens['u'].type |= C_FLOAT;
94 tokens['-'].type |= C_SIGN;
95 tokens['+'].type |= C_SIGN;
97 for (int i = 0; s_tokens[i]; i++) {
98 unsigned char c = s_tokens[i][0];
99 tokens[c].type |= C_TOKEN;
100 tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
101 tokens[c].num_toks++;
105 // Buffer up characters before hashing them.
107 pushchar(struct hash *hash, unsigned char c)
109 static unsigned char buf[64];
114 hash_buffer(hash, (char *)buf, len);
116 printf("%.*s", (int) len, buf);
120 hash_buffer(hash, NULL, 0);
126 hash_buffer(hash, (char *)buf, len);
128 printf("%.*s", (int) len, buf);
134 // Hash some C/C++ code after unifying.
136 unify(struct hash *hash, unsigned char *p, size_t size)
140 for (size_t ofs = 0; ofs < size;) {
142 if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
145 } while (ofs < size && p[ofs] != '\n');
149 pushchar(hash, p[ofs]);
151 } while (ofs < size && p[ofs] != '\n');
152 pushchar(hash, '\n');
158 if (tokens[p[ofs]].type & C_ALPHA) {
160 pushchar(hash, p[ofs]);
162 } while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
163 pushchar(hash, '\n');
167 if (tokens[p[ofs]].type & C_DIGIT) {
169 pushchar(hash, p[ofs]);
171 } while (ofs < size &&
172 ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
173 if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
175 pushchar(hash, p[ofs]);
177 } while (ofs < size && (tokens[p[ofs]].type & C_HEX));
179 if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
180 pushchar(hash, p[ofs]);
182 while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
183 pushchar(hash, p[ofs]);
187 while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
188 pushchar(hash, p[ofs]);
191 pushchar(hash, '\n');
195 if (tokens[p[ofs]].type & C_SPACE) {
198 } while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
202 if (tokens[p[ofs]].type & C_QUOTE) {
203 unsigned char q = p[ofs];
204 pushchar(hash, p[ofs]);
207 while (ofs < size-1 && p[ofs] == '\\') {
208 pushchar(hash, p[ofs]);
209 pushchar(hash, p[ofs+1]);
212 pushchar(hash, p[ofs]);
213 } while (ofs < size && p[ofs] != q);
214 pushchar(hash, '\n');
219 if (tokens[p[ofs]].type & C_TOKEN) {
220 unsigned char q = p[ofs];
222 for (i = 0; i < tokens[q].num_toks; i++) {
223 const unsigned char *s = (const unsigned char *)tokens[q].toks[i];
224 int len = strlen((const char *)s);
225 if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
227 for (j = 0; s[j]; j++) {
228 pushchar(hash, s[j]);
231 pushchar(hash, '\n');
235 if (i < tokens[q].num_toks) {
240 pushchar(hash, p[ofs]);
241 pushchar(hash, '\n');
248 // Hash a file that consists of preprocessor output, but remove any line number
249 // information from the hash.
251 unify_hash(struct hash *hash, const char *fname, bool debug)
255 if (!read_file(fname, 0, &data, &size)) {
256 stats_update(STATS_PREPROCESSOR);
259 print_unified = debug;
260 unify(hash, (unsigned char *)data, size);