Imported Upstream version 3.7
[platform/upstream/ccache.git] / src / unify.c
1 // Copyright (C) 2002 Andrew Tridgell
2 // Copyright (C) 2009-2019 Joel Rosdahl
3 //
4 // This program is free software; you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation; either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License along with
15 // this program; if not, write to the Free Software Foundation, Inc., 51
16 // Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 // C/C++ unifier
19 //
20 // The idea is that changes that don't affect the resulting C code should not
21 // change the hash. This is achieved by folding white-space and other
22 // non-semantic fluff in the input into a single unified format.
23 //
24 // This unifier was designed to match the output of the unifier in compilercache,
25 // which is flex based. The major difference is that this unifier is much
26 // faster (about 2x) and more forgiving of syntactic errors. Continuing on
27 // syntactic errors is important to cope with C/C++ extensions in the local
28 // compiler (for example, inline assembly systems).
29
30 #include "ccache.h"
31 #include "hash.h"
32 #include "unify.h"
33
34 static bool print_unified = true;
35
// Operator and punctuator spellings recognized by the unifier, terminated by
// a null entry. build_table() files each entry under its first character in
// the order listed here, and unify() takes the first entry that matches, so
// a longer spelling must stay ahead of any shorter spelling it begins with
// (e.g. ">>=" before ">>" before ">").
static const char *const s_tokens[] = {
        // Three-character tokens.
        "...", ">>=", "<<=",
        // Compound assignment.
        "+=", "-=", "*=", "/=", "%=", "&=", "^=", "|=",
        // Shifts, increment/decrement and member access through a pointer.
        ">>", "<<",
        "++", "--", "->",
        // Logical and comparison operators.
        "&&", "||",
        "<=", ">=", "==", "!=",
        // Statement and block punctuation ("<%" and "%>" are the digraph
        // spellings of the braces).
        ";",
        "{", "<%", "}", "%>",
        ",", ":", "=",
        // Parentheses and brackets ("<:" and ":>" are the digraph spellings
        // of the brackets).
        "(", ")",
        "[", "<:", "]", ":>",
        // Remaining single-character operators.
        ".",
        "&", "!", "~",
        "-", "+", "*", "/", "%",
        "<", ">",
        "^", "|", "?",
        // End-of-list marker.
        0
};
44
// Character-class bit flags stored in tokens[].type. A character may carry
// several of them at once.
#define C_ALPHA (1 << 0) // letter or underscore
#define C_SPACE (1 << 1) // whitespace
#define C_TOKEN (1 << 2) // first character of at least one s_tokens entry
#define C_QUOTE (1 << 3) // single or double quote
#define C_DIGIT (1 << 4) // decimal digit
#define C_HEX   (1 << 5) // hexadecimal digit
#define C_FLOAT (1 << 6) // one of l, L, f, F, u, U (consumed after a number)
#define C_SIGN  (1 << 7) // '+' or '-' (accepted after an exponent marker)
53
// Per-character lookup table, indexed by byte value and filled in by
// build_table().
static struct {
        // s_tokens entries whose first character is this byte, in s_tokens
        // order. Only the first num_toks slots are meaningful.
        const char *toks[7];
        // Bitwise OR of the C_* character-class flags for this byte.
        unsigned char type;
        // Number of valid entries in toks.
        unsigned char num_toks;
} tokens[256];
59
60 // Build up the table used by the unifier.
61 static void
62 build_table(void)
63 {
64         static bool done;
65         if (done) {
66                 return;
67         }
68         done = true;
69
70         memset(tokens, 0, sizeof(tokens));
71         for (unsigned char c = 0; c < 128; c++) {
72                 if (isalpha(c) || c == '_') {
73                         tokens[c].type |= C_ALPHA;
74                 }
75                 if (isdigit(c)) {
76                         tokens[c].type |= C_DIGIT;
77                 }
78                 if (isspace(c)) {
79                         tokens[c].type |= C_SPACE;
80                 }
81                 if (isxdigit(c)) {
82                         tokens[c].type |= C_HEX;
83                 }
84         }
85         tokens['\''].type |= C_QUOTE;
86         tokens['"'].type |= C_QUOTE;
87         tokens['l'].type |= C_FLOAT;
88         tokens['L'].type |= C_FLOAT;
89         tokens['f'].type |= C_FLOAT;
90         tokens['F'].type |= C_FLOAT;
91         tokens['U'].type |= C_FLOAT;
92         tokens['u'].type |= C_FLOAT;
93
94         tokens['-'].type |= C_SIGN;
95         tokens['+'].type |= C_SIGN;
96
97         for (int i = 0; s_tokens[i]; i++) {
98                 unsigned char c = s_tokens[i][0];
99                 tokens[c].type |= C_TOKEN;
100                 tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
101                 tokens[c].num_toks++;
102         }
103 }
104
105 // Buffer up characters before hashing them.
106 static void
107 pushchar(struct hash *hash, unsigned char c)
108 {
109         static unsigned char buf[64];
110         static size_t len;
111
112         if (c == 0) {
113                 if (len > 0) {
114                         hash_buffer(hash, (char *)buf, len);
115                         if (print_unified) {
116                                 printf("%.*s", (int) len, buf);
117                         }
118                         len = 0;
119                 }
120                 return;
121         }
122
123         buf[len++] = c;
124         if (len == 64) {
125                 hash_buffer(hash, (char *)buf, len);
126                 if (print_unified) {
127                         printf("%.*s", (int) len, buf);
128                 }
129                 len = 0;
130         }
131 }
132
133 // Hash some C/C++ code after unifying.
134 static void
135 unify(struct hash *hash, unsigned char *p, size_t size)
136 {
137         build_table();
138
139         for (size_t ofs = 0; ofs < size;) {
140                 if (p[ofs] == '#') {
141                         if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
142                                 do {
143                                         ofs++;
144                                 } while (ofs < size && p[ofs] != '\n');
145                                 ofs++;
146                         } else {
147                                 do {
148                                         pushchar(hash, p[ofs]);
149                                         ofs++;
150                                 } while (ofs < size && p[ofs] != '\n');
151                                 pushchar(hash, '\n');
152                                 ofs++;
153                         }
154                         continue;
155                 }
156
157                 if (tokens[p[ofs]].type & C_ALPHA) {
158                         do {
159                                 pushchar(hash, p[ofs]);
160                                 ofs++;
161                         } while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
162                         pushchar(hash, '\n');
163                         continue;
164                 }
165
166                 if (tokens[p[ofs]].type & C_DIGIT) {
167                         do {
168                                 pushchar(hash, p[ofs]);
169                                 ofs++;
170                         } while (ofs < size &&
171                                  ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
172                         if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
173                                 do {
174                                         pushchar(hash, p[ofs]);
175                                         ofs++;
176                                 } while (ofs < size && (tokens[p[ofs]].type & C_HEX));
177                         }
178                         if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
179                                 pushchar(hash, p[ofs]);
180                                 ofs++;
181                                 while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
182                                         pushchar(hash, p[ofs]);
183                                         ofs++;
184                                 }
185                         }
186                         while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
187                                 pushchar(hash, p[ofs]);
188                                 ofs++;
189                         }
190                         pushchar(hash, '\n');
191                         continue;
192                 }
193
194                 if (tokens[p[ofs]].type & C_SPACE) {
195                         do {
196                                 ofs++;
197                         } while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
198                         continue;
199                 }
200
201                 if (tokens[p[ofs]].type & C_QUOTE) {
202                         unsigned char q = p[ofs];
203                         pushchar(hash, p[ofs]);
204                         do {
205                                 ofs++;
206                                 while (ofs < size-1 && p[ofs] == '\\') {
207                                         pushchar(hash, p[ofs]);
208                                         pushchar(hash, p[ofs+1]);
209                                         ofs += 2;
210                                 }
211                                 pushchar(hash, p[ofs]);
212                         } while (ofs < size && p[ofs] != q);
213                         pushchar(hash, '\n');
214                         ofs++;
215                         continue;
216                 }
217
218                 if (tokens[p[ofs]].type & C_TOKEN) {
219                         unsigned char q = p[ofs];
220                         int i;
221                         for (i = 0; i < tokens[q].num_toks; i++) {
222                                 const unsigned char *s = (const unsigned char *)tokens[q].toks[i];
223                                 int len = strlen((const char *)s);
224                                 if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
225                                         int j;
226                                         for (j = 0; s[j]; j++) {
227                                                 pushchar(hash, s[j]);
228                                                 ofs++;
229                                         }
230                                         pushchar(hash, '\n');
231                                         break;
232                                 }
233                         }
234                         if (i < tokens[q].num_toks) {
235                                 continue;
236                         }
237                 }
238
239                 pushchar(hash, p[ofs]);
240                 pushchar(hash, '\n');
241                 ofs++;
242         }
243         pushchar(hash, 0);
244 }
245
246
247 // Hash a file that consists of preprocessor output, but remove any line number
248 // information from the hash.
249 int
250 unify_hash(struct hash *hash, const char *fname, bool debug)
251 {
252         char *data;
253         size_t size;
254         if (!read_file(fname, 0, &data, &size)) {
255                 stats_update(STATS_PREPROCESSOR);
256                 return -1;
257         }
258         print_unified = debug;
259         unify(hash, (unsigned char *)data, size);
260         free(data);
261         return 0;
262 }