Imported Upstream version 3.5.1
[platform/upstream/ccache.git] / src / unify.c
1 // Copyright (C) 2002 Andrew Tridgell
2 // Copyright (C) 2009-2018 Joel Rosdahl
3 //
4 // This program is free software; you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation; either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License along with
15 // this program; if not, write to the Free Software Foundation, Inc., 51
16 // Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 // C/C++ unifier
19 //
20 // The idea is that changes that don't affect the resulting C code should not
21 // change the hash. This is achieved by folding white-space and other
22 // non-semantic fluff in the input into a single unified format.
23 //
24 // This unifier was designed to match the output of the unifier in compilercache,
25 // which is flex based. The major difference is that this unifier is much
26 // faster (about 2x) and more forgiving of syntactic errors. Continuing on
27 // syntactic errors is important to cope with C/C++ extensions in the local
28 // compiler (for example, inline assembly systems).
29
30 #include "ccache.h"
31 #include "hash.h"
32 #include "unify.h"
33
// When true, pushchar() echoes every chunk it hashes to stdout as well.
// unify_hash() overwrites this with its `debug` argument before each run.
static bool print_unified = true;

// Spellings of the operators and punctuators recognised by the unifier,
// terminated by a null pointer.
//
// Order is significant: build_table() files each spelling under its first
// byte in the order given here, and unify() takes the first spelling that
// matches, so a longer spelling has to come before any of its prefixes.
//
// NOTE(review): ":" is listed before ":>", so ":>" can never be matched as a
// single token. Left untouched because reordering would change the unified
// output.
static const char *const s_tokens[] = {
	// Three-character spellings.
	"...", ">>=", "<<=",
	// Compound assignment.
	"+=", "-=", "*=", "/=", "%=", "&=", "^=", "|=",
	// Other two-character operators.
	">>", "<<", "++", "--", "->", "&&", "||", "<=", ">=", "==", "!=",
	// Punctuation, including the digraphs <% %> <: :>.
	";", "{", "<%", "}", "%>", ",", ":", "=", "(", ")", "[", "<:", "]", ":>",
	// Single-character operators.
	".", "&", "!", "~", "-", "+", "*", "/", "%", "<", ">", "^", "|", "?",
	NULL
};

// Character class flags stored in tokens[].type; one bit each so that a byte
// can belong to several classes at once.
#define C_ALPHA 0x01 // letter or underscore
#define C_SPACE 0x02 // white-space
#define C_TOKEN 0x04 // first byte of at least one s_tokens entry
#define C_QUOTE 0x08 // ' or "
#define C_DIGIT 0x10 // decimal digit
#define C_HEX   0x20 // hexadecimal digit
#define C_FLOAT 0x40 // numeric suffix letter: l L f F u U
#define C_SIGN  0x80 // + or -

// Per-byte lookup table, indexed by the byte value and filled in by
// build_table().
static struct {
	unsigned char type;     // bitwise OR of the C_* flags above
	unsigned char num_toks; // number of used slots in toks
	const char *toks[7];    // s_tokens entries that start with this byte
} tokens[256];
59
60 // Build up the table used by the unifier.
61 static void
62 build_table(void)
63 {
64         static bool done;
65         if (done) {
66                 return;
67         }
68         done = true;
69
70         memset(tokens, 0, sizeof(tokens));
71         for (unsigned char c = 0; c < 128; c++) {
72                 if (isalpha(c) || c == '_') {
73                         tokens[c].type |= C_ALPHA;
74                 }
75                 if (isdigit(c)) {
76                         tokens[c].type |= C_DIGIT;
77                 }
78                 if (isspace(c)) {
79                         tokens[c].type |= C_SPACE;
80                 }
81                 if (isxdigit(c)) {
82                         tokens[c].type |= C_HEX;
83                 }
84         }
85         tokens['\''].type |= C_QUOTE;
86         tokens['"'].type |= C_QUOTE;
87         tokens['l'].type |= C_FLOAT;
88         tokens['L'].type |= C_FLOAT;
89         tokens['f'].type |= C_FLOAT;
90         tokens['F'].type |= C_FLOAT;
91         tokens['U'].type |= C_FLOAT;
92         tokens['u'].type |= C_FLOAT;
93
94         tokens['-'].type |= C_SIGN;
95         tokens['+'].type |= C_SIGN;
96
97         for (int i = 0; s_tokens[i]; i++) {
98                 unsigned char c = s_tokens[i][0];
99                 tokens[c].type |= C_TOKEN;
100                 tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
101                 tokens[c].num_toks++;
102         }
103 }
104
105 // Buffer up characters before hashing them.
106 static void
107 pushchar(struct hash *hash, unsigned char c)
108 {
109         static unsigned char buf[64];
110         static size_t len;
111
112         if (c == 0) {
113                 if (len > 0) {
114                         hash_buffer(hash, (char *)buf, len);
115                         if (print_unified) {
116                                 printf("%.*s", (int) len, buf);
117                         }
118                         len = 0;
119                 }
120                 hash_buffer(hash, NULL, 0);
121                 return;
122         }
123
124         buf[len++] = c;
125         if (len == 64) {
126                 hash_buffer(hash, (char *)buf, len);
127                 if (print_unified) {
128                         printf("%.*s", (int) len, buf);
129                 }
130                 len = 0;
131         }
132 }
133
134 // Hash some C/C++ code after unifying.
135 static void
136 unify(struct hash *hash, unsigned char *p, size_t size)
137 {
138         build_table();
139
140         for (size_t ofs = 0; ofs < size;) {
141                 if (p[ofs] == '#') {
142                         if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
143                                 do {
144                                         ofs++;
145                                 } while (ofs < size && p[ofs] != '\n');
146                                 ofs++;
147                         } else {
148                                 do {
149                                         pushchar(hash, p[ofs]);
150                                         ofs++;
151                                 } while (ofs < size && p[ofs] != '\n');
152                                 pushchar(hash, '\n');
153                                 ofs++;
154                         }
155                         continue;
156                 }
157
158                 if (tokens[p[ofs]].type & C_ALPHA) {
159                         do {
160                                 pushchar(hash, p[ofs]);
161                                 ofs++;
162                         } while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
163                         pushchar(hash, '\n');
164                         continue;
165                 }
166
167                 if (tokens[p[ofs]].type & C_DIGIT) {
168                         do {
169                                 pushchar(hash, p[ofs]);
170                                 ofs++;
171                         } while (ofs < size &&
172                                  ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
173                         if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
174                                 do {
175                                         pushchar(hash, p[ofs]);
176                                         ofs++;
177                                 } while (ofs < size && (tokens[p[ofs]].type & C_HEX));
178                         }
179                         if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
180                                 pushchar(hash, p[ofs]);
181                                 ofs++;
182                                 while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
183                                         pushchar(hash, p[ofs]);
184                                         ofs++;
185                                 }
186                         }
187                         while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
188                                 pushchar(hash, p[ofs]);
189                                 ofs++;
190                         }
191                         pushchar(hash, '\n');
192                         continue;
193                 }
194
195                 if (tokens[p[ofs]].type & C_SPACE) {
196                         do {
197                                 ofs++;
198                         } while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
199                         continue;
200                 }
201
202                 if (tokens[p[ofs]].type & C_QUOTE) {
203                         unsigned char q = p[ofs];
204                         pushchar(hash, p[ofs]);
205                         do {
206                                 ofs++;
207                                 while (ofs < size-1 && p[ofs] == '\\') {
208                                         pushchar(hash, p[ofs]);
209                                         pushchar(hash, p[ofs+1]);
210                                         ofs += 2;
211                                 }
212                                 pushchar(hash, p[ofs]);
213                         } while (ofs < size && p[ofs] != q);
214                         pushchar(hash, '\n');
215                         ofs++;
216                         continue;
217                 }
218
219                 if (tokens[p[ofs]].type & C_TOKEN) {
220                         unsigned char q = p[ofs];
221                         int i;
222                         for (i = 0; i < tokens[q].num_toks; i++) {
223                                 const unsigned char *s = (const unsigned char *)tokens[q].toks[i];
224                                 int len = strlen((const char *)s);
225                                 if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
226                                         int j;
227                                         for (j = 0; s[j]; j++) {
228                                                 pushchar(hash, s[j]);
229                                                 ofs++;
230                                         }
231                                         pushchar(hash, '\n');
232                                         break;
233                                 }
234                         }
235                         if (i < tokens[q].num_toks) {
236                                 continue;
237                         }
238                 }
239
240                 pushchar(hash, p[ofs]);
241                 pushchar(hash, '\n');
242                 ofs++;
243         }
244         pushchar(hash, 0);
245 }
246
247
248 // Hash a file that consists of preprocessor output, but remove any line number
249 // information from the hash.
250 int
251 unify_hash(struct hash *hash, const char *fname, bool debug)
252 {
253         char *data;
254         size_t size;
255         if (!read_file(fname, 0, &data, &size)) {
256                 stats_update(STATS_PREPROCESSOR);
257                 return -1;
258         }
259         print_unified = debug;
260         unify(hash, (unsigned char *)data, size);
261         free(data);
262         return 0;
263 }