unify.c

   1 /*
   2  * Copyright (C) 2002 Andrew Tridgell
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the GNU General Public License as published by the Free
   6  * Software Foundation; either version 3 of the License, or (at your option)
   7  * any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12  * more details.
  13  *
  14  * You should have received a copy of the GNU General Public License along with
  15  * this program; if not, write to the Free Software Foundation, Inc., 51
  16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17 */
  18
  19 /*
  20  * C/C++ unifier
  21  *
  22  * The idea is that changes that don't affect the resulting C code should not
  23  * change the hash. This is achieved by folding white-space and other
  24  * non-semantic fluff in the input into a single unified format.
  25  *
  26  * This unifier was designed to match the output of the unifier in compilercache,
  27  * which is flex based. The major difference is that this unifier is much
  28  * faster (about 2x) and more forgiving of syntactic errors. Continuing on
  29  * syntactic errors is important to cope with C/C++ extensions in the local
  30  * compiler (for example, inline assembly systems).
  31  */
  32
  33 #include "ccache.h"
  34
  35 static const char *const s_tokens[] = {
  36         "...", ">>=", "<<=", "+=", "-=", "*=", "/=", "%=", "&=", "^=",
  37         "|=",  ">>",  "<<",  "++", "--", "->", "&&", "||", "<=", ">=",
  38         "==",  "!=",  ";",   "{",  "<%", "}",  "%>", ",",  ":",  "=",
  39         "(",   ")",   "[",   "<:", "]",  ":>", ".",  "&",  "!",  "~",
  40         "-",   "+",   "*",   "/",  "%",  "<",  ">",  "^",  "|",  "?",
  41         0
  42 };
  43
  44 #define C_ALPHA 1
  45 #define C_SPACE 2
  46 #define C_TOKEN 4
  47 #define C_QUOTE 8
  48 #define C_DIGIT 16
  49 #define C_HEX   32
  50 #define C_FLOAT 64
  51 #define C_SIGN  128
  52
  53 static struct {
  54         unsigned char type;
  55         unsigned char num_toks;
  56         const char *toks[7];
  57 } tokens[256];
  58
  59 /* build up the table used by the unifier */
  60 static void
  61 build_table(void)
  62 {
  63         unsigned char c;
  64         int i;
  65         static bool done;
  66
  67         if (done) return;
  68         done = true;
  69
  70         memset(tokens, 0, sizeof(tokens));
  71         for (c = 0; c < 128; c++) {
  72                 if (isalpha(c) || c == '_') tokens[c].type |= C_ALPHA;
  73                 if (isdigit(c)) tokens[c].type |= C_DIGIT;
  74                 if (isspace(c)) tokens[c].type |= C_SPACE;
  75                 if (isxdigit(c)) tokens[c].type |= C_HEX;
  76         }
  77         tokens['\''].type |= C_QUOTE;
  78         tokens['"'].type |= C_QUOTE;
  79         tokens['l'].type |= C_FLOAT;
  80         tokens['L'].type |= C_FLOAT;
  81         tokens['f'].type |= C_FLOAT;
  82         tokens['F'].type |= C_FLOAT;
  83         tokens['U'].type |= C_FLOAT;
  84         tokens['u'].type |= C_FLOAT;
  85
  86         tokens['-'].type |= C_SIGN;
  87         tokens['+'].type |= C_SIGN;
  88
  89         for (i = 0; s_tokens[i]; i++) {
  90                 c = s_tokens[i][0];
  91                 tokens[c].type |= C_TOKEN;
  92                 tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
  93                 tokens[c].num_toks++;
  94         }
  95 }
  96
  97 /* buffer up characters before hashing them */
  98 static void
  99 pushchar(struct mdfour *hash, unsigned char c)
 100 {
 101         static unsigned char buf[64];
 102         static size_t len;
 103
 104         if (c == 0) {
 105                 if (len > 0) {
 106                         hash_buffer(hash, (char *)buf, len);
 107                         len = 0;
 108                 }
 109                 hash_buffer(hash, NULL, 0);
 110                 return;
 111         }
 112
 113         buf[len++] = c;
 114         if (len == 64) {
 115                 hash_buffer(hash, (char *)buf, len);
 116                 len = 0;
 117         }
 118 }
 119
 120 /* hash some C/C++ code after unifying */
 121 static void
 122 unify(struct mdfour *hash, unsigned char *p, size_t size)
 123 {
 124         size_t ofs;
 125         unsigned char q;
 126         int i;
 127
 128         build_table();
 129
 130         for (ofs = 0; ofs < size;) {
 131                 if (p[ofs] == '#') {
 132                         if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
 133                                 do {
 134                                         ofs++;
 135                                 } while (ofs < size && p[ofs] != '\n');
 136                                 ofs++;
 137                         } else {
 138                                 do {
 139                                         pushchar(hash, p[ofs]);
 140                                         ofs++;
 141                                 } while (ofs < size && p[ofs] != '\n');
 142                                 pushchar(hash, '\n');
 143                                 ofs++;
 144                         }
 145                         continue;
 146                 }
 147
 148                 if (tokens[p[ofs]].type & C_ALPHA) {
 149                         do {
 150                                 pushchar(hash, p[ofs]);
 151                                 ofs++;
 152                         } while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
 153                         pushchar(hash, '\n');
 154                         continue;
 155                 }
 156
 157                 if (tokens[p[ofs]].type & C_DIGIT) {
 158                         do {
 159                                 pushchar(hash, p[ofs]);
 160                                 ofs++;
 161                         } while (ofs < size &&
 162                                  ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
 163                         if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
 164                                 do {
 165                                         pushchar(hash, p[ofs]);
 166                                         ofs++;
 167                                 } while (ofs < size && (tokens[p[ofs]].type & C_HEX));
 168                         }
 169                         if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
 170                                 pushchar(hash, p[ofs]);
 171                                 ofs++;
 172                                 while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
 173                                         pushchar(hash, p[ofs]);
 174                                         ofs++;
 175                                 }
 176                         }
 177                         while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
 178                                 pushchar(hash, p[ofs]);
 179                                 ofs++;
 180                         }
 181                         pushchar(hash, '\n');
 182                         continue;
 183                 }
 184
 185                 if (tokens[p[ofs]].type & C_SPACE) {
 186                         do {
 187                                 ofs++;
 188                         } while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
 189                         continue;
 190                 }
 191
 192                 if (tokens[p[ofs]].type & C_QUOTE) {
 193                         q = p[ofs];
 194                         pushchar(hash, p[ofs]);
 195                         do {
 196                                 ofs++;
 197                                 while (ofs < size-1 && p[ofs] == '\\') {
 198                                         pushchar(hash, p[ofs]);
 199                                         pushchar(hash, p[ofs+1]);
 200                                         ofs += 2;
 201                                 }
 202                                 pushchar(hash, p[ofs]);
 203                         } while (ofs < size && p[ofs] != q);
 204                         pushchar(hash, '\n');
 205                         ofs++;
 206                         continue;
 207                 }
 208
 209                 if (tokens[p[ofs]].type & C_TOKEN) {
 210                         q = p[ofs];
 211                         for (i = 0; i < tokens[q].num_toks; i++) {
 212                                 unsigned char *s = (unsigned char *)tokens[q].toks[i];
 213                                 int len = strlen((char *)s);
 214                                 if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
 215                                         int j;
 216                                         for (j = 0; s[j]; j++) {
 217                                                 pushchar(hash, s[j]);
 218                                                 ofs++;
 219                                         }
 220                                         pushchar(hash, '\n');
 221                                         break;
 222                                 }
 223                         }
 224                         if (i < tokens[q].num_toks) {
 225                                 continue;
 226                         }
 227                 }
 228
 229                 pushchar(hash, p[ofs]);
 230                 pushchar(hash, '\n');
 231                 ofs++;
 232         }
 233         pushchar(hash, 0);
 234 }
 235
 236
 237 /* hash a file that consists of preprocessor output, but remove any line
 238    number information from the hash
 239 */
 240 int
 241 unify_hash(struct mdfour *hash, const char *fname)
 242 {
 243         char *data;
 244         size_t size;
 245
 246         if (!read_file(fname, 0, &data, &size)) {
 247                 stats_update(STATS_PREPROCESSOR);
 248                 return -1;
 249         }
 250         unify(hash, (unsigned char *)data, size);
 251         free(data);
 252         return 0;
 253 }