CCache/unify.c

   1 /*
   2    Copyright (C) Andrew Tridgell 2002
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17 */
  18 /*
  19   C/C++ unifier
  20
  21   the idea is that changes that don't affect the resulting C code
  22   should not change the hash. This is achieved by folding white-space
  23   and other non-semantic fluff in the input into a single unified format.
  24
  25   This unifier was designed to match the output of the unifier in
  26   compilercache, which is flex based. The major difference is that
  27   this unifier is much faster (about 2x) and more forgiving of
  28   syntactic errors. Continuing on syntactic errors is important to
  29   cope with C/C++ extensions in the local compiler (for example,
  30   inline assembly systems).
  31 */
  32
  33 #include "ccache.h"
  34
  35 static char *s_tokens[] = {
  36         "...",  ">>=",  "<<=",  "+=",   "-=",   "*=",   "/=",   "%=",   "&=",   "^=",
  37         "|=",   ">>",   "<<",   "++",   "--",   "->",   "&&",   "||",   "<=",   ">=",
  38         "==",   "!=",   ";",    "{",    "<%",   "}",    "%>",   ",",    ":",    "=",
  39         "(",    ")",    "[",    "<:",   "]",    ":>",   ".",    "&",    "!",    "~",
  40         "-",    "+",    "*",    "/",    "%",    "<",    ">",    "^",    "|",    "?",
  41         0
  42 };
  43
  44 #define C_ALPHA 1
  45 #define C_SPACE 2
  46 #define C_TOKEN 4
  47 #define C_QUOTE 8
  48 #define C_DIGIT 16
  49 #define C_HEX   32
  50 #define C_FLOAT 64
  51 #define C_SIGN  128
  52
  53 static struct {
  54         unsigned char type;
  55         unsigned char num_toks;
  56         char *toks[7];
  57 } tokens[256];
  58
  59 /* build up the table used by the unifier */
  60 static void build_table(void)
  61 {
  62         unsigned char c;
  63         int i;
  64         static int done;
  65
  66         if (done) return;
  67         done = 1;
  68
  69         memset(tokens, 0, sizeof(tokens));
  70         for (c=0;c<128;c++) {
  71                 if (isalpha(c) || c == '_') tokens[c].type |= C_ALPHA;
  72                 if (isdigit(c)) tokens[c].type |= C_DIGIT;
  73                 if (isspace(c)) tokens[c].type |= C_SPACE;
  74                 if (isxdigit(c)) tokens[c].type |= C_HEX;
  75         }
  76         tokens['\''].type |= C_QUOTE;
  77         tokens['"'].type |= C_QUOTE;
  78         tokens['l'].type |= C_FLOAT;
  79         tokens['L'].type |= C_FLOAT;
  80         tokens['f'].type |= C_FLOAT;
  81         tokens['F'].type |= C_FLOAT;
  82         tokens['U'].type |= C_FLOAT;
  83         tokens['u'].type |= C_FLOAT;
  84
  85         tokens['-'].type |= C_SIGN;
  86         tokens['+'].type |= C_SIGN;
  87
  88         for (i=0;s_tokens[i];i++) {
  89                 c = s_tokens[i][0];
  90                 tokens[c].type |= C_TOKEN;
  91                 tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
  92                 tokens[c].num_toks++;
  93         }
  94 }
  95
  96 /* buffer up characters before hashing them */
  97 static void pushchar(unsigned char c)
  98 {
  99         static unsigned char buf[64];
 100         static int len;
 101
 102         if (c == 0) {
 103                 if (len > 0) {
 104                         hash_buffer((char *)buf, len);
 105                         len = 0;
 106                 }
 107                 hash_buffer(NULL, 0);
 108                 return;
 109         }
 110
 111         buf[len++] = c;
 112         if (len == 64) {
 113                 hash_buffer((char *)buf, len);
 114                 len = 0;
 115         }
 116 }
 117
 118 /* hash some C/C++ code after unifying */
 119 static void unify(unsigned char *p, size_t size)
 120 {
 121         size_t ofs;
 122         unsigned char q;
 123         int i;
 124
 125         build_table();
 126
 127         for (ofs=0; ofs<size;) {
 128                 if (p[ofs] == '#') {
 129                         if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
 130                                 do {
 131                                         ofs++;
 132                                 } while (ofs < size && p[ofs] != '\n');
 133                                 ofs++;
 134                         } else {
 135                                 do {
 136                                         pushchar(p[ofs]);
 137                                         ofs++;
 138                                 } while (ofs < size && p[ofs] != '\n');
 139                                 pushchar('\n');
 140                                 ofs++;
 141                         }
 142                         continue;
 143                 }
 144
 145                 if (tokens[p[ofs]].type & C_ALPHA) {
 146                         do {
 147                                 pushchar(p[ofs]);
 148                                 ofs++;
 149                         } while (ofs < size &&
 150                                  (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
 151                         pushchar('\n');
 152                         continue;
 153                 }
 154
 155                 if (tokens[p[ofs]].type & C_DIGIT) {
 156                         do {
 157                                 pushchar(p[ofs]);
 158                                 ofs++;
 159                         } while (ofs < size &&
 160                                  ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
 161                         if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
 162                                 do {
 163                                         pushchar(p[ofs]);
 164                                         ofs++;
 165                                 } while (ofs < size && (tokens[p[ofs]].type & C_HEX));
 166                         }
 167                         if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
 168                                 pushchar(p[ofs]);
 169                                 ofs++;
 170                                 while (ofs < size &&
 171                                        (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
 172                                         pushchar(p[ofs]);
 173                                         ofs++;
 174                                 }
 175                         }
 176                         while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
 177                                 pushchar(p[ofs]);
 178                                 ofs++;
 179                         }
 180                         pushchar('\n');
 181                         continue;
 182                 }
 183
 184                 if (tokens[p[ofs]].type & C_SPACE) {
 185                         do {
 186                                 ofs++;
 187                         } while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
 188                         continue;
 189                 }
 190
 191                 if (tokens[p[ofs]].type & C_QUOTE) {
 192                         q = p[ofs];
 193                         pushchar(p[ofs]);
 194                         do {
 195                                 ofs++;
 196                                 while (ofs < size-1 && p[ofs] == '\\') {
 197                                         pushchar(p[ofs]);
 198                                         pushchar(p[ofs+1]);
 199                                         ofs+=2;
 200                                 }
 201                                 pushchar(p[ofs]);
 202                         } while (ofs < size && p[ofs] != q);
 203                         pushchar('\n');
 204                         ofs++;
 205                         continue;
 206                 }
 207
 208                 if (tokens[p[ofs]].type & C_TOKEN) {
 209                         q = p[ofs];
 210                         for (i=0;i<tokens[q].num_toks;i++) {
 211                                 unsigned char *s = (unsigned char *)tokens[q].toks[i];
 212                                 int len = strlen((char *)s);
 213                                 if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
 214                                         int j;
 215                                         for (j=0;s[j];j++) {
 216                                                 pushchar(s[j]);
 217                                                 ofs++;
 218                                         }
 219                                         pushchar('\n');
 220                                         break;
 221                                 }
 222                         }
 223                         if (i < tokens[q].num_toks) {
 224                                 continue;
 225                         }
 226                 }
 227
 228                 pushchar(p[ofs]);
 229                 pushchar('\n');
 230                 ofs++;
 231         }
 232         pushchar(0);
 233 }
 234
 235
 236 /* hash a file that consists of preprocessor output, but remove any line
 237    number information from the hash
 238 */
 239 int unify_hash(const char *fname)
 240 {
 241 #ifdef _WIN32
 242         HANDLE file;
 243         HANDLE section;
 244         DWORD filesize_low;
 245         char *map;
 246         int ret = -1;
 247
 248         file = CreateFileA(fname, GENERIC_READ, FILE_SHARE_READ, NULL,
 249                            OPEN_EXISTING, 0, NULL);
 250         if (file != INVALID_HANDLE_VALUE) {
 251                 filesize_low = GetFileSize(file, NULL);
 252                 if (!(filesize_low == INVALID_FILE_SIZE && GetLastError() != NO_ERROR)) {
 253                         section = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, 0, NULL);
 254                         CloseHandle(file);
 255                         if (section != NULL) {
 256                                 map = MapViewOfFile(section, FILE_MAP_READ, 0, 0, 0);
 257                                 CloseHandle(section);
 258                                 if (map != NULL)
 259                                         ret = 0;
 260                         }
 261                 }
 262         }
 263
 264         if (ret == -1) {
 265                 cc_log("Failed to open preprocessor output %s\n", fname);
 266                 stats_update(STATS_PREPROCESSOR);
 267                 return -1;
 268         }
 269
 270         /* pass it through the unifier */
 271         unify((unsigned char *)map, filesize_low);
 272
 273         UnmapViewOfFile(map);
 274
 275         return 0;
 276 #else
 277         int fd;
 278         struct stat st;
 279         char *map;
 280
 281         fd = open(fname, O_RDONLY|O_BINARY);
 282         if (fd == -1 || fstat(fd, &st) != 0) {
 283                 cc_log("Failed to open preprocessor output %s\n", fname);
 284                 stats_update(STATS_PREPROCESSOR);
 285                 return -1;
 286         }
 287
 288         /* we use mmap() to make it easy to handle arbitrarily long
 289            lines in preprocessor output. I have seen lines of over
 290            100k in length, so this is well worth it */
 291         map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
 292         if (map == (char *)-1) {
 293                 cc_log("Failed to mmap %s\n", fname);
 294                 stats_update(STATS_PREPROCESSOR);
 295                 return -1;
 296         }
 297         close(fd);
 298
 299         /* pass it through the unifier */
 300         unify((unsigned char *)map, st.st_size);
 301
 302         munmap(map, st.st_size);
 303
 304         return 0;
 305 #endif
 306 }
 307