src/unify.c

   1 // Copyright (C) 2002 Andrew Tridgell
   2 // Copyright (C) 2009-2018 Joel Rosdahl
   3 //
   4 // This program is free software; you can redistribute it and/or modify it
   5 // under the terms of the GNU General Public License as published by the Free
   6 // Software Foundation; either version 3 of the License, or (at your option)
   7 // any later version.
   8 //
   9 // This program is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  12 // more details.
  13 //
  14 // You should have received a copy of the GNU General Public License along with
  15 // this program; if not, write to the Free Software Foundation, Inc., 51
  16 // Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17
  18 // C/C++ unifier
  19 //
  20 // The idea is that changes that don't affect the resulting C code should not
  21 // change the hash. This is achieved by folding white-space and other
  22 // non-semantic fluff in the input into a single unified format.
  23 //
  24 // This unifier was designed to match the output of the unifier in compilercache,
  25 // which is flex based. The major difference is that this unifier is much
  26 // faster (about 2x) and more forgiving of syntactic errors. Continuing on
  27 // syntactic errors is important to cope with C/C++ extensions in the local
  28 // compiler (for example, inline assembly systems).
  29
  30 #include "ccache.h"
  31 #include "hash.h"
  32 #include "unify.h"
  33
  34 static bool print_unified = true;
  35
  36 static const char *const s_tokens[] = {
  37         "...", ">>=", "<<=", "+=", "-=", "*=", "/=", "%=", "&=", "^=",
  38         "|=",  ">>",  "<<",  "++", "--", "->", "&&", "||", "<=", ">=",
  39         "==",  "!=",  ";",   "{",  "<%", "}",  "%>", ",",  ":",  "=",
  40         "(",   ")",   "[",   "<:", "]",  ":>", ".",  "&",  "!",  "~",
  41         "-",   "+",   "*",   "/",  "%",  "<",  ">",  "^",  "|",  "?",
  42         0
  43 };
  44
  45 #define C_ALPHA 1
  46 #define C_SPACE 2
  47 #define C_TOKEN 4
  48 #define C_QUOTE 8
  49 #define C_DIGIT 16
  50 #define C_HEX   32
  51 #define C_FLOAT 64
  52 #define C_SIGN  128
  53
  54 static struct {
  55         unsigned char type;
  56         unsigned char num_toks;
  57         const char *toks[7];
  58 } tokens[256];
  59
  60 // Build up the table used by the unifier.
  61 static void
  62 build_table(void)
  63 {
  64         static bool done;
  65         if (done) {
  66                 return;
  67         }
  68         done = true;
  69
  70         memset(tokens, 0, sizeof(tokens));
  71         for (unsigned char c = 0; c < 128; c++) {
  72                 if (isalpha(c) || c == '_') {
  73                         tokens[c].type |= C_ALPHA;
  74                 }
  75                 if (isdigit(c)) {
  76                         tokens[c].type |= C_DIGIT;
  77                 }
  78                 if (isspace(c)) {
  79                         tokens[c].type |= C_SPACE;
  80                 }
  81                 if (isxdigit(c)) {
  82                         tokens[c].type |= C_HEX;
  83                 }
  84         }
  85         tokens['\''].type |= C_QUOTE;
  86         tokens['"'].type |= C_QUOTE;
  87         tokens['l'].type |= C_FLOAT;
  88         tokens['L'].type |= C_FLOAT;
  89         tokens['f'].type |= C_FLOAT;
  90         tokens['F'].type |= C_FLOAT;
  91         tokens['U'].type |= C_FLOAT;
  92         tokens['u'].type |= C_FLOAT;
  93
  94         tokens['-'].type |= C_SIGN;
  95         tokens['+'].type |= C_SIGN;
  96
  97         for (int i = 0; s_tokens[i]; i++) {
  98                 unsigned char c = s_tokens[i][0];
  99                 tokens[c].type |= C_TOKEN;
 100                 tokens[c].toks[tokens[c].num_toks] = s_tokens[i];
 101                 tokens[c].num_toks++;
 102         }
 103 }
 104
 105 // Buffer up characters before hashing them.
 106 static void
 107 pushchar(struct hash *hash, unsigned char c)
 108 {
 109         static unsigned char buf[64];
 110         static size_t len;
 111
 112         if (c == 0) {
 113                 if (len > 0) {
 114                         hash_buffer(hash, (char *)buf, len);
 115                         if (print_unified) {
 116                                 printf("%.*s", (int) len, buf);
 117                         }
 118                         len = 0;
 119                 }
 120                 hash_buffer(hash, NULL, 0);
 121                 return;
 122         }
 123
 124         buf[len++] = c;
 125         if (len == 64) {
 126                 hash_buffer(hash, (char *)buf, len);
 127                 if (print_unified) {
 128                         printf("%.*s", (int) len, buf);
 129                 }
 130                 len = 0;
 131         }
 132 }
 133
 134 // Hash some C/C++ code after unifying.
 135 static void
 136 unify(struct hash *hash, unsigned char *p, size_t size)
 137 {
 138         build_table();
 139
 140         for (size_t ofs = 0; ofs < size;) {
 141                 if (p[ofs] == '#') {
 142                         if ((size-ofs) > 2 && p[ofs+1] == ' ' && isdigit(p[ofs+2])) {
 143                                 do {
 144                                         ofs++;
 145                                 } while (ofs < size && p[ofs] != '\n');
 146                                 ofs++;
 147                         } else {
 148                                 do {
 149                                         pushchar(hash, p[ofs]);
 150                                         ofs++;
 151                                 } while (ofs < size && p[ofs] != '\n');
 152                                 pushchar(hash, '\n');
 153                                 ofs++;
 154                         }
 155                         continue;
 156                 }
 157
 158                 if (tokens[p[ofs]].type & C_ALPHA) {
 159                         do {
 160                                 pushchar(hash, p[ofs]);
 161                                 ofs++;
 162                         } while (ofs < size && (tokens[p[ofs]].type & (C_ALPHA|C_DIGIT)));
 163                         pushchar(hash, '\n');
 164                         continue;
 165                 }
 166
 167                 if (tokens[p[ofs]].type & C_DIGIT) {
 168                         do {
 169                                 pushchar(hash, p[ofs]);
 170                                 ofs++;
 171                         } while (ofs < size &&
 172                                  ((tokens[p[ofs]].type & C_DIGIT) || p[ofs] == '.'));
 173                         if (ofs < size && (p[ofs] == 'x' || p[ofs] == 'X')) {
 174                                 do {
 175                                         pushchar(hash, p[ofs]);
 176                                         ofs++;
 177                                 } while (ofs < size && (tokens[p[ofs]].type & C_HEX));
 178                         }
 179                         if (ofs < size && (p[ofs] == 'E' || p[ofs] == 'e')) {
 180                                 pushchar(hash, p[ofs]);
 181                                 ofs++;
 182                                 while (ofs < size && (tokens[p[ofs]].type & (C_DIGIT|C_SIGN))) {
 183                                         pushchar(hash, p[ofs]);
 184                                         ofs++;
 185                                 }
 186                         }
 187                         while (ofs < size && (tokens[p[ofs]].type & C_FLOAT)) {
 188                                 pushchar(hash, p[ofs]);
 189                                 ofs++;
 190                         }
 191                         pushchar(hash, '\n');
 192                         continue;
 193                 }
 194
 195                 if (tokens[p[ofs]].type & C_SPACE) {
 196                         do {
 197                                 ofs++;
 198                         } while (ofs < size && (tokens[p[ofs]].type & C_SPACE));
 199                         continue;
 200                 }
 201
 202                 if (tokens[p[ofs]].type & C_QUOTE) {
 203                         unsigned char q = p[ofs];
 204                         pushchar(hash, p[ofs]);
 205                         do {
 206                                 ofs++;
 207                                 while (ofs < size-1 && p[ofs] == '\\') {
 208                                         pushchar(hash, p[ofs]);
 209                                         pushchar(hash, p[ofs+1]);
 210                                         ofs += 2;
 211                                 }
 212                                 pushchar(hash, p[ofs]);
 213                         } while (ofs < size && p[ofs] != q);
 214                         pushchar(hash, '\n');
 215                         ofs++;
 216                         continue;
 217                 }
 218
 219                 if (tokens[p[ofs]].type & C_TOKEN) {
 220                         unsigned char q = p[ofs];
 221                         int i;
 222                         for (i = 0; i < tokens[q].num_toks; i++) {
 223                                 const unsigned char *s = (const unsigned char *)tokens[q].toks[i];
 224                                 int len = strlen((const char *)s);
 225                                 if (size >= ofs+len && memcmp(&p[ofs], s, len) == 0) {
 226                                         int j;
 227                                         for (j = 0; s[j]; j++) {
 228                                                 pushchar(hash, s[j]);
 229                                                 ofs++;
 230                                         }
 231                                         pushchar(hash, '\n');
 232                                         break;
 233                                 }
 234                         }
 235                         if (i < tokens[q].num_toks) {
 236                                 continue;
 237                         }
 238                 }
 239
 240                 pushchar(hash, p[ofs]);
 241                 pushchar(hash, '\n');
 242                 ofs++;
 243         }
 244         pushchar(hash, 0);
 245 }
 246
 247
 248 // Hash a file that consists of preprocessor output, but remove any line number
 249 // information from the hash.
 250 int
 251 unify_hash(struct hash *hash, const char *fname, bool debug)
 252 {
 253         char *data;
 254         size_t size;
 255         if (!read_file(fname, 0, &data, &size)) {
 256                 stats_update(STATS_PREPROCESSOR);
 257                 return -1;
 258         }
 259         print_unified = debug;
 260         unify(hash, (unsigned char *)data, size);
 261         free(data);
 262         return 0;
 263 }