src/analyze.c

   1 /* Analyze file differences for GNU DIFF.
   2
   3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006-2007,
   4    2009-2013, 2015-2021 Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    This program is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  20
  21 #include "diff.h"
  22 #include <cmpbuf.h>
  23 #include <error.h>
  24 #include <file-type.h>
  25 #include <xalloc.h>
  26
  27 /* The core of the Diff algorithm.  */
  28 #define ELEMENT lin
  29 #define EQUAL(x,y) ((x) == (y))
  30 #define OFFSET lin
  31 #define EXTRA_CONTEXT_FIELDS /* none */
  32 #define NOTE_DELETE(c, xoff) (files[0].changed[files[0].realindexes[xoff]] = 1)
  33 #define NOTE_INSERT(c, yoff) (files[1].changed[files[1].realindexes[yoff]] = 1)
  34 #define USE_HEURISTIC 1
  35 #include <diffseq.h>
  36
  37 /* Discard lines from one file that have no matches in the other file.
  38
  39    A line which is discarded will not be considered by the actual
  40    comparison algorithm; it will be as if that line were not in the file.
  41    The file's 'realindexes' table maps virtual line numbers
  42    (which don't count the discarded lines) into real line numbers;
  43    this is how the actual comparison algorithm produces results
  44    that are comprehensible when the discarded lines are counted.
  45
  46    When we discard a line, we also mark it as a deletion or insertion
  47    so that it will be printed in the output.  */
  48
  49 static void
  50 discard_confusing_lines (struct file_data filevec[])
  51 {
  52   int f;
  53   lin i;
  54   char *discarded[2];
  55   lin *equiv_count[2];
  56   lin *p;
  57
  58   /* Allocate our results.  */
  59   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
  60                * (2 * sizeof *p));
  61   for (f = 0; f < 2; f++)
  62     {
  63       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
  64       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
  65     }
  66
  67   /* Set up equiv_count[F][I] as the number of lines in file F
  68      that fall in equivalence class I.  */
  69
  70   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
  71   equiv_count[0] = p;
  72   equiv_count[1] = p + filevec[0].equiv_max;
  73
  74   for (i = 0; i < filevec[0].buffered_lines; ++i)
  75     ++equiv_count[0][filevec[0].equivs[i]];
  76   for (i = 0; i < filevec[1].buffered_lines; ++i)
  77     ++equiv_count[1][filevec[1].equivs[i]];
  78
  79   /* Set up tables of which lines are going to be discarded.  */
  80
  81   discarded[0] = zalloc (filevec[0].buffered_lines
  82                          + filevec[1].buffered_lines);
  83   discarded[1] = discarded[0] + filevec[0].buffered_lines;
  84
  85   /* Mark to be discarded each line that matches no line of the other file.
  86      If a line matches many lines, mark it as provisionally discardable.  */
  87
  88   for (f = 0; f < 2; f++)
  89     {
  90       size_t end = filevec[f].buffered_lines;
  91       char *discards = discarded[f];
  92       lin *counts = equiv_count[1 - f];
  93       lin *equivs = filevec[f].equivs;
  94       size_t many = 5;
  95       size_t tem = end / 64;
  96
  97       /* Multiply MANY by approximate square root of number of lines.
  98          That is the threshold for provisionally discardable lines.  */
  99       while ((tem = tem >> 2) > 0)
 100         many *= 2;
 101
 102       for (i = 0; i < end; i++)
 103         {
 104           lin nmatch;
 105           if (equivs[i] == 0)
 106             continue;
 107           nmatch = counts[equivs[i]];
 108           if (nmatch == 0)
 109             discards[i] = 1;
 110           else if (nmatch > many)
 111             discards[i] = 2;
 112         }
 113     }
 114
 115   /* Don't really discard the provisional lines except when they occur
 116      in a run of discardables, with nonprovisionals at the beginning
 117      and end.  */
 118
 119   for (f = 0; f < 2; f++)
 120     {
 121       lin end = filevec[f].buffered_lines;
 122       register char *discards = discarded[f];
 123
 124       for (i = 0; i < end; i++)
 125         {
 126           /* Cancel provisional discards not in middle of run of discards.  */
 127           if (discards[i] == 2)
 128             discards[i] = 0;
 129           else if (discards[i] != 0)
 130             {
 131               /* We have found a nonprovisional discard.  */
 132               register lin j;
 133               lin length;
 134               lin provisional = 0;
 135
 136               /* Find end of this run of discardable lines.
 137                  Count how many are provisionally discardable.  */
 138               for (j = i; j < end; j++)
 139                 {
 140                   if (discards[j] == 0)
 141                     break;
 142                   if (discards[j] == 2)
 143                     ++provisional;
 144                 }
 145
 146               /* Cancel provisional discards at end, and shrink the run.  */
 147               while (j > i && discards[j - 1] == 2)
 148                 discards[--j] = 0, --provisional;
 149
 150               /* Now we have the length of a run of discardable lines
 151                  whose first and last are not provisional.  */
 152               length = j - i;
 153
 154               /* If 1/4 of the lines in the run are provisional,
 155                  cancel discarding of all provisional lines in the run.  */
 156               if (provisional * 4 > length)
 157                 {
 158                   while (j > i)
 159                     if (discards[--j] == 2)
 160                       discards[j] = 0;
 161                 }
 162               else
 163                 {
 164                   register lin consec;
 165                   lin minimum = 1;
 166                   lin tem = length >> 2;
 167
 168                   /* MINIMUM is approximate square root of LENGTH/4.
 169                      A subrun of two or more provisionals can stand
 170                      when LENGTH is at least 16.
 171                      A subrun of 4 or more can stand when LENGTH >= 64.  */
 172                   while (0 < (tem >>= 2))
 173                     minimum <<= 1;
 174                   minimum++;
 175
 176                   /* Cancel any subrun of MINIMUM or more provisionals
 177                      within the larger run.  */
 178                   for (j = 0, consec = 0; j < length; j++)
 179                     if (discards[i + j] != 2)
 180                       consec = 0;
 181                     else if (minimum == ++consec)
 182                       /* Back up to start of subrun, to cancel it all.  */
 183                       j -= consec;
 184                     else if (minimum < consec)
 185                       discards[i + j] = 0;
 186
 187                   /* Scan from beginning of run
 188                      until we find 3 or more nonprovisionals in a row
 189                      or until the first nonprovisional at least 8 lines in.
 190                      Until that point, cancel any provisionals.  */
 191                   for (j = 0, consec = 0; j < length; j++)
 192                     {
 193                       if (j >= 8 && discards[i + j] == 1)
 194                         break;
 195                       if (discards[i + j] == 2)
 196                         consec = 0, discards[i + j] = 0;
 197                       else if (discards[i + j] == 0)
 198                         consec = 0;
 199                       else
 200                         consec++;
 201                       if (consec == 3)
 202                         break;
 203                     }
 204
 205                   /* I advances to the last line of the run.  */
 206                   i += length - 1;
 207
 208                   /* Same thing, from end.  */
 209                   for (j = 0, consec = 0; j < length; j++)
 210                     {
 211                       if (j >= 8 && discards[i - j] == 1)
 212                         break;
 213                       if (discards[i - j] == 2)
 214                         consec = 0, discards[i - j] = 0;
 215                       else if (discards[i - j] == 0)
 216                         consec = 0;
 217                       else
 218                         consec++;
 219                       if (consec == 3)
 220                         break;
 221                     }
 222                 }
 223             }
 224         }
 225     }
 226
 227   /* Actually discard the lines. */
 228   for (f = 0; f < 2; f++)
 229     {
 230       char *discards = discarded[f];
 231       lin end = filevec[f].buffered_lines;
 232       lin j = 0;
 233       for (i = 0; i < end; ++i)
 234         if (minimal || discards[i] == 0)
 235           {
 236             filevec[f].undiscarded[j] = filevec[f].equivs[i];
 237             filevec[f].realindexes[j++] = i;
 238           }
 239         else
 240           filevec[f].changed[i] = 1;
 241       filevec[f].nondiscarded_lines = j;
 242     }
 243
 244   free (discarded[0]);
 245   free (equiv_count[0]);
 246 }
 247 \f
 248 /* Adjust inserts/deletes of identical lines to join changes
 249    as much as possible.
 250
 251    We do something when a run of changed lines include a
 252    line at one end and have an excluded, identical line at the other.
 253    We are free to choose which identical line is included.
 254    'compareseq' usually chooses the one at the beginning,
 255    but usually it is cleaner to consider the following identical line
 256    to be the "change".  */
 257
 258 static void
 259 shift_boundaries (struct file_data filevec[])
 260 {
 261   int f;
 262
 263   for (f = 0; f < 2; f++)
 264     {
 265       char *changed = filevec[f].changed;
 266       char *other_changed = filevec[1 - f].changed;
 267       lin const *equivs = filevec[f].equivs;
 268       lin i = 0;
 269       lin j = 0;
 270       lin i_end = filevec[f].buffered_lines;
 271
 272       while (1)
 273         {
 274           lin runlength, start, corresponding;
 275
 276           /* Scan forwards to find beginning of another run of changes.
 277              Also keep track of the corresponding point in the other file.  */
 278
 279           while (i < i_end && !changed[i])
 280             {
 281               while (other_changed[j++])
 282                 continue;
 283               i++;
 284             }
 285
 286           if (i == i_end)
 287             break;
 288
 289           start = i;
 290
 291           /* Find the end of this run of changes.  */
 292
 293           while (changed[++i])
 294             continue;
 295           while (other_changed[j])
 296             j++;
 297
 298           do
 299             {
 300               /* Record the length of this run of changes, so that
 301                  we can later determine whether the run has grown.  */
 302               runlength = i - start;
 303
 304               /* Move the changed region back, so long as the
 305                  previous unchanged line matches the last changed one.
 306                  This merges with previous changed regions.  */
 307
 308               while (start && equivs[start - 1] == equivs[i - 1])
 309                 {
 310                   changed[--start] = 1;
 311                   changed[--i] = 0;
 312                   while (changed[start - 1])
 313                     start--;
 314                   while (other_changed[--j])
 315                     continue;
 316                 }
 317
 318               /* Set CORRESPONDING to the end of the changed run, at the last
 319                  point where it corresponds to a changed run in the other file.
 320                  CORRESPONDING == I_END means no such point has been found.  */
 321               corresponding = other_changed[j - 1] ? i : i_end;
 322
 323               /* Move the changed region forward, so long as the
 324                  first changed line matches the following unchanged one.
 325                  This merges with following changed regions.
 326                  Do this second, so that if there are no merges,
 327                  the changed region is moved forward as far as possible.  */
 328
 329               while (i != i_end && equivs[start] == equivs[i])
 330                 {
 331                   changed[start++] = 0;
 332                   changed[i++] = 1;
 333                   while (changed[i])
 334                     i++;
 335                   while (other_changed[++j])
 336                     corresponding = i;
 337                 }
 338             }
 339           while (runlength != i - start);
 340
 341           /* If possible, move the fully-merged run of changes
 342              back to a corresponding run in the other file.  */
 343
 344           while (corresponding < i)
 345             {
 346               changed[--start] = 1;
 347               changed[--i] = 0;
 348               while (other_changed[--j])
 349                 continue;
 350             }
 351         }
 352     }
 353 }
 354 \f
 355 /* Cons an additional entry onto the front of an edit script OLD.
 356    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
 357    DELETED is the number of lines deleted here from file 0.
 358    INSERTED is the number of lines inserted here in file 1.
 359
 360    If DELETED is 0 then LINE0 is the number of the line before
 361    which the insertion was done; vice versa for INSERTED and LINE1.  */
 362
 363 static struct change *
 364 add_change (lin line0, lin line1, lin deleted, lin inserted,
 365             struct change *old)
 366 {
 367   struct change *new = xmalloc (sizeof *new);
 368
 369   new->line0 = line0;
 370   new->line1 = line1;
 371   new->inserted = inserted;
 372   new->deleted = deleted;
 373   new->link = old;
 374   return new;
 375 }
 376
 377 /* Scan the tables of which lines are inserted and deleted,
 378    producing an edit script in reverse order.  */
 379
 380 static struct change *
 381 build_reverse_script (struct file_data const filevec[])
 382 {
 383   struct change *script = 0;
 384   char *changed0 = filevec[0].changed;
 385   char *changed1 = filevec[1].changed;
 386   lin len0 = filevec[0].buffered_lines;
 387   lin len1 = filevec[1].buffered_lines;
 388
 389   /* Note that changedN[lenN] does exist, and is 0.  */
 390
 391   lin i0 = 0, i1 = 0;
 392
 393   while (i0 < len0 || i1 < len1)
 394     {
 395       if (changed0[i0] | changed1[i1])
 396         {
 397           lin line0 = i0, line1 = i1;
 398
 399           /* Find # lines changed here in each file.  */
 400           while (changed0[i0]) ++i0;
 401           while (changed1[i1]) ++i1;
 402
 403           /* Record this change.  */
 404           script = add_change (line0, line1, i0 - line0, i1 - line1, script);
 405         }
 406
 407       /* We have reached lines in the two files that match each other.  */
 408       i0++, i1++;
 409     }
 410
 411   return script;
 412 }
 413
 414 /* Scan the tables of which lines are inserted and deleted,
 415    producing an edit script in forward order.  */
 416
 417 static struct change *
 418 build_script (struct file_data const filevec[])
 419 {
 420   struct change *script = 0;
 421   char *changed0 = filevec[0].changed;
 422   char *changed1 = filevec[1].changed;
 423   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
 424
 425   /* Note that changedN[-1] does exist, and is 0.  */
 426
 427   while (i0 >= 0 || i1 >= 0)
 428     {
 429       if (changed0[i0 - 1] | changed1[i1 - 1])
 430         {
 431           lin line0 = i0, line1 = i1;
 432
 433           /* Find # lines changed here in each file.  */
 434           while (changed0[i0 - 1]) --i0;
 435           while (changed1[i1 - 1]) --i1;
 436
 437           /* Record this change.  */
 438           script = add_change (i0, i1, line0 - i0, line1 - i1, script);
 439         }
 440
 441       /* We have reached lines in the two files that match each other.  */
 442       i0--, i1--;
 443     }
 444
 445   return script;
 446 }
 447 \f
 448 /* If CHANGES, briefly report that two files differed.  */
 449 static void
 450 briefly_report (int changes, struct file_data const filevec[])
 451 {
 452   if (changes)
 453     message ((brief
 454               ? _("Files %s and %s differ\n")
 455               : _("Binary files %s and %s differ\n")),
 456              file_label[0] ? file_label[0] : filevec[0].name,
 457              file_label[1] ? file_label[1] : filevec[1].name);
 458 }
 459
 460 /* Report the differences of two files.  */
 461 int
 462 diff_2_files (struct comparison *cmp)
 463 {
 464   int f;
 465   struct change *e, *p;
 466   struct change *script;
 467   int changes;
 468
 469
 470   /* If we have detected that either file is binary,
 471      compare the two files as binary.  This can happen
 472      only when the first chunk is read.
 473      Also, --brief without any --ignore-* options means
 474      we can speed things up by treating the files as binary.  */
 475
 476   if (read_files (cmp->file, files_can_be_treated_as_binary))
 477     {
 478       /* Files with different lengths must be different.  */
 479       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
 480           && 0 < cmp->file[0].stat.st_size
 481           && 0 < cmp->file[1].stat.st_size
 482           && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
 483           && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
 484         changes = 1;
 485
 486       /* Standard input equals itself.  */
 487       else if (cmp->file[0].desc == cmp->file[1].desc)
 488         changes = 0;
 489
 490       else
 491         /* Scan both files, a buffer at a time, looking for a difference.  */
 492         {
 493           /* Allocate same-sized buffers for both files.  */
 494           size_t lcm_max = PTRDIFF_MAX - 1;
 495           size_t buffer_size =
 496             buffer_lcm (sizeof (word),
 497                         buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
 498                                     STAT_BLOCKSIZE (cmp->file[1].stat),
 499                                     lcm_max),
 500                         lcm_max);
 501           for (f = 0; f < 2; f++)
 502             cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
 503
 504           for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
 505             {
 506               /* Read a buffer's worth from both files.  */
 507               for (f = 0; f < 2; f++)
 508                 if (0 <= cmp->file[f].desc)
 509                   file_block_read (&cmp->file[f],
 510                                    buffer_size - cmp->file[f].buffered);
 511
 512               /* If the buffers differ, the files differ.  */
 513               if (cmp->file[0].buffered != cmp->file[1].buffered
 514                   || memcmp (cmp->file[0].buffer,
 515                              cmp->file[1].buffer,
 516                              cmp->file[0].buffered))
 517                 {
 518                   changes = 1;
 519                   break;
 520                 }
 521
 522               /* If we reach end of file, the files are the same.  */
 523               if (cmp->file[0].buffered != buffer_size)
 524                 {
 525                   changes = 0;
 526                   break;
 527                 }
 528             }
 529         }
 530
 531       briefly_report (changes, cmp->file);
 532     }
 533   else
 534     {
 535       struct context ctxt;
 536       lin diags;
 537       lin too_expensive;
 538
 539       /* Allocate vectors for the results of comparison:
 540          a flag for each line of each file, saying whether that line
 541          is an insertion or deletion.
 542          Allocate an extra element, always 0, at each end of each vector.  */
 543
 544       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
 545       char *flag_space = zalloc (s);
 546       cmp->file[0].changed = flag_space + 1;
 547       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
 548
 549       /* Some lines are obviously insertions or deletions
 550          because they don't match anything.  Detect them now, and
 551          avoid even thinking about them in the main comparison algorithm.  */
 552
 553       discard_confusing_lines (cmp->file);
 554
 555       /* Now do the main comparison algorithm, considering just the
 556          undiscarded lines.  */
 557
 558       ctxt.xvec = cmp->file[0].undiscarded;
 559       ctxt.yvec = cmp->file[1].undiscarded;
 560       diags = (cmp->file[0].nondiscarded_lines
 561                + cmp->file[1].nondiscarded_lines + 3);
 562       ctxt.fdiag = xmalloc (diags * (2 * sizeof *ctxt.fdiag));
 563       ctxt.bdiag = ctxt.fdiag + diags;
 564       ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1;
 565       ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1;
 566
 567       ctxt.heuristic = speed_large_files;
 568
 569       /* Set TOO_EXPENSIVE to be the approximate square root of the
 570          input size, bounded below by 4096.  4096 seems to be good for
 571          circa-2016 CPUs; see Bug#16848 and Bug#24715.  */
 572       too_expensive = 1;
 573       for (;  diags != 0;  diags >>= 2)
 574         too_expensive <<= 1;
 575       ctxt.too_expensive = MAX (4096, too_expensive);
 576
 577       files[0] = cmp->file[0];
 578       files[1] = cmp->file[1];
 579
 580       compareseq (0, cmp->file[0].nondiscarded_lines,
 581                   0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
 582
 583       free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
 584
 585       /* Modify the results slightly to make them prettier
 586          in cases where that can validly be done.  */
 587
 588       shift_boundaries (cmp->file);
 589
 590       /* Get the results of comparison in the form of a chain
 591          of 'struct change's -- an edit script.  */
 592
 593       if (output_style == OUTPUT_ED)
 594         script = build_reverse_script (cmp->file);
 595       else
 596         script = build_script (cmp->file);
 597
 598       /* Set CHANGES if we had any diffs.
 599          If some changes are ignored, we must scan the script to decide.  */
 600       if (ignore_blank_lines || ignore_regexp.fastmap)
 601         {
 602           struct change *next = script;
 603           changes = 0;
 604
 605           while (next && changes == 0)
 606             {
 607               struct change *this, *end;
 608               lin first0, last0, first1, last1;
 609
 610               /* Find a set of changes that belong together.  */
 611               this = next;
 612               end = find_change (next);
 613
 614               /* Disconnect them from the rest of the changes, making them
 615                  a hunk, and remember the rest for next iteration.  */
 616               next = end->link;
 617               end->link = 0;
 618
 619               /* Determine whether this hunk is really a difference.  */
 620               if (analyze_hunk (this, &first0, &last0, &first1, &last1))
 621                 changes = 1;
 622
 623               /* Reconnect the script so it will all be freed properly.  */
 624               end->link = next;
 625             }
 626         }
 627       else
 628         changes = (script != 0);
 629
 630       if (brief)
 631         briefly_report (changes, cmp->file);
 632       else
 633         {
 634           if (changes || !no_diff_means_no_output)
 635             {
 636               /* Record info for starting up output,
 637                  to be used if and when we have some output to print.  */
 638               setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
 639                             file_label[1] ? file_label[1] : cmp->file[1].name,
 640                             cmp->parent != 0);
 641
 642               switch (output_style)
 643                 {
 644                 case OUTPUT_CONTEXT:
 645                   print_context_script (script, false);
 646                   break;
 647
 648                 case OUTPUT_UNIFIED:
 649                   print_context_script (script, true);
 650                   break;
 651
 652                 case OUTPUT_ED:
 653                   print_ed_script (script);
 654                   break;
 655
 656                 case OUTPUT_FORWARD_ED:
 657                   pr_forward_ed_script (script);
 658                   break;
 659
 660                 case OUTPUT_RCS:
 661                   print_rcs_script (script);
 662                   break;
 663
 664                 case OUTPUT_NORMAL:
 665                   print_normal_script (script);
 666                   break;
 667
 668                 case OUTPUT_IFDEF:
 669                   print_ifdef_script (script);
 670                   break;
 671
 672                 case OUTPUT_SDIFF:
 673                   print_sdiff_script (script);
 674                   break;
 675
 676                 default:
 677                   abort ();
 678                 }
 679
 680               finish_output ();
 681             }
 682         }
 683
 684       free (cmp->file[0].undiscarded);
 685
 686       free (flag_space);
 687
 688       for (f = 0; f < 2; f++)
 689         {
 690           free (cmp->file[f].equivs);
 691           free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
 692         }
 693
 694       for (e = script; e; e = p)
 695         {
 696           p = e->link;
 697           free (e);
 698         }
 699
 700       if (! ROBUST_OUTPUT_STYLE (output_style))
 701         for (f = 0; f < 2; ++f)
 702           if (cmp->file[f].missing_newline)
 703             {
 704               error (0, 0, "%s: %s\n",
 705                      file_label[f] ? file_label[f] : cmp->file[f].name,
 706                      _("No newline at end of file"));
 707               changes = 2;
 708             }
 709     }
 710
 711   if (cmp->file[0].buffer != cmp->file[1].buffer)
 712     free (cmp->file[0].buffer);
 713   free (cmp->file[1].buffer);
 714
 715   return changes;
 716 }