src/io.c

   1 /* File I/O for GNU DIFF.
   2
   3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
   4    2015-2018 Free Software Foundation, Inc.
   5
   6    This file is part of GNU DIFF.
   7
   8    This program is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  20
  21 #include "diff.h"
  22 #include <binary-io.h>
  23 #include <cmpbuf.h>
  24 #include <file-type.h>
  25 #include <xalloc.h>
  26
  27 /* Rotate an unsigned value to the left.  */
  28 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
  29
  30 /* Given a hash value and a new character, return a new hash value.  */
  31 #define HASH(h, c) ((c) + ROL (h, 7))
  32 \f
  33 /* The type of a hash value.  */
  34 typedef size_t hash_value;
  35 verify (! TYPE_SIGNED (hash_value));
  36
  37 /* Lines are put into equivalence classes of lines that match in lines_differ.
  38    Each equivalence class is represented by one of these structures,
  39    but only while the classes are being computed.
  40    Afterward, each class is represented by a number.  */
  41 struct equivclass
  42 {
  43   lin next;             /* Next item in this bucket.  */
  44   hash_value hash;      /* Hash of lines in this class.  */
  45   char const *line;     /* A line that fits this class.  */
  46   size_t length;        /* That line's length, not counting its newline.  */
  47 };
  48
  49 /* Hash-table: array of buckets, each being a chain of equivalence classes.
  50    buckets[-1] is reserved for incomplete lines.  */
  51 static lin *buckets;
  52
  53 /* Number of buckets in the hash table array, not counting buckets[-1].  */
  54 static size_t nbuckets;
  55
  56 /* Array in which the equivalence classes are allocated.
  57    The bucket-chains go through the elements in this array.
  58    The number of an equivalence class is its index in this array.  */
  59 static struct equivclass *equivs;
  60
  61 /* Index of first free element in the array 'equivs'.  */
  62 static lin equivs_index;
  63
  64 /* Number of elements allocated in the array 'equivs'.  */
  65 static lin equivs_alloc;
  66 \f
  67 /* Read a block of data into a file buffer, checking for EOF and error.  */
  68
  69 void
  70 file_block_read (struct file_data *current, size_t size)
  71 {
  72   if (size && ! current->eof)
  73     {
  74       size_t s = block_read (current->desc,
  75                              FILE_BUFFER (current) + current->buffered, size);
  76       if (s == SIZE_MAX)
  77         pfatal_with_name (current->name);
  78       current->buffered += s;
  79       current->eof = s < size;
  80     }
  81 }
  82 \f
  83 /* Check for binary files and compare them for exact identity.  */
  84
  85 /* Return 1 if BUF contains a non text character.
  86    SIZE is the number of characters in BUF.  */
  87
  88 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
  89
  90 /* Get ready to read the current file.
  91    Return nonzero if SKIP_TEST is zero,
  92    and if it appears to be a binary file.  */
  93
  94 static bool
  95 sip (struct file_data *current, bool skip_test)
  96 {
  97   /* If we have a nonexistent file at this stage, treat it as empty.  */
  98   if (current->desc < 0)
  99     {
 100       /* Leave room for a sentinel.  */
 101       current->bufsize = sizeof (word);
 102       current->buffer = xmalloc (current->bufsize);
 103     }
 104   else
 105     {
 106       current->bufsize = buffer_lcm (sizeof (word),
 107                                      STAT_BLOCKSIZE (current->stat),
 108                                      PTRDIFF_MAX - 2 * sizeof (word));
 109       current->buffer = xmalloc (current->bufsize);
 110
 111 #ifdef __KLIBC__
 112       /* Skip test if seek is not possible */
 113       skip_test = skip_test
 114                   || (lseek (current->desc, 0, SEEK_CUR) < 0
 115                       && errno == ESPIPE);
 116 #endif
 117
 118       if (! skip_test)
 119         {
 120           /* Check first part of file to see if it's a binary file.  */
 121
 122           int prev_mode = set_binary_mode (current->desc, O_BINARY);
 123           off_t buffered;
 124           file_block_read (current, current->bufsize);
 125           buffered = current->buffered;
 126
 127           if (prev_mode != O_BINARY)
 128             {
 129               /* Revert to text mode and seek back to the start to reread
 130                  the file.  Use relative seek, since file descriptors
 131                  like stdin might not start at offset zero.  */
 132               if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
 133                 pfatal_with_name (current->name);
 134               set_binary_mode (current->desc, prev_mode);
 135               current->buffered = 0;
 136               current->eof = false;
 137             }
 138
 139           return binary_file_p (current->buffer, buffered);
 140         }
 141     }
 142
 143   current->buffered = 0;
 144   current->eof = false;
 145   return false;
 146 }
 147
 148 /* Slurp the rest of the current file completely into memory.  */
 149
 150 static void
 151 slurp (struct file_data *current)
 152 {
 153   size_t cc;
 154
 155   if (current->desc < 0)
 156     {
 157       /* The file is nonexistent.  */
 158       return;
 159     }
 160
 161   if (S_ISREG (current->stat.st_mode))
 162     {
 163       /* It's a regular file; slurp in the rest all at once.  */
 164
 165       /* Get the size out of the stat block.
 166          Allocate just enough room for appended newline plus word sentinel,
 167          plus word-alignment since we want the buffer word-aligned.  */
 168       size_t file_size = current->stat.st_size;
 169       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
 170       if (file_size != current->stat.st_size || cc < file_size
 171           || PTRDIFF_MAX <= cc)
 172         xalloc_die ();
 173
 174       if (current->bufsize < cc)
 175         {
 176           current->bufsize = cc;
 177           current->buffer = xrealloc (current->buffer, cc);
 178         }
 179
 180       /* Try to read at least 1 more byte than the size indicates, to
 181          detect whether the file is growing.  This is a nicety for
 182          users who run 'diff' on files while they are changing.  */
 183
 184       if (current->buffered <= file_size)
 185         {
 186           file_block_read (current, file_size + 1 - current->buffered);
 187           if (current->buffered <= file_size)
 188             return;
 189         }
 190     }
 191
 192   /* It's not a regular file, or it's a growing regular file; read it,
 193      growing the buffer as needed.  */
 194
 195   file_block_read (current, current->bufsize - current->buffered);
 196
 197   if (current->buffered)
 198     {
 199       while (current->buffered == current->bufsize)
 200         {
 201           if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
 202             xalloc_die ();
 203           current->bufsize *= 2;
 204           current->buffer = xrealloc (current->buffer, current->bufsize);
 205           file_block_read (current, current->bufsize - current->buffered);
 206         }
 207
 208       /* Allocate just enough room for appended newline plus word
 209          sentinel, plus word-alignment.  */
 210       cc = current->buffered + 2 * sizeof (word);
 211       current->bufsize = cc - cc % sizeof (word);
 212       current->buffer = xrealloc (current->buffer, current->bufsize);
 213     }
 214 }
 215 \f
 216 /* Split the file into lines, simultaneously computing the equivalence
 217    class for each line.  */
 218
 219 static void
 220 find_and_hash_each_line (struct file_data *current)
 221 {
 222   char const *p = current->prefix_end;
 223   lin i, *bucket;
 224   size_t length;
 225
 226   /* Cache often-used quantities in local variables to help the compiler.  */
 227   char const **linbuf = current->linbuf;
 228   lin alloc_lines = current->alloc_lines;
 229   lin line = 0;
 230   lin linbuf_base = current->linbuf_base;
 231   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
 232   struct equivclass *eqs = equivs;
 233   lin eqs_index = equivs_index;
 234   lin eqs_alloc = equivs_alloc;
 235   char const *suffix_begin = current->suffix_begin;
 236   char const *bufend = FILE_BUFFER (current) + current->buffered;
 237   bool ig_case = ignore_case;
 238   enum DIFF_white_space ig_white_space = ignore_white_space;
 239   bool diff_length_compare_anyway =
 240     ig_white_space != IGNORE_NO_WHITE_SPACE;
 241   bool same_length_diff_contents_compare_anyway =
 242     diff_length_compare_anyway | ig_case;
 243
 244   while (p < suffix_begin)
 245     {
 246       char const *ip = p;
 247       hash_value h = 0;
 248       unsigned char c;
 249
 250       /* Hash this line until we find a newline.  */
 251       switch (ig_white_space)
 252         {
 253         case IGNORE_ALL_SPACE:
 254           while ((c = *p++) != '\n')
 255             if (! isspace (c))
 256               h = HASH (h, ig_case ? tolower (c) : c);
 257           break;
 258
 259         case IGNORE_SPACE_CHANGE:
 260           while ((c = *p++) != '\n')
 261             {
 262               if (isspace (c))
 263                 {
 264                   do
 265                     if ((c = *p++) == '\n')
 266                       goto hashing_done;
 267                   while (isspace (c));
 268
 269                   h = HASH (h, ' ');
 270                 }
 271
 272               /* C is now the first non-space.  */
 273               h = HASH (h, ig_case ? tolower (c) : c);
 274             }
 275           break;
 276
 277         case IGNORE_TAB_EXPANSION:
 278         case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
 279         case IGNORE_TRAILING_SPACE:
 280           {
 281             size_t column = 0;
 282             while ((c = *p++) != '\n')
 283               {
 284                 if (ig_white_space & IGNORE_TRAILING_SPACE
 285                     && isspace (c))
 286                   {
 287                     char const *p1 = p;
 288                     unsigned char c1;
 289                     do
 290                       if ((c1 = *p1++) == '\n')
 291                         {
 292                           p = p1;
 293                           goto hashing_done;
 294                         }
 295                     while (isspace (c1));
 296                   }
 297
 298                 size_t repetitions = 1;
 299
 300                 if (ig_white_space & IGNORE_TAB_EXPANSION)
 301                   switch (c)
 302                     {
 303                     case '\b':
 304                       column -= 0 < column;
 305                       break;
 306
 307                     case '\t':
 308                       c = ' ';
 309                       repetitions = tabsize - column % tabsize;
 310                       column = (column + repetitions < column
 311                                 ? 0
 312                                 : column + repetitions);
 313                       break;
 314
 315                     case '\r':
 316                       column = 0;
 317                       break;
 318
 319                     default:
 320                       column++;
 321                       break;
 322                     }
 323
 324                 if (ig_case)
 325                   c = tolower (c);
 326
 327                 do
 328                   h = HASH (h, c);
 329                 while (--repetitions != 0);
 330               }
 331           }
 332           break;
 333
 334         default:
 335           if (ig_case)
 336             while ((c = *p++) != '\n')
 337               h = HASH (h, tolower (c));
 338           else
 339             while ((c = *p++) != '\n')
 340               h = HASH (h, c);
 341           break;
 342         }
 343
 344    hashing_done:;
 345
 346       bucket = &buckets[h % nbuckets];
 347       length = p - ip - 1;
 348
 349       if (p == bufend
 350           && current->missing_newline
 351           && ROBUST_OUTPUT_STYLE (output_style))
 352         {
 353           /* The last line is incomplete and we do not silently
 354              complete lines.  If the line cannot compare equal to any
 355              complete line, put it into buckets[-1] so that it can
 356              compare equal only to the other file's incomplete line
 357              (if one exists).  */
 358           if (ig_white_space < IGNORE_TRAILING_SPACE)
 359             bucket = &buckets[-1];
 360         }
 361
 362       for (i = *bucket;  ;  i = eqs[i].next)
 363         if (!i)
 364           {
 365             /* Create a new equivalence class in this bucket.  */
 366             i = eqs_index++;
 367             if (i == eqs_alloc)
 368               {
 369                 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
 370                   xalloc_die ();
 371                 eqs_alloc *= 2;
 372                 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
 373               }
 374             eqs[i].next = *bucket;
 375             eqs[i].hash = h;
 376             eqs[i].line = ip;
 377             eqs[i].length = length;
 378             *bucket = i;
 379             break;
 380           }
 381         else if (eqs[i].hash == h)
 382           {
 383             char const *eqline = eqs[i].line;
 384
 385             /* Reuse existing class if lines_differ reports the lines
 386                equal.  */
 387             if (eqs[i].length == length)
 388               {
 389                 /* Reuse existing equivalence class if the lines are identical.
 390                    This detects the common case of exact identity
 391                    faster than lines_differ would.  */
 392                 if (memcmp (eqline, ip, length) == 0)
 393                   break;
 394                 if (!same_length_diff_contents_compare_anyway)
 395                   continue;
 396               }
 397             else if (!diff_length_compare_anyway)
 398               continue;
 399
 400             if (! lines_differ (eqline, ip))
 401               break;
 402           }
 403
 404       /* Maybe increase the size of the line table.  */
 405       if (line == alloc_lines)
 406         {
 407           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 408           if (PTRDIFF_MAX / 3 <= alloc_lines
 409               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 410               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 411             xalloc_die ();
 412           alloc_lines = 2 * alloc_lines - linbuf_base;
 413           cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
 414           linbuf += linbuf_base;
 415           linbuf = xrealloc (linbuf,
 416                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 417           linbuf -= linbuf_base;
 418         }
 419       linbuf[line] = ip;
 420       cureqs[line] = i;
 421       ++line;
 422     }
 423
 424   current->buffered_lines = line;
 425
 426   for (i = 0;  ;  i++)
 427     {
 428       /* Record the line start for lines in the suffix that we care about.
 429          Record one more line start than lines,
 430          so that we can compute the length of any buffered line.  */
 431       if (line == alloc_lines)
 432         {
 433           /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
 434           if (PTRDIFF_MAX / 3 <= alloc_lines
 435               || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
 436               || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
 437             xalloc_die ();
 438           alloc_lines = 2 * alloc_lines - linbuf_base;
 439           linbuf += linbuf_base;
 440           linbuf = xrealloc (linbuf,
 441                              (alloc_lines - linbuf_base) * sizeof *linbuf);
 442           linbuf -= linbuf_base;
 443         }
 444       linbuf[line] = p;
 445
 446       if (p == bufend)
 447         {
 448           /* If the last line is incomplete and we do not silently
 449              complete lines, don't count its appended newline.  */
 450           if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style))
 451             linbuf[line]--;
 452           break;
 453         }
 454
 455       if (context <= i && no_diff_means_no_output)
 456         break;
 457
 458       line++;
 459
 460       while (*p++ != '\n')
 461         continue;
 462     }
 463
 464   /* Done with cache in local variables.  */
 465   current->linbuf = linbuf;
 466   current->valid_lines = line;
 467   current->alloc_lines = alloc_lines;
 468   current->equivs = cureqs;
 469   equivs = eqs;
 470   equivs_alloc = eqs_alloc;
 471   equivs_index = eqs_index;
 472 }
 473 \f
 474 /* Prepare the text.  Make sure the text end is initialized.
 475    Make sure text ends in a newline,
 476    but remember that we had to add one.
 477    Strip trailing CRs, if that was requested.  */
 478
 479 static void
 480 prepare_text (struct file_data *current)
 481 {
 482   size_t buffered = current->buffered;
 483   char *p = FILE_BUFFER (current);
 484   if (!p)
 485     return;
 486
 487   if (strip_trailing_cr)
 488     {
 489       char *srclim = p + buffered;
 490       *srclim = '\r';
 491       char *dst = rawmemchr (p, '\r');
 492
 493       for (char const *src = dst; src != srclim; src++)
 494         {
 495           src += *src == '\r' && src[1] == '\n';
 496           *dst++ = *src;
 497         }
 498
 499       buffered -= srclim - dst;
 500     }
 501
 502   if (buffered != 0 && p[buffered - 1] != '\n')
 503     {
 504       p[buffered++] = '\n';
 505       current->missing_newline = true;
 506     }
 507
 508   /* Don't use uninitialized storage when planting or using sentinels.  */
 509   memset (p + buffered, 0, sizeof (word));
 510
 511   current->buffered = buffered;
 512 }
 513 \f
 514 /* We have found N lines in a buffer of size S; guess the
 515    proportionate number of lines that will be found in a buffer of
 516    size T.  However, do not guess a number of lines so large that the
 517    resulting line table might cause overflow in size calculations.  */
 518 static lin
 519 guess_lines (lin n, size_t s, size_t t)
 520 {
 521   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
 522   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
 523   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
 524 }
 525
 526 /* Given a vector of two file_data objects, find the identical
 527    prefixes and suffixes of each object.  */
 528
 529 static void
 530 find_identical_ends (struct file_data filevec[])
 531 {
 532   word *w0, *w1;
 533   char *p0, *p1, *buffer0, *buffer1;
 534   char const *end0, *beg0;
 535   char const **linbuf0, **linbuf1;
 536   lin i, lines;
 537   size_t n0, n1;
 538   lin alloc_lines0, alloc_lines1;
 539   bool prefix_needed;
 540   lin buffered_prefix, prefix_count, prefix_mask;
 541   lin middle_guess, suffix_guess;
 542
 543   slurp (&filevec[0]);
 544   prepare_text (&filevec[0]);
 545   if (filevec[0].desc != filevec[1].desc)
 546     {
 547       slurp (&filevec[1]);
 548       prepare_text (&filevec[1]);
 549     }
 550   else
 551     {
 552       filevec[1].buffer = filevec[0].buffer;
 553       filevec[1].bufsize = filevec[0].bufsize;
 554       filevec[1].buffered = filevec[0].buffered;
 555       filevec[1].missing_newline = filevec[0].missing_newline;
 556     }
 557
 558   /* Find identical prefix.  */
 559
 560   w0 = filevec[0].buffer;
 561   w1 = filevec[1].buffer;
 562   p0 = buffer0 = (char *) w0;
 563   p1 = buffer1 = (char *) w1;
 564   n0 = filevec[0].buffered;
 565   n1 = filevec[1].buffered;
 566
 567   if (p0 == p1)
 568     /* The buffers are the same; sentinels won't work.  */
 569     p0 = p1 += n1;
 570   else
 571     {
 572       /* Insert end sentinels, in this case characters that are guaranteed
 573          to make the equality test false, and thus terminate the loop.  */
 574
 575       if (n0 < n1)
 576         p0[n0] = ~p1[n0];
 577       else
 578         p1[n1] = ~p0[n1];
 579
 580       /* Loop until first mismatch, or to the sentinel characters.  */
 581
 582       /* Compare a word at a time for speed.  */
 583       while (*w0 == *w1)
 584         w0++, w1++;
 585
 586       /* Do the last few bytes of comparison a byte at a time.  */
 587       p0 = (char *) w0;
 588       p1 = (char *) w1;
 589       while (*p0 == *p1)
 590         p0++, p1++;
 591
 592       /* Don't mistakenly count missing newline as part of prefix.  */
 593       if (ROBUST_OUTPUT_STYLE (output_style)
 594           && ((buffer0 + n0 - filevec[0].missing_newline < p0)
 595               !=
 596               (buffer1 + n1 - filevec[1].missing_newline < p1)))
 597         p0--, p1--;
 598     }
 599
 600   /* Now P0 and P1 point at the first nonmatching characters.  */
 601
 602   /* Skip back to last line-beginning in the prefix,
 603      and then discard up to HORIZON_LINES lines from the prefix.  */
 604   i = horizon_lines;
 605   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
 606     p0--, p1--;
 607
 608   /* Record the prefix.  */
 609   filevec[0].prefix_end = p0;
 610   filevec[1].prefix_end = p1;
 611
 612   /* Find identical suffix.  */
 613
 614   /* P0 and P1 point beyond the last chars not yet compared.  */
 615   p0 = buffer0 + n0;
 616   p1 = buffer1 + n1;
 617
 618   if (! ROBUST_OUTPUT_STYLE (output_style)
 619       || filevec[0].missing_newline == filevec[1].missing_newline)
 620     {
 621       end0 = p0;        /* Addr of last char in file 0.  */
 622
 623       /* Get value of P0 at which we should stop scanning backward:
 624          this is when either P0 or P1 points just past the last char
 625          of the identical prefix.  */
 626       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
 627
 628       /* Scan back until chars don't match or we reach that point.  */
 629       while (p0 != beg0)
 630         if (*--p0 != *--p1)
 631           {
 632             /* Point at the first char of the matching suffix.  */
 633             ++p0, ++p1;
 634             beg0 = p0;
 635             break;
 636           }
 637
 638       /* Are we at a line-beginning in both files?  If not, add the rest of
 639          this line to the main body.  Discard up to HORIZON_LINES lines from
 640          the identical suffix.  Also, discard one extra line,
 641          because shift_boundaries may need it.  */
 642       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
 643                             &&
 644                             (buffer1 == p1 || p1[-1] == '\n'));
 645       while (i-- && p0 != end0)
 646         while (*p0++ != '\n')
 647           continue;
 648
 649       p1 += p0 - beg0;
 650     }
 651
 652   /* Record the suffix.  */
 653   filevec[0].suffix_begin = p0;
 654   filevec[1].suffix_begin = p1;
 655
 656   /* Calculate number of lines of prefix to save.
 657
 658      prefix_count == 0 means save the whole prefix;
 659      we need this for options like -D that output the whole file,
 660      or for enormous contexts (to avoid worrying about arithmetic overflow).
 661      We also need it for options like -F that output some preceding line;
 662      at least we will need to find the last few lines,
 663      but since we don't know how many, it's easiest to find them all.
 664
 665      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
 666      of the line buffer; they'll be moved to the proper location later.
 667      Handle 1 more line than the context says (because we count 1 too many),
 668      rounded up to the next power of 2 to speed index computation.  */
 669
 670   if (no_diff_means_no_output && ! function_regexp.fastmap
 671       && context < LIN_MAX / 4 && context < n0)
 672     {
 673       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
 674       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
 675       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
 676         continue;
 677       alloc_lines0 = (prefix_count + middle_guess
 678                       + MIN (context, suffix_guess));
 679     }
 680   else
 681     {
 682       prefix_count = 0;
 683       alloc_lines0 = guess_lines (0, 0, n0);
 684     }
 685
 686   prefix_mask = prefix_count - 1;
 687   lines = 0;
 688   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
 689   prefix_needed = ! (no_diff_means_no_output
 690                      && filevec[0].prefix_end == p0
 691                      && filevec[1].prefix_end == p1);
 692   p0 = buffer0;
 693
 694   /* If the prefix is needed, find the prefix lines.  */
 695   if (prefix_needed)
 696     {
 697       end0 = filevec[0].prefix_end;
 698       while (p0 != end0)
 699         {
 700           lin l = lines++ & prefix_mask;
 701           if (l == alloc_lines0)
 702             {
 703               if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
 704                 xalloc_die ();
 705               alloc_lines0 *= 2;
 706               linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
 707             }
 708           linbuf0[l] = p0;
 709           while (*p0++ != '\n')
 710             continue;
 711         }
 712     }
 713   buffered_prefix = prefix_count && context < lines ? context : lines;
 714
 715   /* Allocate line buffer 1.  */
 716
 717   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
 718   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
 719   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
 720   if (alloc_lines1 < buffered_prefix
 721       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
 722     xalloc_die ();
 723   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
 724
 725   if (buffered_prefix != lines)
 726     {
 727       /* Rotate prefix lines to proper location.  */
 728       for (i = 0;  i < buffered_prefix;  i++)
 729         linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
 730       for (i = 0;  i < buffered_prefix;  i++)
 731         linbuf0[i] = linbuf1[i];
 732     }
 733
 734   /* Initialize line buffer 1 from line buffer 0.  */
 735   for (i = 0; i < buffered_prefix; i++)
 736     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
 737
 738   /* Record the line buffer, adjusted so that
 739      linbuf[0] points at the first differing line.  */
 740   filevec[0].linbuf = linbuf0 + buffered_prefix;
 741   filevec[1].linbuf = linbuf1 + buffered_prefix;
 742   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
 743   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
 744   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
 745   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
 746 }
 747 \f
 748 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
 749    than 2**k.  This table is derived from Chris K. Caldwell's list
 750    <http://www.utm.edu/research/primes/lists/2small/>.  */
 751
 752 static unsigned char const prime_offset[] =
 753 {
 754   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
 755   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
 756   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
 757   55, 93, 1, 57, 25
 758 };
 759
 760 /* Verify that this host's size_t is not too wide for the above table.  */
 761
 762 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
 763
 764 /* Given a vector of two file_data objects, read the file associated
 765    with each one, and build the table of equivalence classes.
 766    Return nonzero if either file appears to be a binary file.
 767    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
 768
 769 bool
 770 read_files (struct file_data filevec[], bool pretend_binary)
 771 {
 772   int i;
 773   bool skip_test = text | pretend_binary;
 774   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
 775
 776   if (filevec[0].desc != filevec[1].desc)
 777     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
 778   else
 779     {
 780       filevec[1].buffer = filevec[0].buffer;
 781       filevec[1].bufsize = filevec[0].bufsize;
 782       filevec[1].buffered = filevec[0].buffered;
 783     }
 784   if (appears_binary)
 785     {
 786       set_binary_mode (filevec[0].desc, O_BINARY);
 787       set_binary_mode (filevec[1].desc, O_BINARY);
 788       return true;
 789     }
 790
 791   find_identical_ends (filevec);
 792
 793   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
 794   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
 795     xalloc_die ();
 796   equivs = xmalloc (equivs_alloc * sizeof *equivs);
 797   /* Equivalence class 0 is permanently safe for lines that were not
 798      hashed.  Real equivalence classes start at 1.  */
 799   equivs_index = 1;
 800
 801   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
 802      number between 1/3 and 2/3 of the value of equiv_allocs,
 803      approximately.  */
 804   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
 805     continue;
 806   nbuckets = ((size_t) 1 << i) - prime_offset[i];
 807   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
 808     xalloc_die ();
 809   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
 810   buckets++;
 811
 812   for (i = 0; i < 2; i++)
 813     find_and_hash_each_line (&filevec[i]);
 814
 815   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
 816
 817   free (equivs);
 818   free (buckets - 1);
 819
 820   return false;
 821 }