1  /* File I/O for GNU DIFF.
       2  
       3     Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
       4     2015-2023 Free Software Foundation, Inc.
       5  
       6     This file is part of GNU DIFF.
       7  
       8     This program is free software: you can redistribute it and/or modify
       9     it under the terms of the GNU General Public License as published by
      10     the Free Software Foundation, either version 3 of the License, or
      11     (at your option) any later version.
      12  
      13     This program is distributed in the hope that it will be useful,
      14     but WITHOUT ANY WARRANTY; without even the implied warranty of
      15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      16     GNU General Public License for more details.
      17  
      18     You should have received a copy of the GNU General Public License
      19     along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
      20  
      21  #include "diff.h"
      22  #include <binary-io.h>
      23  #include <cmpbuf.h>
      24  #include <file-type.h>
      25  #include <xalloc.h>
      26  
      27  /* Rotate an unsigned value to the left.  */
      28  #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
      29  
      30  /* Given a hash value and a new character, return a new hash value.  */
      31  #define HASH(h, c) ((c) + ROL (h, 7))
      32  
      33  /* The type of a hash value.  */
      34  typedef size_t hash_value;
      35  verify (! TYPE_SIGNED (hash_value));
      36  
      37  /* Lines are put into equivalence classes of lines that match in lines_differ.
      38     Each equivalence class is represented by one of these structures,
      39     but only while the classes are being computed.
      40     Afterward, each class is represented by a number.  */
      41  struct equivclass
      42  {
      43    lin next;		/* Next item in this bucket.  */
      44    hash_value hash;	/* Hash of lines in this class.  */
      45    char const *line;	/* A line that fits this class.  */
      46    size_t length;	/* That line's length, not counting its newline.  */
      47  };
      48  
      49  /* Hash-table: array of buckets, each being a chain of equivalence classes.
      50     buckets[-1] is reserved for incomplete lines.  */
      51  static lin *buckets;
      52  
      53  /* Number of buckets in the hash table array, not counting buckets[-1].  */
      54  static size_t nbuckets;
      55  
      56  /* Array in which the equivalence classes are allocated.
      57     The bucket-chains go through the elements in this array.
      58     The number of an equivalence class is its index in this array.  */
      59  static struct equivclass *equivs;
      60  
      61  /* Index of first free element in the array 'equivs'.  */
      62  static lin equivs_index;
      63  
      64  /* Number of elements allocated in the array 'equivs'.  */
      65  static lin equivs_alloc;
      66  
      67  /* The file buffer, considered as an array of bytes rather than
      68     as an array of words.  */
      69  
      70  static char *
      71  file_buffer (struct file_data const *f)
      72  {
      73    return (char *) f->buffer;
      74  }
      75  
      76  /* Read a block of data into a file buffer, checking for EOF and error.  */
      77  
      78  void
      79  file_block_read (struct file_data *current, size_t size)
      80  {
      81    if (size && ! current->eof)
      82      {
      83        size_t s = block_read (current->desc,
      84                               file_buffer (current) + current->buffered, size);
      85        if (s == SIZE_MAX)
      86          pfatal_with_name (current->name);
      87        current->buffered += s;
      88        current->eof = s < size;
      89      }
      90  }
      91  
      92  /* Check for binary files and compare them for exact identity.  */
      93  
      94  /* Return 1 if BUF contains a non text character.
      95     SIZE is the number of characters in BUF.  */
      96  
      97  #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
      98  
      99  /* Get ready to read the current file.
     100     Return nonzero if SKIP_TEST is zero,
     101     and if it appears to be a binary file.  */
     102  
     103  static bool
     104  sip (struct file_data *current, bool skip_test)
     105  {
     106    /* If we have a nonexistent file at this stage, treat it as empty.  */
     107    if (current->desc < 0)
     108      {
     109        /* Leave room for a sentinel.  */
     110        current->bufsize = sizeof (word);
     111        current->buffer = xmalloc (current->bufsize);
     112      }
     113    else
     114      {
     115        current->bufsize = buffer_lcm (sizeof (word),
     116                                       STAT_BLOCKSIZE (current->stat),
     117                                       PTRDIFF_MAX - 2 * sizeof (word));
     118        current->buffer = xmalloc (current->bufsize);
     119  
     120  #ifdef __KLIBC__
     121        /* Skip test if seek is not possible */
     122        skip_test = skip_test
     123                    || (lseek (current->desc, 0, SEEK_CUR) < 0
     124                        && errno == ESPIPE);
     125  #endif
     126  
     127        if (! skip_test)
     128          {
     129            /* Check first part of file to see if it's a binary file.  */
     130  
     131            int prev_mode = set_binary_mode (current->desc, O_BINARY);
     132            off_t buffered;
     133            file_block_read (current, current->bufsize);
     134            buffered = current->buffered;
     135  
     136            if (prev_mode != O_BINARY)
     137              {
     138                /* Revert to text mode and seek back to the start to reread
     139                   the file.  Use relative seek, since file descriptors
     140                   like stdin might not start at offset zero.  */
     141                if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
     142                  pfatal_with_name (current->name);
     143                set_binary_mode (current->desc, prev_mode);
     144                current->buffered = 0;
     145                current->eof = false;
     146              }
     147  
     148            return binary_file_p (current->buffer, buffered);
     149          }
     150      }
     151  
     152    current->buffered = 0;
     153    current->eof = false;
     154    return false;
     155  }
     156  
     157  /* Slurp the rest of the current file completely into memory.  */
     158  
     159  static void
     160  slurp (struct file_data *current)
     161  {
     162    size_t cc;
     163  
     164    if (current->desc < 0)
     165      {
     166        /* The file is nonexistent.  */
     167        return;
     168      }
     169  
     170    if (S_ISREG (current->stat.st_mode))
     171      {
     172        /* It's a regular file; slurp in the rest all at once.  */
     173  
     174        /* Get the size out of the stat block.
     175           Allocate just enough room for appended newline plus word sentinel,
     176           plus word-alignment since we want the buffer word-aligned.  */
     177        off_t file_size = current->stat.st_size;
     178        if (INT_ADD_WRAPV (2 * sizeof (word) - file_size % sizeof (word),
     179  			 file_size, &cc))
     180          xalloc_die ();
     181  
     182        if (current->bufsize < cc)
     183          {
     184            current->bufsize = cc;
     185            current->buffer = xrealloc (current->buffer, cc);
     186          }
     187  
     188        /* Try to read at least 1 more byte than the size indicates, to
     189           detect whether the file is growing.  This is a nicety for
     190           users who run 'diff' on files while they are changing.  */
     191  
     192        if (current->buffered <= file_size)
     193          {
     194            file_block_read (current, file_size + 1 - current->buffered);
     195            if (current->buffered <= file_size)
     196              return;
     197          }
     198      }
     199  
     200    /* It's not a regular file, or it's a growing regular file; read it,
     201       growing the buffer as needed.  */
     202  
     203    file_block_read (current, current->bufsize - current->buffered);
     204  
     205    if (current->buffered)
     206      {
     207        while (current->buffered == current->bufsize)
     208          {
     209            if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
     210              xalloc_die ();
     211            current->bufsize *= 2;
     212            current->buffer = xrealloc (current->buffer, current->bufsize);
     213            file_block_read (current, current->bufsize - current->buffered);
     214          }
     215  
     216        /* Allocate just enough room for appended newline plus word
     217           sentinel, plus word-alignment.  */
     218        cc = current->buffered + 2 * sizeof (word);
     219        current->bufsize = cc - cc % sizeof (word);
     220        current->buffer = xrealloc (current->buffer, current->bufsize);
     221      }
     222  }
     223  
     224  /* Split the file into lines, simultaneously computing the equivalence
     225     class for each line.  */
     226  
     227  static void
     228  find_and_hash_each_line (struct file_data *current)
     229  {
     230    char const *p = current->prefix_end;
     231    lin i, *bucket;
     232    size_t length;
     233  
     234    /* Cache often-used quantities in local variables to help the compiler.  */
     235    char const **linbuf = current->linbuf;
     236    lin alloc_lines = current->alloc_lines;
     237    lin line = 0;
     238    lin linbuf_base = current->linbuf_base;
     239    lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
     240    struct equivclass *eqs = equivs;
     241    lin eqs_index = equivs_index;
     242    lin eqs_alloc = equivs_alloc;
     243    char const *suffix_begin = current->suffix_begin;
     244    char const *bufend = file_buffer (current) + current->buffered;
     245    bool ig_case = ignore_case;
     246    enum DIFF_white_space ig_white_space = ignore_white_space;
     247    bool diff_length_compare_anyway =
     248      ig_white_space != IGNORE_NO_WHITE_SPACE;
     249    bool same_length_diff_contents_compare_anyway =
     250      diff_length_compare_anyway | ig_case;
     251  
     252    while (p < suffix_begin)
     253      {
     254        char const *ip = p;
     255        hash_value h = 0;
     256        unsigned char c;
     257  
     258        /* Hash this line until we find a newline.  */
     259        switch (ig_white_space)
     260          {
     261          case IGNORE_ALL_SPACE:
     262            while ((c = *p++) != '\n')
     263              if (! isspace (c))
     264                h = HASH (h, ig_case ? tolower (c) : c);
     265            break;
     266  
     267          case IGNORE_SPACE_CHANGE:
     268            while ((c = *p++) != '\n')
     269              {
     270                if (isspace (c))
     271                  {
     272                    do
     273                      if ((c = *p++) == '\n')
     274                        goto hashing_done;
     275                    while (isspace (c));
     276  
     277                    h = HASH (h, ' ');
     278                  }
     279  
     280                /* C is now the first non-space.  */
     281                h = HASH (h, ig_case ? tolower (c) : c);
     282              }
     283            break;
     284  
     285          case IGNORE_TAB_EXPANSION:
     286          case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
     287          case IGNORE_TRAILING_SPACE:
     288            {
     289              size_t column = 0;
     290              while ((c = *p++) != '\n')
     291                {
     292                  if (ig_white_space & IGNORE_TRAILING_SPACE
     293                      && isspace (c))
     294                    {
     295                      char const *p1 = p;
     296                      unsigned char c1;
     297                      do
     298                        if ((c1 = *p1++) == '\n')
     299                          {
     300                            p = p1;
     301                            goto hashing_done;
     302                          }
     303                      while (isspace (c1));
     304                    }
     305  
     306                  size_t repetitions = 1;
     307  
     308                  if (ig_white_space & IGNORE_TAB_EXPANSION)
     309                    switch (c)
     310                      {
     311                      case '\b':
     312                        column -= 0 < column;
     313                        break;
     314  
     315                      case '\t':
     316                        c = ' ';
     317                        repetitions = tabsize - column % tabsize;
     318                        column = (column + repetitions < column
     319                                  ? 0
     320                                  : column + repetitions);
     321                        break;
     322  
     323                      case '\r':
     324                        column = 0;
     325                        break;
     326  
     327                      default:
     328                        column++;
     329                        break;
     330                      }
     331  
     332                  if (ig_case)
     333                    c = tolower (c);
     334  
     335                  do
     336                    h = HASH (h, c);
     337                  while (--repetitions != 0);
     338                }
     339            }
     340            break;
     341  
     342          default:
     343            if (ig_case)
     344              while ((c = *p++) != '\n')
     345                h = HASH (h, tolower (c));
     346            else
     347              while ((c = *p++) != '\n')
     348                h = HASH (h, c);
     349            break;
     350          }
     351  
     352     hashing_done:;
     353  
     354        bucket = &buckets[h % nbuckets];
     355        length = p - ip - 1;
     356  
     357        if (p == bufend
     358            && current->missing_newline
     359            && robust_output_style (output_style))
     360          {
     361            /* The last line is incomplete and we do not silently
     362               complete lines.  If the line cannot compare equal to any
     363               complete line, put it into buckets[-1] so that it can
     364               compare equal only to the other file's incomplete line
     365               (if one exists).  */
     366            if (ig_white_space < IGNORE_TRAILING_SPACE)
     367              bucket = &buckets[-1];
     368          }
     369  
     370        for (i = *bucket;  ;  i = eqs[i].next)
     371          if (!i)
     372            {
     373              /* Create a new equivalence class in this bucket.  */
     374              i = eqs_index++;
     375              if (i == eqs_alloc)
     376                {
     377                  if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
     378                    xalloc_die ();
     379                  eqs_alloc *= 2;
     380                  eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
     381                }
     382              eqs[i].next = *bucket;
     383              eqs[i].hash = h;
     384              eqs[i].line = ip;
     385              eqs[i].length = length;
     386              *bucket = i;
     387              break;
     388            }
     389          else if (eqs[i].hash == h)
     390            {
     391              char const *eqline = eqs[i].line;
     392  
     393              /* Reuse existing class if lines_differ reports the lines
     394                 equal.  */
     395              if (eqs[i].length == length)
     396                {
     397                  /* Reuse existing equivalence class if the lines are identical.
     398                     This detects the common case of exact identity
     399                     faster than lines_differ would.  */
     400                  if (memcmp (eqline, ip, length) == 0)
     401                    break;
     402                  if (!same_length_diff_contents_compare_anyway)
     403                    continue;
     404                }
     405              else if (!diff_length_compare_anyway)
     406                continue;
     407  
     408              if (! lines_differ (eqline, ip))
     409                break;
     410            }
     411  
     412        /* Maybe increase the size of the line table.  */
     413        if (line == alloc_lines)
     414          {
     415            /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
     416            if (PTRDIFF_MAX / 3 <= alloc_lines
     417                || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
     418                || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
     419              xalloc_die ();
     420            alloc_lines = 2 * alloc_lines - linbuf_base;
     421            cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
     422            linbuf += linbuf_base;
     423            linbuf = xrealloc (linbuf,
     424                               (alloc_lines - linbuf_base) * sizeof *linbuf);
     425            linbuf -= linbuf_base;
     426          }
     427        linbuf[line] = ip;
     428        cureqs[line] = i;
     429        ++line;
     430      }
     431  
     432    current->buffered_lines = line;
     433  
     434    for (i = 0;  ;  i++)
     435      {
     436        /* Record the line start for lines in the suffix that we care about.
     437           Record one more line start than lines,
     438           so that we can compute the length of any buffered line.  */
     439        if (line == alloc_lines)
     440          {
     441            /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
     442            if (PTRDIFF_MAX / 3 <= alloc_lines
     443                || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
     444                || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
     445              xalloc_die ();
     446            alloc_lines = 2 * alloc_lines - linbuf_base;
     447            linbuf += linbuf_base;
     448            linbuf = xrealloc (linbuf,
     449                               (alloc_lines - linbuf_base) * sizeof *linbuf);
     450            linbuf -= linbuf_base;
     451          }
     452        linbuf[line] = p;
     453  
     454        if (p == bufend)
     455          {
     456            /* If the last line is incomplete and we do not silently
     457               complete lines, don't count its appended newline.  */
     458            if (current->missing_newline && robust_output_style (output_style))
     459              linbuf[line]--;
     460            break;
     461          }
     462  
     463        if (context <= i && no_diff_means_no_output)
     464          break;
     465  
     466        line++;
     467  
     468        while (*p++ != '\n')
     469          continue;
     470      }
     471  
     472    /* Done with cache in local variables.  */
     473    current->linbuf = linbuf;
     474    current->valid_lines = line;
     475    current->alloc_lines = alloc_lines;
     476    current->equivs = cureqs;
     477    equivs = eqs;
     478    equivs_alloc = eqs_alloc;
     479    equivs_index = eqs_index;
     480  }
     481  
     482  /* Prepare the text.  Make sure the text end is initialized.
     483     Make sure text ends in a newline,
     484     but remember that we had to add one.
     485     Strip trailing CRs, if that was requested.  */
     486  
     487  static void
     488  prepare_text (struct file_data *current)
     489  {
     490    size_t buffered = current->buffered;
     491    char *p = file_buffer (current);
     492    if (!p)
     493      return;
     494  
     495    if (strip_trailing_cr)
     496      {
     497        char *srclim = p + buffered;
     498        *srclim = '\r';
     499        char *dst = rawmemchr (p, '\r');
     500  
     501        for (char const *src = dst; src != srclim; src++)
     502          {
     503            src += *src == '\r' && src[1] == '\n';
     504            *dst++ = *src;
     505          }
     506  
     507        buffered -= srclim - dst;
     508      }
     509  
     510    if (buffered != 0 && p[buffered - 1] != '\n')
     511      {
     512        p[buffered++] = '\n';
     513        current->missing_newline = true;
     514      }
     515  
     516    /* Don't use uninitialized storage when planting or using sentinels.  */
     517    memset (p + buffered, 0, sizeof (word));
     518  
     519    current->buffered = buffered;
     520  }
     521  
     522  /* We have found N lines in a buffer of size S; guess the
     523     proportionate number of lines that will be found in a buffer of
     524     size T.  However, do not guess a number of lines so large that the
     525     resulting line table might cause overflow in size calculations.  */
     526  static lin
     527  guess_lines (lin n, size_t s, size_t t)
     528  {
     529    size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
     530    lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
     531    return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
     532  }
     533  
     534  /* Given a vector of two file_data objects, find the identical
     535     prefixes and suffixes of each object.  */
     536  
     537  static void
     538  find_identical_ends (struct file_data filevec[])
     539  {
     540    word *w0, *w1;
     541    char *p0, *p1, *buffer0, *buffer1;
     542    char const *end0, *beg0;
     543    char const **linbuf0, **linbuf1;
     544    lin i, lines;
     545    size_t n0, n1;
     546    lin alloc_lines0, alloc_lines1;
     547    bool prefix_needed;
     548    lin buffered_prefix, prefix_count, prefix_mask;
     549    lin middle_guess, suffix_guess;
     550  
     551    slurp (&filevec[0]);
     552    prepare_text (&filevec[0]);
     553    if (filevec[0].desc != filevec[1].desc)
     554      {
     555        slurp (&filevec[1]);
     556        prepare_text (&filevec[1]);
     557      }
     558    else
     559      {
     560        filevec[1].buffer = filevec[0].buffer;
     561        filevec[1].bufsize = filevec[0].bufsize;
     562        filevec[1].buffered = filevec[0].buffered;
     563        filevec[1].missing_newline = filevec[0].missing_newline;
     564      }
     565  
     566    /* Find identical prefix.  */
     567  
     568    w0 = filevec[0].buffer;
     569    w1 = filevec[1].buffer;
     570    p0 = buffer0 = (char *) w0;
     571    p1 = buffer1 = (char *) w1;
     572    n0 = filevec[0].buffered;
     573    n1 = filevec[1].buffered;
     574  
     575    if (p0 == p1)
     576      /* The buffers are the same; sentinels won't work.  */
     577      p0 = p1 += n1;
     578    else
     579      {
     580        /* Insert end sentinels, in this case characters that are guaranteed
     581           to make the equality test false, and thus terminate the loop.  */
     582  
     583        if (n0 < n1)
     584          p0[n0] = ~p1[n0];
     585        else
     586          p1[n1] = ~p0[n1];
     587  
     588        /* Loop until first mismatch, or to the sentinel characters.  */
     589  
     590        /* Compare a word at a time for speed.  */
     591        while (*w0 == *w1)
     592          w0++, w1++;
     593  
     594        /* Do the last few bytes of comparison a byte at a time.  */
     595        p0 = (char *) w0;
     596        p1 = (char *) w1;
     597        while (*p0 == *p1)
     598          p0++, p1++;
     599  
     600        /* Don't mistakenly count missing newline as part of prefix.  */
     601        if (robust_output_style (output_style)
     602            && ((buffer0 + n0 - filevec[0].missing_newline < p0)
     603                !=
     604                (buffer1 + n1 - filevec[1].missing_newline < p1)))
     605          p0--, p1--;
     606      }
     607  
     608    /* Now P0 and P1 point at the first nonmatching characters.  */
     609  
     610    /* Skip back to last line-beginning in the prefix,
     611       and then discard up to HORIZON_LINES lines from the prefix.  */
     612    i = horizon_lines;
     613    while (p0 != buffer0 && (p0[-1] != '\n' || i--))
     614      p0--, p1--;
     615  
     616    /* Record the prefix.  */
     617    filevec[0].prefix_end = p0;
     618    filevec[1].prefix_end = p1;
     619  
     620    /* Find identical suffix.  */
     621  
     622    /* P0 and P1 point beyond the last chars not yet compared.  */
     623    p0 = buffer0 + n0;
     624    p1 = buffer1 + n1;
     625  
     626    if (! robust_output_style (output_style)
     627        || filevec[0].missing_newline == filevec[1].missing_newline)
     628      {
     629        end0 = p0;	/* Addr of last char in file 0.  */
     630  
     631        /* Get value of P0 at which we should stop scanning backward:
     632           this is when either P0 or P1 points just past the last char
     633           of the identical prefix.  */
     634        beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
     635  
     636        /* Scan back until chars don't match or we reach that point.  */
     637        while (p0 != beg0)
     638          if (*--p0 != *--p1)
     639            {
     640              /* Point at the first char of the matching suffix.  */
     641              ++p0, ++p1;
     642              beg0 = p0;
     643              break;
     644            }
     645  
     646        /* Are we at a line-beginning in both files?  If not, add the rest of
     647           this line to the main body.  Discard up to HORIZON_LINES lines from
     648           the identical suffix.  Also, discard one extra line,
     649           because shift_boundaries may need it.  */
     650        i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
     651                              &&
     652                              (buffer1 == p1 || p1[-1] == '\n'));
     653        while (i-- && p0 != end0)
     654          while (*p0++ != '\n')
     655            continue;
     656  
     657        p1 += p0 - beg0;
     658      }
     659  
     660    /* Record the suffix.  */
     661    filevec[0].suffix_begin = p0;
     662    filevec[1].suffix_begin = p1;
     663  
     664    /* Calculate number of lines of prefix to save.
     665  
     666       prefix_count == 0 means save the whole prefix;
     667       we need this for options like -D that output the whole file,
     668       or for enormous contexts (to avoid worrying about arithmetic overflow).
     669       We also need it for options like -F that output some preceding line;
     670       at least we will need to find the last few lines,
     671       but since we don't know how many, it's easiest to find them all.
     672  
     673       Otherwise, prefix_count != 0.  Save just prefix_count lines at start
     674       of the line buffer; they'll be moved to the proper location later.
     675       Handle 1 more line than the context says (because we count 1 too many),
     676       rounded up to the next power of 2 to speed index computation.  */
     677  
     678    if (no_diff_means_no_output && ! function_regexp.fastmap
     679        && context < LIN_MAX / 4 && context < n0)
     680      {
     681        middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
     682        suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
     683        for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
     684          continue;
     685        alloc_lines0 = (prefix_count + middle_guess
     686                        + MIN (context, suffix_guess));
     687      }
     688    else
     689      {
     690        prefix_count = 0;
     691        alloc_lines0 = guess_lines (0, 0, n0);
     692      }
     693  
     694    prefix_mask = prefix_count - 1;
     695    lines = 0;
     696    linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
     697    prefix_needed = ! (no_diff_means_no_output
     698                       && filevec[0].prefix_end == p0
     699                       && filevec[1].prefix_end == p1);
     700    p0 = buffer0;
     701  
     702    /* If the prefix is needed, find the prefix lines.  */
     703    if (prefix_needed)
     704      {
     705        end0 = filevec[0].prefix_end;
     706        while (p0 != end0)
     707          {
     708            lin l = lines++ & prefix_mask;
     709            if (l == alloc_lines0)
     710              {
     711                if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
     712                  xalloc_die ();
     713                alloc_lines0 *= 2;
     714                linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
     715              }
     716            linbuf0[l] = p0;
     717            while (*p0++ != '\n')
     718              continue;
     719          }
     720      }
     721    buffered_prefix = prefix_count && context < lines ? context : lines;
     722  
     723    /* Allocate line buffer 1.  */
     724  
     725    middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
     726    suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
     727    if (INT_ADD_WRAPV (buffered_prefix,
     728  		     middle_guess + MIN (context, suffix_guess),
     729  		     &alloc_lines1))
     730      xalloc_die ();
     731    linbuf1 = xnmalloc (alloc_lines1, sizeof *linbuf1);
     732  
     733    if (buffered_prefix != lines)
     734      {
     735        /* Rotate prefix lines to proper location.  */
     736        for (i = 0;  i < buffered_prefix;  i++)
     737          linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
     738        for (i = 0;  i < buffered_prefix;  i++)
     739          linbuf0[i] = linbuf1[i];
     740      }
     741  
     742    /* Initialize line buffer 1 from line buffer 0.  */
     743    for (i = 0; i < buffered_prefix; i++)
     744      linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
     745  
     746    /* Record the line buffer, adjusted so that
     747       linbuf[0] points at the first differing line.  */
     748    filevec[0].linbuf = linbuf0 + buffered_prefix;
     749    filevec[1].linbuf = linbuf1 + buffered_prefix;
     750    filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
     751    filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
     752    filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
     753    filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
     754  }
     755  
     756  /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
     757     than 2**k.  This table is derived from Chris K. Caldwell's list
     758     <http://www.utm.edu/research/primes/lists/2small/>.  */
     759  
     760  static unsigned char const prime_offset[] =
     761  {
     762    0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
     763    15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
     764    11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
     765    55, 93, 1, 57, 25
     766  };
     767  
     768  /* Verify that this host's size_t is not too wide for the above table.  */
     769  
     770  verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
     771  
     772  /* Given a vector of two file_data objects, read the file associated
     773     with each one, and build the table of equivalence classes.
     774     Return nonzero if either file appears to be a binary file.
     775     If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
     776  
     777  bool
     778  read_files (struct file_data filevec[], bool pretend_binary)
     779  {
     780    int i;
     781    bool skip_test = text | pretend_binary;
     782    bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
     783  
     784    if (filevec[0].desc != filevec[1].desc)
     785      appears_binary |= sip (&filevec[1], skip_test | appears_binary);
     786    else
     787      {
     788        filevec[1].buffer = filevec[0].buffer;
     789        filevec[1].bufsize = filevec[0].bufsize;
     790        filevec[1].buffered = filevec[0].buffered;
     791      }
     792    if (appears_binary)
     793      {
     794        set_binary_mode (filevec[0].desc, O_BINARY);
     795        set_binary_mode (filevec[1].desc, O_BINARY);
     796        return true;
     797      }
     798  
     799    find_identical_ends (filevec);
     800  
     801    equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
     802    equivs = xnmalloc (equivs_alloc, sizeof *equivs);
     803    /* Equivalence class 0 is permanently safe for lines that were not
     804       hashed.  Real equivalence classes start at 1.  */
     805    equivs_index = 1;
     806  
     807    /* Allocate (one plus) a prime number of hash buckets.  Use a prime
     808       number between 1/3 and 2/3 of the value of equiv_allocs,
     809       approximately.  */
     810    for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
     811      continue;
     812    nbuckets = ((size_t) 1 << i) - prime_offset[i];
     813    buckets = xcalloc (nbuckets + 1, sizeof *buckets);
     814    buckets++;
     815  
     816    for (i = 0; i < 2; i++)
     817      find_and_hash_each_line (&filevec[i]);
     818  
     819    filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
     820  
     821    free (equivs);
     822    free (buckets - 1);
     823  
     824    return false;
     825  }