1 /* File I/O for GNU DIFF.
2
3 Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
4 2015-2023 Free Software Foundation, Inc.
5
6 This file is part of GNU DIFF.
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20
21 #include "diff.h"
22 #include <binary-io.h>
23 #include <cmpbuf.h>
24 #include <file-type.h>
25 #include <xalloc.h>
26
27 /* Rotate an unsigned value to the left. */
28 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
29
30 /* Given a hash value and a new character, return a new hash value. */
31 #define HASH(h, c) ((c) + ROL (h, 7))
32
33 /* The type of a hash value. */
34 typedef size_t hash_value;
35 verify (! TYPE_SIGNED (hash_value));
36
37 /* Lines are put into equivalence classes of lines that match in lines_differ.
38 Each equivalence class is represented by one of these structures,
39 but only while the classes are being computed.
40 Afterward, each class is represented by a number. */
41 struct equivclass
42 {
43 lin next; /* Next item in this bucket. */
44 hash_value hash; /* Hash of lines in this class. */
45 char const *line; /* A line that fits this class. */
46 size_t length; /* That line's length, not counting its newline. */
47 };
48
49 /* Hash-table: array of buckets, each being a chain of equivalence classes.
50 buckets[-1] is reserved for incomplete lines. */
51 static lin *buckets;
52
53 /* Number of buckets in the hash table array, not counting buckets[-1]. */
54 static size_t nbuckets;
55
56 /* Array in which the equivalence classes are allocated.
57 The bucket-chains go through the elements in this array.
58 The number of an equivalence class is its index in this array. */
59 static struct equivclass *equivs;
60
61 /* Index of first free element in the array 'equivs'. */
62 static lin equivs_index;
63
64 /* Number of elements allocated in the array 'equivs'. */
65 static lin equivs_alloc;
66
67 /* The file buffer, considered as an array of bytes rather than
68 as an array of words. */
69
70 static char *
71 file_buffer (struct file_data const *f)
72 {
73 return (char *) f->buffer;
74 }
75
76 /* Read a block of data into a file buffer, checking for EOF and error. */
77
78 void
79 file_block_read (struct file_data *current, size_t size)
80 {
81 if (size && ! current->eof)
82 {
83 size_t s = block_read (current->desc,
84 file_buffer (current) + current->buffered, size);
85 if (s == SIZE_MAX)
86 pfatal_with_name (current->name);
87 current->buffered += s;
88 current->eof = s < size;
89 }
90 }
91
92 /* Check for binary files and compare them for exact identity. */
93
94 /* Return 1 if BUF contains a non text character.
95 SIZE is the number of characters in BUF. */
96
97 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
98
99 /* Get ready to read the current file.
100 Return nonzero if SKIP_TEST is zero,
101 and if it appears to be a binary file. */
102
103 static bool
104 sip (struct file_data *current, bool skip_test)
105 {
106 /* If we have a nonexistent file at this stage, treat it as empty. */
107 if (current->desc < 0)
108 {
109 /* Leave room for a sentinel. */
110 current->bufsize = sizeof (word);
111 current->buffer = xmalloc (current->bufsize);
112 }
113 else
114 {
115 current->bufsize = buffer_lcm (sizeof (word),
116 STAT_BLOCKSIZE (current->stat),
117 PTRDIFF_MAX - 2 * sizeof (word));
118 current->buffer = xmalloc (current->bufsize);
119
120 #ifdef __KLIBC__
121 /* Skip test if seek is not possible */
122 skip_test = skip_test
123 || (lseek (current->desc, 0, SEEK_CUR) < 0
124 && errno == ESPIPE);
125 #endif
126
127 if (! skip_test)
128 {
129 /* Check first part of file to see if it's a binary file. */
130
131 int prev_mode = set_binary_mode (current->desc, O_BINARY);
132 off_t buffered;
133 file_block_read (current, current->bufsize);
134 buffered = current->buffered;
135
136 if (prev_mode != O_BINARY)
137 {
138 /* Revert to text mode and seek back to the start to reread
139 the file. Use relative seek, since file descriptors
140 like stdin might not start at offset zero. */
141 if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
142 pfatal_with_name (current->name);
143 set_binary_mode (current->desc, prev_mode);
144 current->buffered = 0;
145 current->eof = false;
146 }
147
148 return binary_file_p (current->buffer, buffered);
149 }
150 }
151
152 current->buffered = 0;
153 current->eof = false;
154 return false;
155 }
156
157 /* Slurp the rest of the current file completely into memory. */
158
159 static void
160 slurp (struct file_data *current)
161 {
162 size_t cc;
163
164 if (current->desc < 0)
165 {
166 /* The file is nonexistent. */
167 return;
168 }
169
170 if (S_ISREG (current->stat.st_mode))
171 {
172 /* It's a regular file; slurp in the rest all at once. */
173
174 /* Get the size out of the stat block.
175 Allocate just enough room for appended newline plus word sentinel,
176 plus word-alignment since we want the buffer word-aligned. */
177 off_t file_size = current->stat.st_size;
178 if (INT_ADD_WRAPV (2 * sizeof (word) - file_size % sizeof (word),
179 file_size, &cc))
180 xalloc_die ();
181
182 if (current->bufsize < cc)
183 {
184 current->bufsize = cc;
185 current->buffer = xrealloc (current->buffer, cc);
186 }
187
188 /* Try to read at least 1 more byte than the size indicates, to
189 detect whether the file is growing. This is a nicety for
190 users who run 'diff' on files while they are changing. */
191
192 if (current->buffered <= file_size)
193 {
194 file_block_read (current, file_size + 1 - current->buffered);
195 if (current->buffered <= file_size)
196 return;
197 }
198 }
199
200 /* It's not a regular file, or it's a growing regular file; read it,
201 growing the buffer as needed. */
202
203 file_block_read (current, current->bufsize - current->buffered);
204
205 if (current->buffered)
206 {
207 while (current->buffered == current->bufsize)
208 {
209 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
210 xalloc_die ();
211 current->bufsize *= 2;
212 current->buffer = xrealloc (current->buffer, current->bufsize);
213 file_block_read (current, current->bufsize - current->buffered);
214 }
215
216 /* Allocate just enough room for appended newline plus word
217 sentinel, plus word-alignment. */
218 cc = current->buffered + 2 * sizeof (word);
219 current->bufsize = cc - cc % sizeof (word);
220 current->buffer = xrealloc (current->buffer, current->bufsize);
221 }
222 }
223
224 /* Split the file into lines, simultaneously computing the equivalence
225 class for each line. */
226
227 static void
228 find_and_hash_each_line (struct file_data *current)
229 {
230 char const *p = current->prefix_end;
231 lin i, *bucket;
232 size_t length;
233
234 /* Cache often-used quantities in local variables to help the compiler. */
235 char const **linbuf = current->linbuf;
236 lin alloc_lines = current->alloc_lines;
237 lin line = 0;
238 lin linbuf_base = current->linbuf_base;
239 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
240 struct equivclass *eqs = equivs;
241 lin eqs_index = equivs_index;
242 lin eqs_alloc = equivs_alloc;
243 char const *suffix_begin = current->suffix_begin;
244 char const *bufend = file_buffer (current) + current->buffered;
245 bool ig_case = ignore_case;
246 enum DIFF_white_space ig_white_space = ignore_white_space;
247 bool diff_length_compare_anyway =
248 ig_white_space != IGNORE_NO_WHITE_SPACE;
249 bool same_length_diff_contents_compare_anyway =
250 diff_length_compare_anyway | ig_case;
251
252 while (p < suffix_begin)
253 {
254 char const *ip = p;
255 hash_value h = 0;
256 unsigned char c;
257
258 /* Hash this line until we find a newline. */
259 switch (ig_white_space)
260 {
261 case IGNORE_ALL_SPACE:
262 while ((c = *p++) != '\n')
263 if (! isspace (c))
264 h = HASH (h, ig_case ? tolower (c) : c);
265 break;
266
267 case IGNORE_SPACE_CHANGE:
268 while ((c = *p++) != '\n')
269 {
270 if (isspace (c))
271 {
272 do
273 if ((c = *p++) == '\n')
274 goto hashing_done;
275 while (isspace (c));
276
277 h = HASH (h, ' ');
278 }
279
280 /* C is now the first non-space. */
281 h = HASH (h, ig_case ? tolower (c) : c);
282 }
283 break;
284
285 case IGNORE_TAB_EXPANSION:
286 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
287 case IGNORE_TRAILING_SPACE:
288 {
289 size_t column = 0;
290 while ((c = *p++) != '\n')
291 {
292 if (ig_white_space & IGNORE_TRAILING_SPACE
293 && isspace (c))
294 {
295 char const *p1 = p;
296 unsigned char c1;
297 do
298 if ((c1 = *p1++) == '\n')
299 {
300 p = p1;
301 goto hashing_done;
302 }
303 while (isspace (c1));
304 }
305
306 size_t repetitions = 1;
307
308 if (ig_white_space & IGNORE_TAB_EXPANSION)
309 switch (c)
310 {
311 case '\b':
312 column -= 0 < column;
313 break;
314
315 case '\t':
316 c = ' ';
317 repetitions = tabsize - column % tabsize;
318 column = (column + repetitions < column
319 ? 0
320 : column + repetitions);
321 break;
322
323 case '\r':
324 column = 0;
325 break;
326
327 default:
328 column++;
329 break;
330 }
331
332 if (ig_case)
333 c = tolower (c);
334
335 do
336 h = HASH (h, c);
337 while (--repetitions != 0);
338 }
339 }
340 break;
341
342 default:
343 if (ig_case)
344 while ((c = *p++) != '\n')
345 h = HASH (h, tolower (c));
346 else
347 while ((c = *p++) != '\n')
348 h = HASH (h, c);
349 break;
350 }
351
352 hashing_done:;
353
354 bucket = &buckets[h % nbuckets];
355 length = p - ip - 1;
356
357 if (p == bufend
358 && current->missing_newline
359 && robust_output_style (output_style))
360 {
361 /* The last line is incomplete and we do not silently
362 complete lines. If the line cannot compare equal to any
363 complete line, put it into buckets[-1] so that it can
364 compare equal only to the other file's incomplete line
365 (if one exists). */
366 if (ig_white_space < IGNORE_TRAILING_SPACE)
367 bucket = &buckets[-1];
368 }
369
370 for (i = *bucket; ; i = eqs[i].next)
371 if (!i)
372 {
373 /* Create a new equivalence class in this bucket. */
374 i = eqs_index++;
375 if (i == eqs_alloc)
376 {
377 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
378 xalloc_die ();
379 eqs_alloc *= 2;
380 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
381 }
382 eqs[i].next = *bucket;
383 eqs[i].hash = h;
384 eqs[i].line = ip;
385 eqs[i].length = length;
386 *bucket = i;
387 break;
388 }
389 else if (eqs[i].hash == h)
390 {
391 char const *eqline = eqs[i].line;
392
393 /* Reuse existing class if lines_differ reports the lines
394 equal. */
395 if (eqs[i].length == length)
396 {
397 /* Reuse existing equivalence class if the lines are identical.
398 This detects the common case of exact identity
399 faster than lines_differ would. */
400 if (memcmp (eqline, ip, length) == 0)
401 break;
402 if (!same_length_diff_contents_compare_anyway)
403 continue;
404 }
405 else if (!diff_length_compare_anyway)
406 continue;
407
408 if (! lines_differ (eqline, ip))
409 break;
410 }
411
412 /* Maybe increase the size of the line table. */
413 if (line == alloc_lines)
414 {
415 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
416 if (PTRDIFF_MAX / 3 <= alloc_lines
417 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
418 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
419 xalloc_die ();
420 alloc_lines = 2 * alloc_lines - linbuf_base;
421 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
422 linbuf += linbuf_base;
423 linbuf = xrealloc (linbuf,
424 (alloc_lines - linbuf_base) * sizeof *linbuf);
425 linbuf -= linbuf_base;
426 }
427 linbuf[line] = ip;
428 cureqs[line] = i;
429 ++line;
430 }
431
432 current->buffered_lines = line;
433
434 for (i = 0; ; i++)
435 {
436 /* Record the line start for lines in the suffix that we care about.
437 Record one more line start than lines,
438 so that we can compute the length of any buffered line. */
439 if (line == alloc_lines)
440 {
441 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
442 if (PTRDIFF_MAX / 3 <= alloc_lines
443 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
444 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
445 xalloc_die ();
446 alloc_lines = 2 * alloc_lines - linbuf_base;
447 linbuf += linbuf_base;
448 linbuf = xrealloc (linbuf,
449 (alloc_lines - linbuf_base) * sizeof *linbuf);
450 linbuf -= linbuf_base;
451 }
452 linbuf[line] = p;
453
454 if (p == bufend)
455 {
456 /* If the last line is incomplete and we do not silently
457 complete lines, don't count its appended newline. */
458 if (current->missing_newline && robust_output_style (output_style))
459 linbuf[line]--;
460 break;
461 }
462
463 if (context <= i && no_diff_means_no_output)
464 break;
465
466 line++;
467
468 while (*p++ != '\n')
469 continue;
470 }
471
472 /* Done with cache in local variables. */
473 current->linbuf = linbuf;
474 current->valid_lines = line;
475 current->alloc_lines = alloc_lines;
476 current->equivs = cureqs;
477 equivs = eqs;
478 equivs_alloc = eqs_alloc;
479 equivs_index = eqs_index;
480 }
481
482 /* Prepare the text. Make sure the text end is initialized.
483 Make sure text ends in a newline,
484 but remember that we had to add one.
485 Strip trailing CRs, if that was requested. */
486
487 static void
488 prepare_text (struct file_data *current)
489 {
490 size_t buffered = current->buffered;
491 char *p = file_buffer (current);
492 if (!p)
493 return;
494
495 if (strip_trailing_cr)
496 {
497 char *srclim = p + buffered;
498 *srclim = '\r';
499 char *dst = rawmemchr (p, '\r');
500
501 for (char const *src = dst; src != srclim; src++)
502 {
503 src += *src == '\r' && src[1] == '\n';
504 *dst++ = *src;
505 }
506
507 buffered -= srclim - dst;
508 }
509
510 if (buffered != 0 && p[buffered - 1] != '\n')
511 {
512 p[buffered++] = '\n';
513 current->missing_newline = true;
514 }
515
516 /* Don't use uninitialized storage when planting or using sentinels. */
517 memset (p + buffered, 0, sizeof (word));
518
519 current->buffered = buffered;
520 }
521
522 /* We have found N lines in a buffer of size S; guess the
523 proportionate number of lines that will be found in a buffer of
524 size T. However, do not guess a number of lines so large that the
525 resulting line table might cause overflow in size calculations. */
526 static lin
527 guess_lines (lin n, size_t s, size_t t)
528 {
529 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
530 lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
531 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
532 }
533
534 /* Given a vector of two file_data objects, find the identical
535 prefixes and suffixes of each object. */
536
537 static void
538 find_identical_ends (struct file_data filevec[])
539 {
540 word *w0, *w1;
541 char *p0, *p1, *buffer0, *buffer1;
542 char const *end0, *beg0;
543 char const **linbuf0, **linbuf1;
544 lin i, lines;
545 size_t n0, n1;
546 lin alloc_lines0, alloc_lines1;
547 bool prefix_needed;
548 lin buffered_prefix, prefix_count, prefix_mask;
549 lin middle_guess, suffix_guess;
550
551 slurp (&filevec[0]);
552 prepare_text (&filevec[0]);
553 if (filevec[0].desc != filevec[1].desc)
554 {
555 slurp (&filevec[1]);
556 prepare_text (&filevec[1]);
557 }
558 else
559 {
560 filevec[1].buffer = filevec[0].buffer;
561 filevec[1].bufsize = filevec[0].bufsize;
562 filevec[1].buffered = filevec[0].buffered;
563 filevec[1].missing_newline = filevec[0].missing_newline;
564 }
565
566 /* Find identical prefix. */
567
568 w0 = filevec[0].buffer;
569 w1 = filevec[1].buffer;
570 p0 = buffer0 = (char *) w0;
571 p1 = buffer1 = (char *) w1;
572 n0 = filevec[0].buffered;
573 n1 = filevec[1].buffered;
574
575 if (p0 == p1)
576 /* The buffers are the same; sentinels won't work. */
577 p0 = p1 += n1;
578 else
579 {
580 /* Insert end sentinels, in this case characters that are guaranteed
581 to make the equality test false, and thus terminate the loop. */
582
583 if (n0 < n1)
584 p0[n0] = ~p1[n0];
585 else
586 p1[n1] = ~p0[n1];
587
588 /* Loop until first mismatch, or to the sentinel characters. */
589
590 /* Compare a word at a time for speed. */
591 while (*w0 == *w1)
592 w0++, w1++;
593
594 /* Do the last few bytes of comparison a byte at a time. */
595 p0 = (char *) w0;
596 p1 = (char *) w1;
597 while (*p0 == *p1)
598 p0++, p1++;
599
600 /* Don't mistakenly count missing newline as part of prefix. */
601 if (robust_output_style (output_style)
602 && ((buffer0 + n0 - filevec[0].missing_newline < p0)
603 !=
604 (buffer1 + n1 - filevec[1].missing_newline < p1)))
605 p0--, p1--;
606 }
607
608 /* Now P0 and P1 point at the first nonmatching characters. */
609
610 /* Skip back to last line-beginning in the prefix,
611 and then discard up to HORIZON_LINES lines from the prefix. */
612 i = horizon_lines;
613 while (p0 != buffer0 && (p0[-1] != '\n' || i--))
614 p0--, p1--;
615
616 /* Record the prefix. */
617 filevec[0].prefix_end = p0;
618 filevec[1].prefix_end = p1;
619
620 /* Find identical suffix. */
621
622 /* P0 and P1 point beyond the last chars not yet compared. */
623 p0 = buffer0 + n0;
624 p1 = buffer1 + n1;
625
626 if (! robust_output_style (output_style)
627 || filevec[0].missing_newline == filevec[1].missing_newline)
628 {
629 end0 = p0; /* Addr of last char in file 0. */
630
631 /* Get value of P0 at which we should stop scanning backward:
632 this is when either P0 or P1 points just past the last char
633 of the identical prefix. */
634 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
635
636 /* Scan back until chars don't match or we reach that point. */
637 while (p0 != beg0)
638 if (*--p0 != *--p1)
639 {
640 /* Point at the first char of the matching suffix. */
641 ++p0, ++p1;
642 beg0 = p0;
643 break;
644 }
645
646 /* Are we at a line-beginning in both files? If not, add the rest of
647 this line to the main body. Discard up to HORIZON_LINES lines from
648 the identical suffix. Also, discard one extra line,
649 because shift_boundaries may need it. */
650 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
651 &&
652 (buffer1 == p1 || p1[-1] == '\n'));
653 while (i-- && p0 != end0)
654 while (*p0++ != '\n')
655 continue;
656
657 p1 += p0 - beg0;
658 }
659
660 /* Record the suffix. */
661 filevec[0].suffix_begin = p0;
662 filevec[1].suffix_begin = p1;
663
664 /* Calculate number of lines of prefix to save.
665
666 prefix_count == 0 means save the whole prefix;
667 we need this for options like -D that output the whole file,
668 or for enormous contexts (to avoid worrying about arithmetic overflow).
669 We also need it for options like -F that output some preceding line;
670 at least we will need to find the last few lines,
671 but since we don't know how many, it's easiest to find them all.
672
673 Otherwise, prefix_count != 0. Save just prefix_count lines at start
674 of the line buffer; they'll be moved to the proper location later.
675 Handle 1 more line than the context says (because we count 1 too many),
676 rounded up to the next power of 2 to speed index computation. */
677
678 if (no_diff_means_no_output && ! function_regexp.fastmap
679 && context < LIN_MAX / 4 && context < n0)
680 {
681 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
682 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
683 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2)
684 continue;
685 alloc_lines0 = (prefix_count + middle_guess
686 + MIN (context, suffix_guess));
687 }
688 else
689 {
690 prefix_count = 0;
691 alloc_lines0 = guess_lines (0, 0, n0);
692 }
693
694 prefix_mask = prefix_count - 1;
695 lines = 0;
696 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
697 prefix_needed = ! (no_diff_means_no_output
698 && filevec[0].prefix_end == p0
699 && filevec[1].prefix_end == p1);
700 p0 = buffer0;
701
702 /* If the prefix is needed, find the prefix lines. */
703 if (prefix_needed)
704 {
705 end0 = filevec[0].prefix_end;
706 while (p0 != end0)
707 {
708 lin l = lines++ & prefix_mask;
709 if (l == alloc_lines0)
710 {
711 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
712 xalloc_die ();
713 alloc_lines0 *= 2;
714 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
715 }
716 linbuf0[l] = p0;
717 while (*p0++ != '\n')
718 continue;
719 }
720 }
721 buffered_prefix = prefix_count && context < lines ? context : lines;
722
723 /* Allocate line buffer 1. */
724
725 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
726 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
727 if (INT_ADD_WRAPV (buffered_prefix,
728 middle_guess + MIN (context, suffix_guess),
729 &alloc_lines1))
730 xalloc_die ();
731 linbuf1 = xnmalloc (alloc_lines1, sizeof *linbuf1);
732
733 if (buffered_prefix != lines)
734 {
735 /* Rotate prefix lines to proper location. */
736 for (i = 0; i < buffered_prefix; i++)
737 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
738 for (i = 0; i < buffered_prefix; i++)
739 linbuf0[i] = linbuf1[i];
740 }
741
742 /* Initialize line buffer 1 from line buffer 0. */
743 for (i = 0; i < buffered_prefix; i++)
744 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
745
746 /* Record the line buffer, adjusted so that
747 linbuf[0] points at the first differing line. */
748 filevec[0].linbuf = linbuf0 + buffered_prefix;
749 filevec[1].linbuf = linbuf1 + buffered_prefix;
750 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
751 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
752 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
753 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
754 }
755
756 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
757 than 2**k. This table is derived from Chris K. Caldwell's list
758 <http://www.utm.edu/research/primes/lists/2small/>. */
759
760 static unsigned char const prime_offset[] =
761 {
762 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
763 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
764 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
765 55, 93, 1, 57, 25
766 };
767
768 /* Verify that this host's size_t is not too wide for the above table. */
769
770 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
771
772 /* Given a vector of two file_data objects, read the file associated
773 with each one, and build the table of equivalence classes.
774 Return nonzero if either file appears to be a binary file.
775 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
776
777 bool
778 read_files (struct file_data filevec[], bool pretend_binary)
779 {
780 int i;
781 bool skip_test = text | pretend_binary;
782 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
783
784 if (filevec[0].desc != filevec[1].desc)
785 appears_binary |= sip (&filevec[1], skip_test | appears_binary);
786 else
787 {
788 filevec[1].buffer = filevec[0].buffer;
789 filevec[1].bufsize = filevec[0].bufsize;
790 filevec[1].buffered = filevec[0].buffered;
791 }
792 if (appears_binary)
793 {
794 set_binary_mode (filevec[0].desc, O_BINARY);
795 set_binary_mode (filevec[1].desc, O_BINARY);
796 return true;
797 }
798
799 find_identical_ends (filevec);
800
801 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
802 equivs = xnmalloc (equivs_alloc, sizeof *equivs);
803 /* Equivalence class 0 is permanently safe for lines that were not
804 hashed. Real equivalence classes start at 1. */
805 equivs_index = 1;
806
807 /* Allocate (one plus) a prime number of hash buckets. Use a prime
808 number between 1/3 and 2/3 of the value of equiv_allocs,
809 approximately. */
810 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
811 continue;
812 nbuckets = ((size_t) 1 << i) - prime_offset[i];
813 buckets = xcalloc (nbuckets + 1, sizeof *buckets);
814 buckets++;
815
816 for (i = 0; i < 2; i++)
817 find_and_hash_each_line (&filevec[i]);
818
819 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
820
821 free (equivs);
822 free (buckets - 1);
823
824 return false;
825 }