1 /* Analyze file differences for GNU DIFF.
2
3 Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006-2007,
4 2009-2013, 2015-2023 Free Software Foundation, Inc.
5
6 This file is part of GNU DIFF.
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20
21 #include "diff.h"
22 #include <cmpbuf.h>
23 #include <error.h>
24 #include <file-type.h>
25 #include <xalloc.h>
26
27 /* The core of the Diff algorithm. */
28 #define ELEMENT lin
29 #define EQUAL(x,y) ((x) == (y))
30 #define OFFSET lin
31 #define OFFSET_MAX LIN_MAX
32 #define EXTRA_CONTEXT_FIELDS /* none */
33 #define NOTE_DELETE(c, xoff) (files[0].changed[files[0].realindexes[xoff]] = 1)
34 #define NOTE_INSERT(c, yoff) (files[1].changed[files[1].realindexes[yoff]] = 1)
35 #define USE_HEURISTIC 1
36 #include <diffseq.h>
37
38 /* Discard lines from one file that have no matches in the other file.
39
40 A line which is discarded will not be considered by the actual
41 comparison algorithm; it will be as if that line were not in the file.
42 The file's 'realindexes' table maps virtual line numbers
43 (which don't count the discarded lines) into real line numbers;
44 this is how the actual comparison algorithm produces results
45 that are comprehensible when the discarded lines are counted.
46
47 When we discard a line, we also mark it as a deletion or insertion
48 so that it will be printed in the output. */
49
50 static void
51 discard_confusing_lines (struct file_data filevec[])
52 {
53 int f;
54 lin i;
55 char *discarded[2];
56 lin *equiv_count[2];
57 lin *p;
58
59 /* Allocate our results. */
60 p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
61 * (2 * sizeof *p));
62 for (f = 0; f < 2; f++)
63 {
64 filevec[f].undiscarded = p; p += filevec[f].buffered_lines;
65 filevec[f].realindexes = p; p += filevec[f].buffered_lines;
66 }
67
68 /* Set up equiv_count[F][I] as the number of lines in file F
69 that fall in equivalence class I. */
70
71 p = xcalloc (filevec[0].equiv_max, 2 * sizeof *p);
72 equiv_count[0] = p;
73 equiv_count[1] = p + filevec[0].equiv_max;
74
75 for (i = 0; i < filevec[0].buffered_lines; ++i)
76 ++equiv_count[0][filevec[0].equivs[i]];
77 for (i = 0; i < filevec[1].buffered_lines; ++i)
78 ++equiv_count[1][filevec[1].equivs[i]];
79
80 /* Set up tables of which lines are going to be discarded. */
81
82 discarded[0] = xzalloc (filevec[0].buffered_lines
83 + filevec[1].buffered_lines);
84 discarded[1] = discarded[0] + filevec[0].buffered_lines;
85
86 /* Mark to be discarded each line that matches no line of the other file.
87 If a line matches many lines, mark it as provisionally discardable. */
88
89 for (f = 0; f < 2; f++)
90 {
91 size_t end = filevec[f].buffered_lines;
92 char *discards = discarded[f];
93 lin *counts = equiv_count[1 - f];
94 lin *equivs = filevec[f].equivs;
95 size_t many = 5;
96 size_t tem = end / 64;
97
98 /* Multiply MANY by approximate square root of number of lines.
99 That is the threshold for provisionally discardable lines. */
100 while ((tem = tem >> 2) > 0)
101 many *= 2;
102
103 for (i = 0; i < end; i++)
104 {
105 lin nmatch;
106 if (equivs[i] == 0)
107 continue;
108 nmatch = counts[equivs[i]];
109 if (nmatch == 0)
110 discards[i] = 1;
111 else if (nmatch > many)
112 discards[i] = 2;
113 }
114 }
115
116 /* Don't really discard the provisional lines except when they occur
117 in a run of discardables, with nonprovisionals at the beginning
118 and end. */
119
120 for (f = 0; f < 2; f++)
121 {
122 lin end = filevec[f].buffered_lines;
123 register char *discards = discarded[f];
124
125 for (i = 0; i < end; i++)
126 {
127 /* Cancel provisional discards not in middle of run of discards. */
128 if (discards[i] == 2)
129 discards[i] = 0;
130 else if (discards[i] != 0)
131 {
132 /* We have found a nonprovisional discard. */
133 register lin j;
134 lin length;
135 lin provisional = 0;
136
137 /* Find end of this run of discardable lines.
138 Count how many are provisionally discardable. */
139 for (j = i; j < end; j++)
140 {
141 if (discards[j] == 0)
142 break;
143 if (discards[j] == 2)
144 ++provisional;
145 }
146
147 /* Cancel provisional discards at end, and shrink the run. */
148 while (j > i && discards[j - 1] == 2)
149 discards[--j] = 0, --provisional;
150
151 /* Now we have the length of a run of discardable lines
152 whose first and last are not provisional. */
153 length = j - i;
154
155 /* If 1/4 of the lines in the run are provisional,
156 cancel discarding of all provisional lines in the run. */
157 if (provisional * 4 > length)
158 {
159 while (j > i)
160 if (discards[--j] == 2)
161 discards[j] = 0;
162 }
163 else
164 {
165 register lin consec;
166 lin minimum = 1;
167 lin tem = length >> 2;
168
169 /* MINIMUM is approximate square root of LENGTH/4.
170 A subrun of two or more provisionals can stand
171 when LENGTH is at least 16.
172 A subrun of 4 or more can stand when LENGTH >= 64. */
173 while (0 < (tem >>= 2))
174 minimum <<= 1;
175 minimum++;
176
177 /* Cancel any subrun of MINIMUM or more provisionals
178 within the larger run. */
179 for (j = 0, consec = 0; j < length; j++)
180 if (discards[i + j] != 2)
181 consec = 0;
182 else if (minimum == ++consec)
183 /* Back up to start of subrun, to cancel it all. */
184 j -= consec;
185 else if (minimum < consec)
186 discards[i + j] = 0;
187
188 /* Scan from beginning of run
189 until we find 3 or more nonprovisionals in a row
190 or until the first nonprovisional at least 8 lines in.
191 Until that point, cancel any provisionals. */
192 for (j = 0, consec = 0; j < length; j++)
193 {
194 if (j >= 8 && discards[i + j] == 1)
195 break;
196 if (discards[i + j] == 2)
197 consec = 0, discards[i + j] = 0;
198 else if (discards[i + j] == 0)
199 consec = 0;
200 else
201 consec++;
202 if (consec == 3)
203 break;
204 }
205
206 /* I advances to the last line of the run. */
207 i += length - 1;
208
209 /* Same thing, from end. */
210 for (j = 0, consec = 0; j < length; j++)
211 {
212 if (j >= 8 && discards[i - j] == 1)
213 break;
214 if (discards[i - j] == 2)
215 consec = 0, discards[i - j] = 0;
216 else if (discards[i - j] == 0)
217 consec = 0;
218 else
219 consec++;
220 if (consec == 3)
221 break;
222 }
223 }
224 }
225 }
226 }
227
228 /* Actually discard the lines. */
229 for (f = 0; f < 2; f++)
230 {
231 char *discards = discarded[f];
232 lin end = filevec[f].buffered_lines;
233 lin j = 0;
234 for (i = 0; i < end; ++i)
235 if (minimal || discards[i] == 0)
236 {
237 filevec[f].undiscarded[j] = filevec[f].equivs[i];
238 filevec[f].realindexes[j++] = i;
239 }
240 else
241 filevec[f].changed[i] = 1;
242 filevec[f].nondiscarded_lines = j;
243 }
244
245 free (discarded[0]);
246 free (equiv_count[0]);
247 }
248
249 /* Adjust inserts/deletes of identical lines to join changes
250 as much as possible.
251
252 We do something when a run of changed lines include a
253 line at one end and have an excluded, identical line at the other.
254 We are free to choose which identical line is included.
255 'compareseq' usually chooses the one at the beginning,
256 but usually it is cleaner to consider the following identical line
257 to be the "change". */
258
259 static void
260 shift_boundaries (struct file_data filevec[])
261 {
262 int f;
263
264 for (f = 0; f < 2; f++)
265 {
266 char *changed = filevec[f].changed;
267 char *other_changed = filevec[1 - f].changed;
268 lin const *equivs = filevec[f].equivs;
269 lin i = 0;
270 lin j = 0;
271 lin i_end = filevec[f].buffered_lines;
272
273 while (1)
274 {
275 lin runlength, start, corresponding;
276
277 /* Scan forwards to find beginning of another run of changes.
278 Also keep track of the corresponding point in the other file. */
279
280 while (i < i_end && !changed[i])
281 {
282 while (other_changed[j++])
283 continue;
284 i++;
285 }
286
287 if (i == i_end)
288 break;
289
290 start = i;
291
292 /* Find the end of this run of changes. */
293
294 while (changed[++i])
295 continue;
296 while (other_changed[j])
297 j++;
298
299 do
300 {
301 /* Record the length of this run of changes, so that
302 we can later determine whether the run has grown. */
303 runlength = i - start;
304
305 /* Move the changed region back, so long as the
306 previous unchanged line matches the last changed one.
307 This merges with previous changed regions. */
308
309 while (start && equivs[start - 1] == equivs[i - 1])
310 {
311 changed[--start] = 1;
312 changed[--i] = 0;
313 while (changed[start - 1])
314 start--;
315 while (other_changed[--j])
316 continue;
317 }
318
319 /* Set CORRESPONDING to the end of the changed run, at the last
320 point where it corresponds to a changed run in the other file.
321 CORRESPONDING == I_END means no such point has been found. */
322 corresponding = other_changed[j - 1] ? i : i_end;
323
324 /* Move the changed region forward, so long as the
325 first changed line matches the following unchanged one.
326 This merges with following changed regions.
327 Do this second, so that if there are no merges,
328 the changed region is moved forward as far as possible. */
329
330 while (i != i_end && equivs[start] == equivs[i])
331 {
332 changed[start++] = 0;
333 changed[i++] = 1;
334 while (changed[i])
335 i++;
336 while (other_changed[++j])
337 corresponding = i;
338 }
339 }
340 while (runlength != i - start);
341
342 /* If possible, move the fully-merged run of changes
343 back to a corresponding run in the other file. */
344
345 while (corresponding < i)
346 {
347 changed[--start] = 1;
348 changed[--i] = 0;
349 while (other_changed[--j])
350 continue;
351 }
352 }
353 }
354 }
355
356 /* Cons an additional entry onto the front of an edit script OLD.
357 LINE0 and LINE1 are the first affected lines in the two files (origin 0).
358 DELETED is the number of lines deleted here from file 0.
359 INSERTED is the number of lines inserted here in file 1.
360
361 If DELETED is 0 then LINE0 is the number of the line before
362 which the insertion was done; vice versa for INSERTED and LINE1. */
363
364 static struct change *
365 add_change (lin line0, lin line1, lin deleted, lin inserted,
366 struct change *old)
367 {
368 struct change *new = xmalloc (sizeof *new);
369
370 new->line0 = line0;
371 new->line1 = line1;
372 new->inserted = inserted;
373 new->deleted = deleted;
374 new->link = old;
375 return new;
376 }
377
378 /* Scan the tables of which lines are inserted and deleted,
379 producing an edit script in reverse order. */
380
381 static struct change *
382 build_reverse_script (struct file_data const filevec[])
383 {
384 struct change *script = 0;
385 char *changed0 = filevec[0].changed;
386 char *changed1 = filevec[1].changed;
387 lin len0 = filevec[0].buffered_lines;
388 lin len1 = filevec[1].buffered_lines;
389
390 /* Note that changedN[lenN] does exist, and is 0. */
391
392 lin i0 = 0, i1 = 0;
393
394 while (i0 < len0 || i1 < len1)
395 {
396 if (changed0[i0] | changed1[i1])
397 {
398 lin line0 = i0, line1 = i1;
399
400 /* Find # lines changed here in each file. */
401 while (changed0[i0]) ++i0;
402 while (changed1[i1]) ++i1;
403
404 /* Record this change. */
405 script = add_change (line0, line1, i0 - line0, i1 - line1, script);
406 }
407
408 /* We have reached lines in the two files that match each other. */
409 i0++, i1++;
410 }
411
412 return script;
413 }
414
415 /* Scan the tables of which lines are inserted and deleted,
416 producing an edit script in forward order. */
417
418 static struct change *
419 build_script (struct file_data const filevec[])
420 {
421 struct change *script = 0;
422 char *changed0 = filevec[0].changed;
423 char *changed1 = filevec[1].changed;
424 lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
425
426 /* Note that changedN[-1] does exist, and is 0. */
427
428 while (i0 >= 0 || i1 >= 0)
429 {
430 if (changed0[i0 - 1] | changed1[i1 - 1])
431 {
432 lin line0 = i0, line1 = i1;
433
434 /* Find # lines changed here in each file. */
435 while (changed0[i0 - 1]) --i0;
436 while (changed1[i1 - 1]) --i1;
437
438 /* Record this change. */
439 script = add_change (i0, i1, line0 - i0, line1 - i1, script);
440 }
441
442 /* We have reached lines in the two files that match each other. */
443 i0--, i1--;
444 }
445
446 return script;
447 }
448
449 /* If CHANGES, briefly report that two files differed. */
450 static void
451 briefly_report (int changes, struct file_data const filevec[])
452 {
453 if (changes)
454 message ((brief
455 ? N_("Files %s and %s differ\n")
456 : N_("Binary files %s and %s differ\n")),
457 file_label[0] ? file_label[0] : filevec[0].name,
458 file_label[1] ? file_label[1] : filevec[1].name);
459 }
460
461 /* Report the differences of two files. */
462 int
463 diff_2_files (struct comparison *cmp)
464 {
465 int f;
466 struct change *e, *p;
467 struct change *script;
468 int changes;
469
470
471 /* If we have detected that either file is binary,
472 compare the two files as binary. This can happen
473 only when the first chunk is read.
474 Also, --brief without any --ignore-* options means
475 we can speed things up by treating the files as binary. */
476
477 if (read_files (cmp->file, files_can_be_treated_as_binary))
478 {
479 /* Files with different lengths must be different. */
480 if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
481 && 0 < cmp->file[0].stat.st_size
482 && 0 < cmp->file[1].stat.st_size
483 && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
484 && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
485 changes = 1;
486
487 /* Standard input equals itself. */
488 else if (cmp->file[0].desc == cmp->file[1].desc)
489 changes = 0;
490
491 else
492 /* Scan both files, a buffer at a time, looking for a difference. */
493 {
494 /* Allocate same-sized buffers for both files. */
495 size_t lcm_max = PTRDIFF_MAX - 1;
496 size_t buffer_size =
497 buffer_lcm (sizeof (word),
498 buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
499 STAT_BLOCKSIZE (cmp->file[1].stat),
500 lcm_max),
501 lcm_max);
502 for (f = 0; f < 2; f++)
503 cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
504
505 for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
506 {
507 /* Read a buffer's worth from both files. */
508 for (f = 0; f < 2; f++)
509 if (0 <= cmp->file[f].desc)
510 file_block_read (&cmp->file[f],
511 buffer_size - cmp->file[f].buffered);
512
513 /* If the buffers differ, the files differ. */
514 if (cmp->file[0].buffered != cmp->file[1].buffered
515 || memcmp (cmp->file[0].buffer,
516 cmp->file[1].buffer,
517 cmp->file[0].buffered))
518 {
519 changes = 1;
520 break;
521 }
522
523 /* If we reach end of file, the files are the same. */
524 if (cmp->file[0].buffered != buffer_size)
525 {
526 changes = 0;
527 break;
528 }
529 }
530 }
531
532 briefly_report (changes, cmp->file);
533 }
534 else
535 {
536 struct context ctxt;
537 lin diags;
538 lin too_expensive;
539
540 /* Allocate vectors for the results of comparison:
541 a flag for each line of each file, saying whether that line
542 is an insertion or deletion.
543 Allocate an extra element, always 0, at each end of each vector. */
544
545 size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
546 char *flag_space = xzalloc (s);
547 cmp->file[0].changed = flag_space + 1;
548 cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
549
550 /* Some lines are obviously insertions or deletions
551 because they don't match anything. Detect them now, and
552 avoid even thinking about them in the main comparison algorithm. */
553
554 discard_confusing_lines (cmp->file);
555
556 /* Now do the main comparison algorithm, considering just the
557 undiscarded lines. */
558
559 ctxt.xvec = cmp->file[0].undiscarded;
560 ctxt.yvec = cmp->file[1].undiscarded;
561 diags = (cmp->file[0].nondiscarded_lines
562 + cmp->file[1].nondiscarded_lines + 3);
563 ctxt.fdiag = xmalloc (diags * (2 * sizeof *ctxt.fdiag));
564 ctxt.bdiag = ctxt.fdiag + diags;
565 ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1;
566 ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1;
567
568 ctxt.heuristic = speed_large_files;
569
570 /* Set TOO_EXPENSIVE to be the approximate square root of the
571 input size, bounded below by 4096. 4096 seems to be good for
572 circa-2016 CPUs; see Bug#16848 and Bug#24715. */
573 too_expensive = 1;
574 for (; diags != 0; diags >>= 2)
575 too_expensive <<= 1;
576 ctxt.too_expensive = MAX (4096, too_expensive);
577
578 files[0] = cmp->file[0];
579 files[1] = cmp->file[1];
580
581 compareseq (0, cmp->file[0].nondiscarded_lines,
582 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
583
584 free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
585
586 /* Modify the results slightly to make them prettier
587 in cases where that can validly be done. */
588
589 shift_boundaries (cmp->file);
590
591 /* Get the results of comparison in the form of a chain
592 of 'struct change's -- an edit script. */
593
594 if (output_style == OUTPUT_ED)
595 script = build_reverse_script (cmp->file);
596 else
597 script = build_script (cmp->file);
598
599 /* Set CHANGES if we had any diffs.
600 If some changes are ignored, we must scan the script to decide. */
601 if (ignore_blank_lines || ignore_regexp.fastmap)
602 {
603 struct change *next = script;
604 changes = 0;
605
606 while (next && changes == 0)
607 {
608 struct change *this, *end;
609 lin first0, last0, first1, last1;
610
611 /* Find a set of changes that belong together. */
612 this = next;
613 end = find_change (next);
614
615 /* Disconnect them from the rest of the changes, making them
616 a hunk, and remember the rest for next iteration. */
617 next = end->link;
618 end->link = 0;
619
620 /* Determine whether this hunk is really a difference. */
621 if (analyze_hunk (this, &first0, &last0, &first1, &last1))
622 changes = 1;
623
624 /* Reconnect the script so it will all be freed properly. */
625 end->link = next;
626 }
627 }
628 else
629 changes = (script != 0);
630
631 if (brief)
632 briefly_report (changes, cmp->file);
633 else
634 {
635 if (changes || !no_diff_means_no_output)
636 {
637 /* Record info for starting up output,
638 to be used if and when we have some output to print. */
639 setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
640 file_label[1] ? file_label[1] : cmp->file[1].name,
641 cmp->parent != 0);
642
643 switch (output_style)
644 {
645 case OUTPUT_CONTEXT:
646 print_context_script (script, false);
647 break;
648
649 case OUTPUT_UNIFIED:
650 print_context_script (script, true);
651 break;
652
653 case OUTPUT_ED:
654 print_ed_script (script);
655 break;
656
657 case OUTPUT_FORWARD_ED:
658 pr_forward_ed_script (script);
659 break;
660
661 case OUTPUT_RCS:
662 print_rcs_script (script);
663 break;
664
665 case OUTPUT_NORMAL:
666 print_normal_script (script);
667 break;
668
669 case OUTPUT_IFDEF:
670 print_ifdef_script (script);
671 break;
672
673 case OUTPUT_SDIFF:
674 print_sdiff_script (script);
675 break;
676
677 default:
678 abort ();
679 }
680
681 finish_output ();
682 }
683 }
684
685 free (cmp->file[0].undiscarded);
686
687 free (flag_space);
688
689 for (f = 0; f < 2; f++)
690 {
691 free (cmp->file[f].equivs);
692 free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
693 }
694
695 for (e = script; e; e = p)
696 {
697 p = e->link;
698 free (e);
699 }
700
701 if (! robust_output_style (output_style))
702 for (f = 0; f < 2; ++f)
703 if (cmp->file[f].missing_newline)
704 {
705 error (0, 0, "%s: %s\n",
706 file_label[f] ? file_label[f] : cmp->file[f].name,
707 _("No newline at end of file"));
708 changes = 2;
709 }
710 }
711
712 if (cmp->file[0].buffer != cmp->file[1].buffer)
713 free (cmp->file[0].buffer);
714 free (cmp->file[1].buffer);
715
716 return changes;
717 }