1 /* Analyze differences between two vectors.
2
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2023 Free Software
4 Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
27
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30 Algorithmica Vol. 1, 1986, pp. 251-266,
31 <https://doi.org/10.1007/BF01840446>.
32 See especially section 4.2, which describes the variation used below.
33
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", Esko Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118,
37 <https://doi.org/10.1016/S0019-9958(85)80046-2>.
38
39 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
40 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
41 at the price of producing suboptimal output for large inputs with
42 many differences. */
43
44 /* Before including this file, you need to define:
45 ELEMENT The element type of the vectors being compared.
46 EQUAL A two-argument macro that tests two elements for
47 equality.
48 OFFSET A signed integer type sufficient to hold the
49 difference between two indices. Usually
50 something like ptrdiff_t.
51 OFFSET_MAX (Optional) The maximum value of OFFSET (e.g.,
52 PTRDIFF_MAX). If omitted, it is inferred in a
53 way portable to the vast majority of C platforms,
54 as they lack padding bits.
55 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
56 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
57 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
58 NOTE_ORDERED (Optional) A boolean expression saying that
59 NOTE_DELETE and NOTE_INSERT calls must be
60 issued in offset order.
61 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
62 early abort of the computation.
63 USE_HEURISTIC (Optional) Define if you want to support the
64 heuristic for large vectors.
65
66 It is also possible to use this file with abstract arrays. In this case,
67 xvec and yvec are not represented in memory. They only exist conceptually.
68 In this case, the list of defines above is amended as follows:
69 ELEMENT Undefined.
70 EQUAL Undefined.
71 XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
72 A three-argument macro: References xvec[xoff] and
73 yvec[yoff] and tests these elements for equality.
74
75 Before including this file, you also need to include:
76 #include <limits.h>
77 #include "minmax.h"
78 */
79
80 /* Maximum value of type OFFSET. */
81 #ifndef OFFSET_MAX
82 # define OFFSET_MAX \
83 ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
84 #endif
85
86 /* Default to no early abort. */
87 #ifndef EARLY_ABORT
88 # define EARLY_ABORT(ctxt) false
89 #endif
90
91 #ifndef NOTE_ORDERED
92 # define NOTE_ORDERED false
93 #endif
94
95 /* Use this to suppress gcc's "...may be used before initialized" warnings.
96 Beware: The Code argument must not contain commas. */
97 #ifndef IF_LINT
98 # if defined GCC_LINT || defined lint
99 # define IF_LINT(Code) Code
100 # else
101 # define IF_LINT(Code) /* empty */
102 # endif
103 #endif
104
105 /*
106 * Context of comparison operation.
107 */
108 struct context
109 {
110 #ifdef ELEMENT
111 /* Vectors being compared. */
112 ELEMENT const *xvec;
113 ELEMENT const *yvec;
114 #endif
115
116 /* Extra fields. */
117 EXTRA_CONTEXT_FIELDS
118
119 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
120 furthest along the given diagonal in the forward search of the edit
121 matrix. */
122 OFFSET *fdiag;
123
124 /* Vector, indexed by diagonal, containing the X coordinate of the point
125 furthest along the given diagonal in the backward search of the edit
126 matrix. */
127 OFFSET *bdiag;
128
129 #ifdef USE_HEURISTIC
130 /* This corresponds to the diff --speed-large-files flag. With this
131 heuristic, for vectors with a constant small density of changes,
132 the algorithm is linear in the vector size. */
133 bool heuristic;
134 #endif
135
136 /* Edit scripts longer than this are too expensive to compute. */
137 OFFSET too_expensive;
138
139 /* Snakes bigger than this are considered "big". */
140 #define SNAKE_LIMIT 20
141 };
142
143 struct partition
144 {
145 /* Midpoints of this partition. */
146 OFFSET xmid;
147 OFFSET ymid;
148
149 /* True if low half will be analyzed minimally. */
150 bool lo_minimal;
151
152 /* Likewise for high half. */
153 bool hi_minimal;
154 };
155
156
157 /* Find the midpoint of the shortest edit script for a specified portion
158 of the two vectors.
159
160 Scan from the beginnings of the vectors, and simultaneously from the ends,
161 doing a breadth-first search through the space of edit-sequence.
162 When the two searches meet, we have found the midpoint of the shortest
163 edit sequence.
164
165 If FIND_MINIMAL is true, find the minimal edit script regardless of
166 expense. Otherwise, if the search is too expensive, use heuristics to
167 stop the search and report a suboptimal answer.
168
169 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
170 XMID - YMID equals the number of inserted elements minus the number
171 of deleted elements (counting only elements before the midpoint).
172
173 Set PART->lo_minimal to true iff the minimal edit script for the
174 left half of the partition is known; similarly for PART->hi_minimal.
175
176 This function assumes that the first elements of the specified portions
177 of the two vectors do not match, and likewise that the last elements do not
178 match. The caller must trim matching elements from the beginning and end
179 of the portions it is going to specify.
180
181 If we return the "wrong" partitions, the worst this can do is cause
182 suboptimal diff output. It cannot cause incorrect diff output. */
183
184 static void
185 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
186 struct partition *part, struct context *ctxt)
187 {
188 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
189 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
190 #ifdef ELEMENT
191 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
192 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
193 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
194 #else
195 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
196 #endif
197 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
198 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
199 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
200 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
201 OFFSET fmin = fmid;
202 OFFSET fmax = fmid; /* Limits of top-down search. */
203 OFFSET bmin = bmid;
204 OFFSET bmax = bmid; /* Limits of bottom-up search. */
205 OFFSET c; /* Cost. */
206 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
207 diagonal with respect to the northwest. */
208
209 fd[fmid] = xoff;
210 bd[bmid] = xlim;
211
212 for (c = 1;; ++c)
213 {
214 OFFSET d; /* Active diagonal. */
215 bool big_snake = false;
216
217 /* Extend the top-down search by an edit step in each diagonal. */
218 if (fmin > dmin)
219 fd[--fmin - 1] = -1;
220 else
221 ++fmin;
222 if (fmax < dmax)
223 fd[++fmax + 1] = -1;
224 else
225 --fmax;
226 for (d = fmax; d >= fmin; d -= 2)
227 {
228 OFFSET x;
229 OFFSET y;
230 OFFSET tlo = fd[d - 1];
231 OFFSET thi = fd[d + 1];
232 OFFSET x0 = tlo < thi ? thi : tlo + 1;
233
234 for (x = x0, y = x0 - d;
235 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
236 x++, y++)
237 continue;
238 if (x - x0 > SNAKE_LIMIT)
239 big_snake = true;
240 fd[d] = x;
241 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
242 {
243 part->xmid = x;
244 part->ymid = y;
245 part->lo_minimal = part->hi_minimal = true;
246 return;
247 }
248 }
249
250 /* Similarly extend the bottom-up search. */
251 if (bmin > dmin)
252 bd[--bmin - 1] = OFFSET_MAX;
253 else
254 ++bmin;
255 if (bmax < dmax)
256 bd[++bmax + 1] = OFFSET_MAX;
257 else
258 --bmax;
259 for (d = bmax; d >= bmin; d -= 2)
260 {
261 OFFSET x;
262 OFFSET y;
263 OFFSET tlo = bd[d - 1];
264 OFFSET thi = bd[d + 1];
265 OFFSET x0 = tlo < thi ? tlo : thi - 1;
266
267 for (x = x0, y = x0 - d;
268 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
269 x--, y--)
270 continue;
271 if (x0 - x > SNAKE_LIMIT)
272 big_snake = true;
273 bd[d] = x;
274 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
275 {
276 part->xmid = x;
277 part->ymid = y;
278 part->lo_minimal = part->hi_minimal = true;
279 return;
280 }
281 }
282
283 if (find_minimal)
284 continue;
285
286 #ifdef USE_HEURISTIC
287 bool heuristic = ctxt->heuristic;
288 #else
289 bool heuristic = false;
290 #endif
291
292 /* Heuristic: check occasionally for a diagonal that has made lots
293 of progress compared with the edit distance. If we have any
294 such, find the one that has made the most progress and return it
295 as if it had succeeded.
296
297 With this heuristic, for vectors with a constant small density
298 of changes, the algorithm is linear in the vector size. */
299
300 if (200 < c && big_snake && heuristic)
301 {
302 {
303 OFFSET best = 0;
304
305 for (d = fmax; d >= fmin; d -= 2)
306 {
307 OFFSET dd = d - fmid;
308 OFFSET x = fd[d];
309 OFFSET y = x - d;
310 OFFSET v = (x - xoff) * 2 - dd;
311
312 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
313 {
314 if (v > best
315 && xoff + SNAKE_LIMIT <= x && x < xlim
316 && yoff + SNAKE_LIMIT <= y && y < ylim)
317 {
318 /* We have a good enough best diagonal; now insist
319 that it end with a significant snake. */
320 int k;
321
322 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
323 if (k == SNAKE_LIMIT)
324 {
325 best = v;
326 part->xmid = x;
327 part->ymid = y;
328 break;
329 }
330 }
331 }
332 }
333 if (best > 0)
334 {
335 part->lo_minimal = true;
336 part->hi_minimal = false;
337 return;
338 }
339 }
340
341 {
342 OFFSET best = 0;
343
344 for (d = bmax; d >= bmin; d -= 2)
345 {
346 OFFSET dd = d - bmid;
347 OFFSET x = bd[d];
348 OFFSET y = x - d;
349 OFFSET v = (xlim - x) * 2 + dd;
350
351 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
352 {
353 if (v > best
354 && xoff < x && x <= xlim - SNAKE_LIMIT
355 && yoff < y && y <= ylim - SNAKE_LIMIT)
356 {
357 /* We have a good enough best diagonal; now insist
358 that it end with a significant snake. */
359 int k;
360
361 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
362 if (k == SNAKE_LIMIT - 1)
363 {
364 best = v;
365 part->xmid = x;
366 part->ymid = y;
367 break;
368 }
369 }
370 }
371 }
372 if (best > 0)
373 {
374 part->lo_minimal = false;
375 part->hi_minimal = true;
376 return;
377 }
378 }
379 }
380
381 /* Heuristic: if we've gone well beyond the call of duty, give up
382 and report halfway between our best results so far. */
383 if (c >= ctxt->too_expensive)
384 {
385 OFFSET fxybest;
386 OFFSET fxbest IF_LINT (= 0);
387 OFFSET bxybest;
388 OFFSET bxbest IF_LINT (= 0);
389
390 /* Find forward diagonal that maximizes X + Y. */
391 fxybest = -1;
392 for (d = fmax; d >= fmin; d -= 2)
393 {
394 OFFSET x = MIN (fd[d], xlim);
395 OFFSET y = x - d;
396 if (ylim < y)
397 {
398 x = ylim + d;
399 y = ylim;
400 }
401 if (fxybest < x + y)
402 {
403 fxybest = x + y;
404 fxbest = x;
405 }
406 }
407
408 /* Find backward diagonal that minimizes X + Y. */
409 bxybest = OFFSET_MAX;
410 for (d = bmax; d >= bmin; d -= 2)
411 {
412 OFFSET x = MAX (xoff, bd[d]);
413 OFFSET y = x - d;
414 if (y < yoff)
415 {
416 x = yoff + d;
417 y = yoff;
418 }
419 if (x + y < bxybest)
420 {
421 bxybest = x + y;
422 bxbest = x;
423 }
424 }
425
426 /* Use the better of the two diagonals. */
427 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
428 {
429 part->xmid = fxbest;
430 part->ymid = fxybest - fxbest;
431 part->lo_minimal = true;
432 part->hi_minimal = false;
433 }
434 else
435 {
436 part->xmid = bxbest;
437 part->ymid = bxybest - bxbest;
438 part->lo_minimal = false;
439 part->hi_minimal = true;
440 }
441 return;
442 }
443 }
444 #undef XREF_YREF_EQUAL
445 }
446
447
448 /* Compare in detail contiguous subsequences of the two vectors
449 which are known, as a whole, to match each other.
450
451 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
452
453 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
454 are origin-0.
455
456 If FIND_MINIMAL, find a minimal difference no matter how
457 expensive it is.
458
459 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
460
461 Return false if terminated normally, or true if terminated through early
462 abort. */
463
464 static bool
465 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
466 bool find_minimal, struct context *ctxt)
467 {
468 #ifdef ELEMENT
469 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
470 ELEMENT const *yv = ctxt->yvec;
471 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
472 #else
473 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
474 #endif
475
476 while (true)
477 {
478 /* Slide down the bottom initial diagonal. */
479 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
480 {
481 xoff++;
482 yoff++;
483 }
484
485 /* Slide up the top initial diagonal. */
486 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
487 {
488 xlim--;
489 ylim--;
490 }
491
492 /* Handle simple cases. */
493 if (xoff == xlim)
494 {
495 while (yoff < ylim)
496 {
497 NOTE_INSERT (ctxt, yoff);
498 if (EARLY_ABORT (ctxt))
499 return true;
500 yoff++;
501 }
502 break;
503 }
504 if (yoff == ylim)
505 {
506 while (xoff < xlim)
507 {
508 NOTE_DELETE (ctxt, xoff);
509 if (EARLY_ABORT (ctxt))
510 return true;
511 xoff++;
512 }
513 break;
514 }
515
516 struct partition part;
517
518 /* Find a point of correspondence in the middle of the vectors. */
519 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
520
521 /* Use the partitions to split this problem into subproblems. */
522 OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
523 bool find_minimal1, find_minimal2;
524 if (!NOTE_ORDERED
525 && ((xlim + ylim) - (part.xmid + part.ymid)
526 < (part.xmid + part.ymid) - (xoff + yoff)))
527 {
528 /* The second problem is smaller and the caller doesn't
529 care about order, so do the second problem first to
530 lessen recursion. */
531 xoff1 = part.xmid; xlim1 = xlim;
532 yoff1 = part.ymid; ylim1 = ylim;
533 find_minimal1 = part.hi_minimal;
534
535 xoff2 = xoff; xlim2 = part.xmid;
536 yoff2 = yoff; ylim2 = part.ymid;
537 find_minimal2 = part.lo_minimal;
538 }
539 else
540 {
541 xoff1 = xoff; xlim1 = part.xmid;
542 yoff1 = yoff; ylim1 = part.ymid;
543 find_minimal1 = part.lo_minimal;
544
545 xoff2 = part.xmid; xlim2 = xlim;
546 yoff2 = part.ymid; ylim2 = ylim;
547 find_minimal2 = part.hi_minimal;
548 }
549
550 /* Recurse to do one subproblem. */
551 bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
552 if (early)
553 return early;
554
555 /* Iterate to do the other subproblem. */
556 xoff = xoff2; xlim = xlim2;
557 yoff = yoff2; ylim = ylim2;
558 find_minimal = find_minimal2;
559 }
560
561 return false;
562 #undef XREF_YREF_EQUAL
563 }
564
565 #undef ELEMENT
566 #undef EQUAL
567 #undef OFFSET
568 #undef EXTRA_CONTEXT_FIELDS
569 #undef NOTE_DELETE
570 #undef NOTE_INSERT
571 #undef EARLY_ABORT
572 #undef USE_HEURISTIC
573 #undef XVECREF_YVECREF_EQUAL
574 #undef OFFSET_MAX