1 /* Analyze differences between two vectors.
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2016 Free Software
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30 Algorithmica Vol. 1, 1986, pp. 251-266,
31 <http://dx.doi.org/10.1007/BF01840446>.
32 See especially section 4.2, which describes the variation used below.
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", Esko Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118,
37 <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>. */
39 /* Before including this file, you need to define:
40 ELEMENT The element type of the vectors being compared.
41 EQUAL A two-argument macro that tests two elements for equality.
43 OFFSET A signed integer type sufficient to hold the
44 difference between two indices. Usually
45 something like ptrdiff_t.
46 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
47 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
48 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
49 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
50 early abort of the computation.
51 USE_HEURISTIC (Optional) Define if you want to support the
52 heuristic for large vectors.
53 It is also possible to use this file with abstract arrays. In this case,
54 xvec and yvec are not represented in memory. They only exist conceptually.
55 In this case, the list of defines above is amended as follows:
58 XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
59 A three-argument macro: References xvec[xoff] and
60 yvec[yoff] and tests these elements for equality.
61 Before including this file, you also need to include <limits.h> (for CHAR_BIT) and <stdbool.h> (for bool). */
67 /* Maximum value of type OFFSET. */
69 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
71 /* Default to no early abort. */
73 # define EARLY_ABORT(ctxt) false
76 /* Use this to suppress gcc's "...may be used before initialized" warnings.
77 Beware: The Code argument must not contain commas. */
79 # if defined GCC_LINT || defined lint
80 # define IF_LINT(Code) Code
82 # define IF_LINT(Code) /* empty */
86 /* As above, but when Code must contain one comma. */
88 # if defined GCC_LINT || defined lint
89 # define IF_LINT2(Code1, Code2) Code1, Code2
91 # define IF_LINT2(Code1, Code2) /* empty */
96 * Context of comparison operation.
101 /* Vectors being compared. */
109 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
110 furthest along the given diagonal in the forward search of the edit matrix. */
114 /* Vector, indexed by diagonal, containing the X coordinate of the point
115 furthest along the given diagonal in the backward search of the edit matrix. */
120 /* This corresponds to the diff --speed-large-files flag. With this
121 heuristic, for vectors with a constant small density of changes,
122 the algorithm is linear in the vector size. */
126 /* Snakes bigger than this are considered "big". */
127 #define SNAKE_LIMIT 20
132 /* Midpoints of this partition. */
138 /* Find the midpoint of the shortest edit script for a specified portion of the two vectors.
141 Scan from the beginnings of the vectors, and simultaneously from the ends,
142 doing a breadth-first search through the space of edit-sequence.
143 When the two searches meet, we have found the midpoint of the shortest edit sequence.
146 Set *PART to the midpoint (XMID,YMID). The diagonal number
147 XMID - YMID equals the number of inserted elements minus the number
148 of deleted elements (counting only elements before the midpoint).
150 This function assumes that the first elements of the specified portions
151 of the two vectors do not match, and likewise that the last elements do not
152 match. The caller must trim matching elements from the beginning and end
153 of the portions it is going to specify.
155 If we return the "wrong" partitions, the worst this can do is cause
156 suboptimal diff output. It cannot cause incorrect diff output. */
159 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
160 struct partition *part, struct context *ctxt)
162 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
163 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
165 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
166 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
167 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
169 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
171 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
172 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
173 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
174 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
176 OFFSET fmax = fmid; /* Limits of top-down search. */
178 OFFSET bmax = bmid; /* Limits of bottom-up search. */
179 OFFSET c; /* Cost. */
180 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
181 diagonal with respect to the northwest. */
188 OFFSET d; /* Active diagonal. */
189 bool big_snake = false;
191 /* Extend the top-down search by an edit step in each diagonal. */
200 for (d = fmax; d >= fmin; d -= 2)
204 OFFSET tlo = fd[d - 1];
205 OFFSET thi = fd[d + 1];
206 OFFSET x0 = tlo < thi ? thi : tlo + 1;
208 for (x = x0, y = x0 - d;
209 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
212 if (x - x0 > SNAKE_LIMIT)
215 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
223 /* Similarly extend the bottom-up search. */
225 bd[--bmin - 1] = OFFSET_MAX;
229 bd[++bmax + 1] = OFFSET_MAX;
232 for (d = bmax; d >= bmin; d -= 2)
236 OFFSET tlo = bd[d - 1];
237 OFFSET thi = bd[d + 1];
238 OFFSET x0 = tlo < thi ? tlo : thi - 1;
240 for (x = x0, y = x0 - d;
241 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
244 if (x0 - x > SNAKE_LIMIT)
247 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
256 /* Heuristic: check occasionally for a diagonal that has made lots
257 of progress compared with the edit distance. If we have any
258 such, find the one that has made the most progress and return it
259 as if it had succeeded.
261 With this heuristic, for vectors with a constant small density
262 of changes, the algorithm is linear in the vector size. */
264 if (200 < c && big_snake && ctxt->heuristic)
269 for (d = fmax; d >= fmin; d -= 2)
271 OFFSET dd = d - fmid;
274 OFFSET v = (x - xoff) * 2 - dd;
276 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
279 && xoff + SNAKE_LIMIT <= x && x < xlim
280 && yoff + SNAKE_LIMIT <= y && y < ylim)
282 /* We have a good enough best diagonal; now insist
283 that it end with a significant snake. */
286 for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
287 if (k == SNAKE_LIMIT)
304 for (d = bmax; d >= bmin; d -= 2)
306 OFFSET dd = d - bmid;
309 OFFSET v = (xlim - x) * 2 + dd;
311 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
314 && xoff < x && x <= xlim - SNAKE_LIMIT
315 && yoff < y && y <= ylim - SNAKE_LIMIT)
317 /* We have a good enough best diagonal; now insist
318 that it end with a significant snake. */
321 for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
322 if (k == SNAKE_LIMIT - 1)
336 #endif /* USE_HEURISTIC */
338 #undef XREF_YREF_EQUAL
342 /* Compare in detail contiguous subsequences of the two vectors
343 which are known, as a whole, to match each other.
345 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
347 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors are origin-0.
350 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
352 Return false if terminated normally, or true if terminated through early abort. */
356 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
357 struct context *ctxt)
360 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
361 ELEMENT const *yv = ctxt->yvec;
362 #define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
364 #define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
367 /* Slide down the bottom initial diagonal. */
368 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
374 /* Slide up the top initial diagonal. */
375 while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
381 /* Handle simple cases. */
385 NOTE_INSERT (ctxt, yoff);
386 if (EARLY_ABORT (ctxt))
390 else if (yoff == ylim)
393 NOTE_DELETE (ctxt, xoff);
394 if (EARLY_ABORT (ctxt))
400 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
402 /* Find a point of correspondence in the middle of the vectors. */
403 diag (xoff, xlim, yoff, ylim, &part, ctxt);
405 /* Use the partitions to split this problem into subproblems. */
406 if (compareseq (xoff, part.xmid, yoff, part.ymid, ctxt))
408 if (compareseq (part.xmid, xlim, part.ymid, ylim, ctxt))
413 #undef XREF_YREF_EQUAL
419 #undef EXTRA_CONTEXT_FIELDS
424 #undef XVECREF_YVECREF_EQUAL