(root)/
bison-3.8.2/
lib/
diffseq.h
       1  /* Analyze differences between two vectors.
       2  
       3     Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2021 Free Software
       4     Foundation, Inc.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  
      20  /* The basic idea is to consider two vectors as similar if, when
      21     transforming the first vector into the second vector through a
      22     sequence of edits (inserts and deletes of one element each),
      23     this sequence is short - or equivalently, if the ordered list
      24     of elements that are untouched by these edits is long.  For a
      25     good introduction to the subject, read about the "Levenshtein
      26     distance" in Wikipedia.
      27  
      28     The basic algorithm is described in:
      29     "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
      30     Algorithmica Vol. 1, 1986, pp. 251-266,
      31     <https://doi.org/10.1007/BF01840446>.
      32     See especially section 4.2, which describes the variation used below.
      33  
      34     The basic algorithm was independently discovered as described in:
      35     "Algorithms for Approximate String Matching", Esko Ukkonen,
      36     Information and Control Vol. 64, 1985, pp. 100-118,
      37     <https://doi.org/10.1016/S0019-9958(85)80046-2>.
      38  
      39     Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
      40     heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
      41     at the price of producing suboptimal output for large inputs with
      42     many differences.  */
      43  
      44  /* Before including this file, you need to define:
      45       ELEMENT                 The element type of the vectors being compared.
      46       EQUAL                   A two-argument macro that tests two elements for
      47                               equality.
      48       OFFSET                  A signed integer type sufficient to hold the
      49                               difference between two indices.  Usually
      50                               something like ptrdiff_t.
      51       EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
      52       NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
      53       NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
      54       NOTE_ORDERED            (Optional) A boolean expression saying that
      55                               NOTE_DELETE and NOTE_INSERT calls must be
      56                               issued in offset order.
      57       EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
      58                               early abort of the computation.
      59       USE_HEURISTIC           (Optional) Define if you want to support the
      60                               heuristic for large vectors.
      61  
      62     It is also possible to use this file with abstract arrays.  In this case,
      63     xvec and yvec are not represented in memory.  They only exist conceptually.
      64     In this case, the list of defines above is amended as follows:
      65       ELEMENT                 Undefined.
      66       EQUAL                   Undefined.
      67       XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
      68                               A three-argument macro: References xvec[xoff] and
      69                               yvec[yoff] and tests these elements for equality.
      70  
      71     Before including this file, you also need to include:
      72       #include <limits.h>
      73       #include <stdbool.h>
      74       #include "minmax.h"
      75   */
      76  
      77  /* Maximum value of type OFFSET.  */
      78  #define OFFSET_MAX \
      79    ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
      80  
      81  /* Default to no early abort.  */
      82  #ifndef EARLY_ABORT
      83  # define EARLY_ABORT(ctxt) false
      84  #endif
      85  
      86  #ifndef NOTE_ORDERED
      87  # define NOTE_ORDERED false
      88  #endif
      89  
      90  /* Use this to suppress gcc's "...may be used before initialized" warnings.
      91     Beware: The Code argument must not contain commas.  */
      92  #ifndef IF_LINT
      93  # if defined GCC_LINT || defined lint
      94  #  define IF_LINT(Code) Code
      95  # else
      96  #  define IF_LINT(Code) /* empty */
      97  # endif
      98  #endif
      99  
     100  /*
     101   * Context of comparison operation.
     102   */
     103  struct context
     104  {
     105    #ifdef ELEMENT
     106    /* Vectors being compared.  */
     107    ELEMENT const *xvec;
     108    ELEMENT const *yvec;
     109    #endif
     110  
     111    /* Extra fields.  */
     112    EXTRA_CONTEXT_FIELDS
     113  
     114    /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
     115       furthest along the given diagonal in the forward search of the edit
     116       matrix.  */
     117    OFFSET *fdiag;
     118  
     119    /* Vector, indexed by diagonal, containing the X coordinate of the point
     120       furthest along the given diagonal in the backward search of the edit
     121       matrix.  */
     122    OFFSET *bdiag;
     123  
     124    #ifdef USE_HEURISTIC
     125    /* This corresponds to the diff --speed-large-files flag.  With this
     126       heuristic, for vectors with a constant small density of changes,
     127       the algorithm is linear in the vector size.  */
     128    bool heuristic;
     129    #endif
     130  
     131    /* Edit scripts longer than this are too expensive to compute.  */
     132    OFFSET too_expensive;
     133  
     134    /* Snakes bigger than this are considered "big".  */
     135    #define SNAKE_LIMIT 20
     136  };
     137  
     138  struct partition
     139  {
     140    /* Midpoints of this partition.  */
     141    OFFSET xmid;
     142    OFFSET ymid;
     143  
     144    /* True if low half will be analyzed minimally.  */
     145    bool lo_minimal;
     146  
     147    /* Likewise for high half.  */
     148    bool hi_minimal;
     149  };
     150  
     151  
     152  /* Find the midpoint of the shortest edit script for a specified portion
     153     of the two vectors.
     154  
     155     Scan from the beginnings of the vectors, and simultaneously from the ends,
     156     doing a breadth-first search through the space of edit-sequence.
     157     When the two searches meet, we have found the midpoint of the shortest
     158     edit sequence.
     159  
     160     If FIND_MINIMAL is true, find the minimal edit script regardless of
     161     expense.  Otherwise, if the search is too expensive, use heuristics to
     162     stop the search and report a suboptimal answer.
     163  
     164     Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
     165     XMID - YMID equals the number of inserted elements minus the number
     166     of deleted elements (counting only elements before the midpoint).
     167  
     168     Set PART->lo_minimal to true iff the minimal edit script for the
     169     left half of the partition is known; similarly for PART->hi_minimal.
     170  
     171     This function assumes that the first elements of the specified portions
     172     of the two vectors do not match, and likewise that the last elements do not
     173     match.  The caller must trim matching elements from the beginning and end
     174     of the portions it is going to specify.
     175  
     176     If we return the "wrong" partitions, the worst this can do is cause
     177     suboptimal diff output.  It cannot cause incorrect diff output.  */
     178  
     179  static void
     180  diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
     181        struct partition *part, struct context *ctxt)
     182  {
     183    OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
     184    OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
     185  #ifdef ELEMENT
     186    ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
     187    ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
     188    #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
     189  #else
     190    #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
     191  #endif
     192    const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
     193    const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
     194    const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
     195    const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
     196    OFFSET fmin = fmid;
     197    OFFSET fmax = fmid;           /* Limits of top-down search. */
     198    OFFSET bmin = bmid;
     199    OFFSET bmax = bmid;           /* Limits of bottom-up search. */
     200    OFFSET c;                     /* Cost. */
     201    bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
     202                                     diagonal with respect to the northwest. */
     203  
     204    fd[fmid] = xoff;
     205    bd[bmid] = xlim;
     206  
     207    for (c = 1;; ++c)
     208      {
     209        OFFSET d;                 /* Active diagonal. */
     210        bool big_snake = false;
     211  
     212        /* Extend the top-down search by an edit step in each diagonal. */
     213        if (fmin > dmin)
     214          fd[--fmin - 1] = -1;
     215        else
     216          ++fmin;
     217        if (fmax < dmax)
     218          fd[++fmax + 1] = -1;
     219        else
     220          --fmax;
     221        for (d = fmax; d >= fmin; d -= 2)
     222          {
     223            OFFSET x;
     224            OFFSET y;
     225            OFFSET tlo = fd[d - 1];
     226            OFFSET thi = fd[d + 1];
     227            OFFSET x0 = tlo < thi ? thi : tlo + 1;
     228  
     229            for (x = x0, y = x0 - d;
     230                 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
     231                 x++, y++)
     232              continue;
     233            if (x - x0 > SNAKE_LIMIT)
     234              big_snake = true;
     235            fd[d] = x;
     236            if (odd && bmin <= d && d <= bmax && bd[d] <= x)
     237              {
     238                part->xmid = x;
     239                part->ymid = y;
     240                part->lo_minimal = part->hi_minimal = true;
     241                return;
     242              }
     243          }
     244  
     245        /* Similarly extend the bottom-up search.  */
     246        if (bmin > dmin)
     247          bd[--bmin - 1] = OFFSET_MAX;
     248        else
     249          ++bmin;
     250        if (bmax < dmax)
     251          bd[++bmax + 1] = OFFSET_MAX;
     252        else
     253          --bmax;
     254        for (d = bmax; d >= bmin; d -= 2)
     255          {
     256            OFFSET x;
     257            OFFSET y;
     258            OFFSET tlo = bd[d - 1];
     259            OFFSET thi = bd[d + 1];
     260            OFFSET x0 = tlo < thi ? tlo : thi - 1;
     261  
     262            for (x = x0, y = x0 - d;
     263                 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
     264                 x--, y--)
     265              continue;
     266            if (x0 - x > SNAKE_LIMIT)
     267              big_snake = true;
     268            bd[d] = x;
     269            if (!odd && fmin <= d && d <= fmax && x <= fd[d])
     270              {
     271                part->xmid = x;
     272                part->ymid = y;
     273                part->lo_minimal = part->hi_minimal = true;
     274                return;
     275              }
     276          }
     277  
     278        if (find_minimal)
     279          continue;
     280  
     281  #ifdef USE_HEURISTIC
     282        bool heuristic = ctxt->heuristic;
     283  #else
     284        bool heuristic = false;
     285  #endif
     286  
     287        /* Heuristic: check occasionally for a diagonal that has made lots
     288           of progress compared with the edit distance.  If we have any
     289           such, find the one that has made the most progress and return it
     290           as if it had succeeded.
     291  
     292           With this heuristic, for vectors with a constant small density
     293           of changes, the algorithm is linear in the vector size.  */
     294  
     295        if (200 < c && big_snake && heuristic)
     296          {
     297            {
     298              OFFSET best = 0;
     299  
     300              for (d = fmax; d >= fmin; d -= 2)
     301                {
     302                  OFFSET dd = d - fmid;
     303                  OFFSET x = fd[d];
     304                  OFFSET y = x - d;
     305                  OFFSET v = (x - xoff) * 2 - dd;
     306  
     307                  if (v > 12 * (c + (dd < 0 ? -dd : dd)))
     308                    {
     309                      if (v > best
     310                          && xoff + SNAKE_LIMIT <= x && x < xlim
     311                          && yoff + SNAKE_LIMIT <= y && y < ylim)
     312                        {
     313                          /* We have a good enough best diagonal; now insist
     314                             that it end with a significant snake.  */
     315                          int k;
     316  
     317                          for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
     318                            if (k == SNAKE_LIMIT)
     319                              {
     320                                best = v;
     321                                part->xmid = x;
     322                                part->ymid = y;
     323                                break;
     324                              }
     325                        }
     326                    }
     327                }
     328              if (best > 0)
     329                {
     330                  part->lo_minimal = true;
     331                  part->hi_minimal = false;
     332                  return;
     333                }
     334            }
     335  
     336            {
     337              OFFSET best = 0;
     338  
     339              for (d = bmax; d >= bmin; d -= 2)
     340                {
     341                  OFFSET dd = d - bmid;
     342                  OFFSET x = bd[d];
     343                  OFFSET y = x - d;
     344                  OFFSET v = (xlim - x) * 2 + dd;
     345  
     346                  if (v > 12 * (c + (dd < 0 ? -dd : dd)))
     347                    {
     348                      if (v > best
     349                          && xoff < x && x <= xlim - SNAKE_LIMIT
     350                          && yoff < y && y <= ylim - SNAKE_LIMIT)
     351                        {
     352                          /* We have a good enough best diagonal; now insist
     353                             that it end with a significant snake.  */
     354                          int k;
     355  
     356                          for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
     357                            if (k == SNAKE_LIMIT - 1)
     358                              {
     359                                best = v;
     360                                part->xmid = x;
     361                                part->ymid = y;
     362                                break;
     363                              }
     364                        }
     365                    }
     366                }
     367              if (best > 0)
     368                {
     369                  part->lo_minimal = false;
     370                  part->hi_minimal = true;
     371                  return;
     372                }
     373            }
     374          }
     375  
     376        /* Heuristic: if we've gone well beyond the call of duty, give up
     377           and report halfway between our best results so far.  */
     378        if (c >= ctxt->too_expensive)
     379          {
     380            OFFSET fxybest;
     381            OFFSET fxbest IF_LINT (= 0);
     382            OFFSET bxybest;
     383            OFFSET bxbest IF_LINT (= 0);
     384  
     385            /* Find forward diagonal that maximizes X + Y.  */
     386            fxybest = -1;
     387            for (d = fmax; d >= fmin; d -= 2)
     388              {
     389                OFFSET x = MIN (fd[d], xlim);
     390                OFFSET y = x - d;
     391                if (ylim < y)
     392                  {
     393                    x = ylim + d;
     394                    y = ylim;
     395                  }
     396                if (fxybest < x + y)
     397                  {
     398                    fxybest = x + y;
     399                    fxbest = x;
     400                  }
     401              }
     402  
     403            /* Find backward diagonal that minimizes X + Y.  */
     404            bxybest = OFFSET_MAX;
     405            for (d = bmax; d >= bmin; d -= 2)
     406              {
     407                OFFSET x = MAX (xoff, bd[d]);
     408                OFFSET y = x - d;
     409                if (y < yoff)
     410                  {
     411                    x = yoff + d;
     412                    y = yoff;
     413                  }
     414                if (x + y < bxybest)
     415                  {
     416                    bxybest = x + y;
     417                    bxbest = x;
     418                  }
     419              }
     420  
     421            /* Use the better of the two diagonals.  */
     422            if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
     423              {
     424                part->xmid = fxbest;
     425                part->ymid = fxybest - fxbest;
     426                part->lo_minimal = true;
     427                part->hi_minimal = false;
     428              }
     429            else
     430              {
     431                part->xmid = bxbest;
     432                part->ymid = bxybest - bxbest;
     433                part->lo_minimal = false;
     434                part->hi_minimal = true;
     435              }
     436            return;
     437          }
     438      }
     439    #undef XREF_YREF_EQUAL
     440  }
     441  
     442  
     443  /* Compare in detail contiguous subsequences of the two vectors
     444     which are known, as a whole, to match each other.
     445  
     446     The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
     447  
     448     Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
     449     are origin-0.
     450  
     451     If FIND_MINIMAL, find a minimal difference no matter how
     452     expensive it is.
     453  
     454     The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
     455  
     456     Return false if terminated normally, or true if terminated through early
     457     abort.  */
     458  
     459  static bool
     460  compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
     461              bool find_minimal, struct context *ctxt)
     462  {
     463  #ifdef ELEMENT
     464    ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
     465    ELEMENT const *yv = ctxt->yvec;
     466    #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
     467  #else
     468    #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
     469  #endif
     470  
     471    while (true)
     472      {
     473        /* Slide down the bottom initial diagonal.  */
     474        while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
     475          {
     476            xoff++;
     477            yoff++;
     478          }
     479  
     480        /* Slide up the top initial diagonal. */
     481        while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
     482          {
     483            xlim--;
     484            ylim--;
     485          }
     486  
     487        /* Handle simple cases. */
     488        if (xoff == xlim)
     489          {
     490            while (yoff < ylim)
     491              {
     492                NOTE_INSERT (ctxt, yoff);
     493                if (EARLY_ABORT (ctxt))
     494                  return true;
     495                yoff++;
     496              }
     497            break;
     498          }
     499        if (yoff == ylim)
     500          {
     501            while (xoff < xlim)
     502              {
     503                NOTE_DELETE (ctxt, xoff);
     504                if (EARLY_ABORT (ctxt))
     505                  return true;
     506                xoff++;
     507              }
     508            break;
     509          }
     510  
     511        struct partition part;
     512  
     513        /* Find a point of correspondence in the middle of the vectors.  */
     514        diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
     515  
     516        /* Use the partitions to split this problem into subproblems.  */
     517        OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
     518        bool find_minimal1, find_minimal2;
     519        if (!NOTE_ORDERED
     520            && ((xlim + ylim) - (part.xmid + part.ymid)
     521                < (part.xmid + part.ymid) - (xoff + yoff)))
     522          {
     523            /* The second problem is smaller and the caller doesn't
     524               care about order, so do the second problem first to
     525               lessen recursion.  */
     526            xoff1 = part.xmid; xlim1 = xlim;
     527            yoff1 = part.ymid; ylim1 = ylim;
     528            find_minimal1 = part.hi_minimal;
     529  
     530            xoff2 = xoff; xlim2 = part.xmid;
     531            yoff2 = yoff; ylim2 = part.ymid;
     532            find_minimal2 = part.lo_minimal;
     533          }
     534        else
     535          {
     536            xoff1 = xoff; xlim1 = part.xmid;
     537            yoff1 = yoff; ylim1 = part.ymid;
     538            find_minimal1 = part.lo_minimal;
     539  
     540            xoff2 = part.xmid; xlim2 = xlim;
     541            yoff2 = part.ymid; ylim2 = ylim;
     542            find_minimal2 = part.hi_minimal;
     543          }
     544  
     545        /* Recurse to do one subproblem.  */
     546        bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
     547        if (early)
     548          return early;
     549  
     550        /* Iterate to do the other subproblem.  */
     551        xoff = xoff2; xlim = xlim2;
     552        yoff = yoff2; ylim = ylim2;
     553        find_minimal = find_minimal2;
     554      }
     555  
     556    return false;
     557    #undef XREF_YREF_EQUAL
     558  }
     559  
     560  #undef ELEMENT
     561  #undef EQUAL
     562  #undef OFFSET
     563  #undef EXTRA_CONTEXT_FIELDS
     564  #undef NOTE_DELETE
     565  #undef NOTE_INSERT
     566  #undef EARLY_ABORT
     567  #undef USE_HEURISTIC
     568  #undef XVECREF_YVECREF_EQUAL
     569  #undef OFFSET_MAX