(root)/
gettext-0.22.4/
gettext-tools/
gnulib-lib/
diffseq.h
       1  /* Analyze differences between two vectors.
       2  
       3     Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2023 Free Software
       4     Foundation, Inc.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation, either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  
      20  /* The basic idea is to consider two vectors as similar if, when
      21     transforming the first vector into the second vector through a
      22     sequence of edits (inserts and deletes of one element each),
      23     this sequence is short - or equivalently, if the ordered list
      24     of elements that are untouched by these edits is long.  For a
      25     good introduction to the subject, read about the "Levenshtein
      26     distance" in Wikipedia.
      27  
      28     The basic algorithm is described in:
      29     "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
      30     Algorithmica Vol. 1, 1986, pp. 251-266,
      31     <https://doi.org/10.1007/BF01840446>.
      32     See especially section 4.2, which describes the variation used below.
      33  
      34     The basic algorithm was independently discovered as described in:
      35     "Algorithms for Approximate String Matching", Esko Ukkonen,
      36     Information and Control Vol. 64, 1985, pp. 100-118,
      37     <https://doi.org/10.1016/S0019-9958(85)80046-2>.
      38  
      39     Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
      40     heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
      41     at the price of producing suboptimal output for large inputs with
      42     many differences.  */
      43  
      44  /* Before including this file, you need to define:
      45       ELEMENT                 The element type of the vectors being compared.
      46       EQUAL                   A two-argument macro that tests two elements for
      47                               equality.
      48       OFFSET                  A signed integer type sufficient to hold the
      49                               difference between two indices.  Usually
      50                               something like ptrdiff_t.
      51       OFFSET_MAX              (Optional) The maximum value of OFFSET (e.g.,
      52                               PTRDIFF_MAX).  If omitted, it is inferred in a
      53                               way portable to the vast majority of C platforms,
      54                               as they lack padding bits.
      55       EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
      56       NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
      57       NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
      58       NOTE_ORDERED            (Optional) A boolean expression saying that
      59                               NOTE_DELETE and NOTE_INSERT calls must be
      60                               issued in offset order.
      61       EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
      62                               early abort of the computation.
      63       USE_HEURISTIC           (Optional) Define if you want to support the
      64                               heuristic for large vectors.
      65  
      66     It is also possible to use this file with abstract arrays.  In this case,
      67     xvec and yvec are not represented in memory.  They only exist conceptually.
      68     In this case, the list of defines above is amended as follows:
      69       ELEMENT                 Undefined.
      70       EQUAL                   Undefined.
      71       XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
      72                               A three-argument macro: References xvec[xoff] and
      73                               yvec[yoff] and tests these elements for equality.
      74  
      75     Before including this file, you also need to include:
      76       #include <limits.h>
      77       #include "minmax.h"
      78   */
      79  
      80  /* Maximum value of type OFFSET.  */
      81  #ifndef OFFSET_MAX
      82  # define OFFSET_MAX \
      83     ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
      84  #endif
      85  
      86  /* Default to no early abort.  */
      87  #ifndef EARLY_ABORT
      88  # define EARLY_ABORT(ctxt) false
      89  #endif
      90  
      91  #ifndef NOTE_ORDERED
      92  # define NOTE_ORDERED false
      93  #endif
      94  
      95  /* Suppress gcc's "...may be used before initialized" warnings,
      96     generated by GCC versions up to at least GCC 13.2.  */
      97  #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
      98  # pragma GCC diagnostic push
      99  # pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
     100  #endif
     101  
     102  /*
     103   * Context of comparison operation.
     104   */
     105  struct context
     106  {
     107    #ifdef ELEMENT
     108    /* Vectors being compared.  */
     109    ELEMENT const *xvec;
     110    ELEMENT const *yvec;
     111    #endif
     112  
     113    /* Extra fields.  */
     114    EXTRA_CONTEXT_FIELDS
     115  
     116    /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
     117       furthest along the given diagonal in the forward search of the edit
     118       matrix.  */
     119    OFFSET *fdiag;
     120  
     121    /* Vector, indexed by diagonal, containing the X coordinate of the point
     122       furthest along the given diagonal in the backward search of the edit
     123       matrix.  */
     124    OFFSET *bdiag;
     125  
     126    #ifdef USE_HEURISTIC
     127    /* This corresponds to the diff --speed-large-files flag.  With this
     128       heuristic, for vectors with a constant small density of changes,
     129       the algorithm is linear in the vector size.  */
     130    bool heuristic;
     131    #endif
     132  
     133    /* Edit scripts longer than this are too expensive to compute.  */
     134    OFFSET too_expensive;
     135  
     136    /* Snakes bigger than this are considered "big".  */
     137    #define SNAKE_LIMIT 20
     138  };
     139  
     140  struct partition
     141  {
     142    /* Midpoints of this partition.  */
     143    OFFSET xmid;
     144    OFFSET ymid;
     145  
     146    /* True if low half will be analyzed minimally.  */
     147    bool lo_minimal;
     148  
     149    /* Likewise for high half.  */
     150    bool hi_minimal;
     151  };
     152  
     153  
     154  /* Find the midpoint of the shortest edit script for a specified portion
     155     of the two vectors.
     156  
     157     Scan from the beginnings of the vectors, and simultaneously from the ends,
     158     doing a breadth-first search through the space of edit-sequence.
     159     When the two searches meet, we have found the midpoint of the shortest
     160     edit sequence.
     161  
     162     If FIND_MINIMAL is true, find the minimal edit script regardless of
     163     expense.  Otherwise, if the search is too expensive, use heuristics to
     164     stop the search and report a suboptimal answer.
     165  
     166     Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
     167     XMID - YMID equals the number of inserted elements minus the number
     168     of deleted elements (counting only elements before the midpoint).
     169  
     170     Set PART->lo_minimal to true iff the minimal edit script for the
     171     left half of the partition is known; similarly for PART->hi_minimal.
     172  
     173     This function assumes that the first elements of the specified portions
     174     of the two vectors do not match, and likewise that the last elements do not
     175     match.  The caller must trim matching elements from the beginning and end
     176     of the portions it is going to specify.
     177  
     178     If we return the "wrong" partitions, the worst this can do is cause
     179     suboptimal diff output.  It cannot cause incorrect diff output.  */
     180  
     181  static void
     182  diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
     183        struct partition *part, struct context *ctxt)
     184  {
     185    OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
     186    OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
     187  #ifdef ELEMENT
     188    ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
     189    ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
     190    #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
     191  #else
     192    #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
     193  #endif
     194    const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
     195    const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
     196    const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
     197    const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
     198    OFFSET fmin = fmid;
     199    OFFSET fmax = fmid;           /* Limits of top-down search. */
     200    OFFSET bmin = bmid;
     201    OFFSET bmax = bmid;           /* Limits of bottom-up search. */
     202    OFFSET c;                     /* Cost. */
     203    bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
     204                                     diagonal with respect to the northwest. */
     205  
     206    fd[fmid] = xoff;
     207    bd[bmid] = xlim;
     208  
     209    for (c = 1;; ++c)
     210      {
     211        OFFSET d;                 /* Active diagonal. */
     212        bool big_snake = false;
     213  
     214        /* Extend the top-down search by an edit step in each diagonal. */
     215        if (fmin > dmin)
     216          fd[--fmin - 1] = -1;
     217        else
     218          ++fmin;
     219        if (fmax < dmax)
     220          fd[++fmax + 1] = -1;
     221        else
     222          --fmax;
     223        for (d = fmax; d >= fmin; d -= 2)
     224          {
     225            OFFSET x;
     226            OFFSET y;
     227            OFFSET tlo = fd[d - 1];
     228            OFFSET thi = fd[d + 1];
     229            OFFSET x0 = tlo < thi ? thi : tlo + 1;
     230  
     231            for (x = x0, y = x0 - d;
     232                 x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
     233                 x++, y++)
     234              continue;
     235            if (x - x0 > SNAKE_LIMIT)
     236              big_snake = true;
     237            fd[d] = x;
     238            if (odd && bmin <= d && d <= bmax && bd[d] <= x)
     239              {
     240                part->xmid = x;
     241                part->ymid = y;
     242                part->lo_minimal = part->hi_minimal = true;
     243                return;
     244              }
     245          }
     246  
     247        /* Similarly extend the bottom-up search.  */
     248        if (bmin > dmin)
     249          bd[--bmin - 1] = OFFSET_MAX;
     250        else
     251          ++bmin;
     252        if (bmax < dmax)
     253          bd[++bmax + 1] = OFFSET_MAX;
     254        else
     255          --bmax;
     256        for (d = bmax; d >= bmin; d -= 2)
     257          {
     258            OFFSET x;
     259            OFFSET y;
     260            OFFSET tlo = bd[d - 1];
     261            OFFSET thi = bd[d + 1];
     262            OFFSET x0 = tlo < thi ? tlo : thi - 1;
     263  
     264            for (x = x0, y = x0 - d;
     265                 xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
     266                 x--, y--)
     267              continue;
     268            if (x0 - x > SNAKE_LIMIT)
     269              big_snake = true;
     270            bd[d] = x;
     271            if (!odd && fmin <= d && d <= fmax && x <= fd[d])
     272              {
     273                part->xmid = x;
     274                part->ymid = y;
     275                part->lo_minimal = part->hi_minimal = true;
     276                return;
     277              }
     278          }
     279  
     280        if (find_minimal)
     281          continue;
     282  
     283  #ifdef USE_HEURISTIC
     284        bool heuristic = ctxt->heuristic;
     285  #else
     286        bool heuristic = false;
     287  #endif
     288  
     289        /* Heuristic: check occasionally for a diagonal that has made lots
     290           of progress compared with the edit distance.  If we have any
     291           such, find the one that has made the most progress and return it
     292           as if it had succeeded.
     293  
     294           With this heuristic, for vectors with a constant small density
     295           of changes, the algorithm is linear in the vector size.  */
     296  
     297        if (200 < c && big_snake && heuristic)
     298          {
     299            {
     300              OFFSET best = 0;
     301  
     302              for (d = fmax; d >= fmin; d -= 2)
     303                {
     304                  OFFSET dd = d - fmid;
     305                  OFFSET x = fd[d];
     306                  OFFSET y = x - d;
     307                  OFFSET v = (x - xoff) * 2 - dd;
     308  
     309                  if (v > 12 * (c + (dd < 0 ? -dd : dd)))
     310                    {
     311                      if (v > best
     312                          && xoff + SNAKE_LIMIT <= x && x < xlim
     313                          && yoff + SNAKE_LIMIT <= y && y < ylim)
     314                        {
     315                          /* We have a good enough best diagonal; now insist
     316                             that it end with a significant snake.  */
     317                          int k;
     318  
     319                          for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
     320                            if (k == SNAKE_LIMIT)
     321                              {
     322                                best = v;
     323                                part->xmid = x;
     324                                part->ymid = y;
     325                                break;
     326                              }
     327                        }
     328                    }
     329                }
     330              if (best > 0)
     331                {
     332                  part->lo_minimal = true;
     333                  part->hi_minimal = false;
     334                  return;
     335                }
     336            }
     337  
     338            {
     339              OFFSET best = 0;
     340  
     341              for (d = bmax; d >= bmin; d -= 2)
     342                {
     343                  OFFSET dd = d - bmid;
     344                  OFFSET x = bd[d];
     345                  OFFSET y = x - d;
     346                  OFFSET v = (xlim - x) * 2 + dd;
     347  
     348                  if (v > 12 * (c + (dd < 0 ? -dd : dd)))
     349                    {
     350                      if (v > best
     351                          && xoff < x && x <= xlim - SNAKE_LIMIT
     352                          && yoff < y && y <= ylim - SNAKE_LIMIT)
     353                        {
     354                          /* We have a good enough best diagonal; now insist
     355                             that it end with a significant snake.  */
     356                          int k;
     357  
     358                          for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
     359                            if (k == SNAKE_LIMIT - 1)
     360                              {
     361                                best = v;
     362                                part->xmid = x;
     363                                part->ymid = y;
     364                                break;
     365                              }
     366                        }
     367                    }
     368                }
     369              if (best > 0)
     370                {
     371                  part->lo_minimal = false;
     372                  part->hi_minimal = true;
     373                  return;
     374                }
     375            }
     376          }
     377  
     378        /* Heuristic: if we've gone well beyond the call of duty, give up
     379           and report halfway between our best results so far.  */
     380        if (c >= ctxt->too_expensive)
     381          {
     382            /* Find forward diagonal that maximizes X + Y.  */
     383            OFFSET fxybest = -1, fxbest;
     384            for (d = fmax; d >= fmin; d -= 2)
     385              {
     386                OFFSET x = MIN (fd[d], xlim);
     387                OFFSET y = x - d;
     388                if (ylim < y)
     389                  {
     390                    x = ylim + d;
     391                    y = ylim;
     392                  }
     393                if (fxybest < x + y)
     394                  {
     395                    fxybest = x + y;
     396                    fxbest = x;
     397                  }
     398              }
     399  
     400            /* Find backward diagonal that minimizes X + Y.  */
     401            OFFSET bxybest = OFFSET_MAX, bxbest;
     402            for (d = bmax; d >= bmin; d -= 2)
     403              {
     404                OFFSET x = MAX (xoff, bd[d]);
     405                OFFSET y = x - d;
     406                if (y < yoff)
     407                  {
     408                    x = yoff + d;
     409                    y = yoff;
     410                  }
     411                if (x + y < bxybest)
     412                  {
     413                    bxybest = x + y;
     414                    bxbest = x;
     415                  }
     416              }
     417  
     418            /* Use the better of the two diagonals.  */
     419            if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
     420              {
     421                part->xmid = fxbest;
     422                part->ymid = fxybest - fxbest;
     423                part->lo_minimal = true;
     424                part->hi_minimal = false;
     425              }
     426            else
     427              {
     428                part->xmid = bxbest;
     429                part->ymid = bxybest - bxbest;
     430                part->lo_minimal = false;
     431                part->hi_minimal = true;
     432              }
     433            return;
     434          }
     435      }
     436    #undef XREF_YREF_EQUAL
     437  }
     438  
     439  
     440  /* Compare in detail contiguous subsequences of the two vectors
     441     which are known, as a whole, to match each other.
     442  
     443     The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
     444  
     445     Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
     446     are origin-0.
     447  
     448     If FIND_MINIMAL, find a minimal difference no matter how
     449     expensive it is.
     450  
     451     The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
     452  
     453     Return false if terminated normally, or true if terminated through early
     454     abort.  */
     455  
     456  static bool
     457  compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
     458              bool find_minimal, struct context *ctxt)
     459  {
     460  #ifdef ELEMENT
     461    ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
     462    ELEMENT const *yv = ctxt->yvec;
     463    #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
     464  #else
     465    #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
     466  #endif
     467  
     468    while (true)
     469      {
     470        /* Slide down the bottom initial diagonal.  */
     471        while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
     472          {
     473            xoff++;
     474            yoff++;
     475          }
     476  
     477        /* Slide up the top initial diagonal. */
     478        while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
     479          {
     480            xlim--;
     481            ylim--;
     482          }
     483  
     484        /* Handle simple cases. */
     485        if (xoff == xlim)
     486          {
     487            while (yoff < ylim)
     488              {
     489                NOTE_INSERT (ctxt, yoff);
     490                if (EARLY_ABORT (ctxt))
     491                  return true;
     492                yoff++;
     493              }
     494            break;
     495          }
     496        if (yoff == ylim)
     497          {
     498            while (xoff < xlim)
     499              {
     500                NOTE_DELETE (ctxt, xoff);
     501                if (EARLY_ABORT (ctxt))
     502                  return true;
     503                xoff++;
     504              }
     505            break;
     506          }
     507  
     508        struct partition part;
     509  
     510        /* Find a point of correspondence in the middle of the vectors.  */
     511        diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
     512  
     513        /* Use the partitions to split this problem into subproblems.  */
     514        OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
     515        bool find_minimal1, find_minimal2;
     516        if (!NOTE_ORDERED
     517            && ((xlim + ylim) - (part.xmid + part.ymid)
     518                < (part.xmid + part.ymid) - (xoff + yoff)))
     519          {
     520            /* The second problem is smaller and the caller doesn't
     521               care about order, so do the second problem first to
     522               lessen recursion.  */
     523            xoff1 = part.xmid; xlim1 = xlim;
     524            yoff1 = part.ymid; ylim1 = ylim;
     525            find_minimal1 = part.hi_minimal;
     526  
     527            xoff2 = xoff; xlim2 = part.xmid;
     528            yoff2 = yoff; ylim2 = part.ymid;
     529            find_minimal2 = part.lo_minimal;
     530          }
     531        else
     532          {
     533            xoff1 = xoff; xlim1 = part.xmid;
     534            yoff1 = yoff; ylim1 = part.ymid;
     535            find_minimal1 = part.lo_minimal;
     536  
     537            xoff2 = part.xmid; xlim2 = xlim;
     538            yoff2 = part.ymid; ylim2 = ylim;
     539            find_minimal2 = part.hi_minimal;
     540          }
     541  
     542        /* Recurse to do one subproblem.  */
     543        bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
     544        if (early)
     545          return early;
     546  
     547        /* Iterate to do the other subproblem.  */
     548        xoff = xoff2; xlim = xlim2;
     549        yoff = yoff2; ylim = ylim2;
     550        find_minimal = find_minimal2;
     551      }
     552  
     553    return false;
     554    #undef XREF_YREF_EQUAL
     555  }
     556  
     557  #if __GNUC__ + (__GNUC_MINOR__ >= 7) > 4
     558  # pragma GCC diagnostic pop
     559  #endif
     560  
     561  #undef ELEMENT
     562  #undef EQUAL
     563  #undef OFFSET
     564  #undef EXTRA_CONTEXT_FIELDS
     565  #undef NOTE_DELETE
     566  #undef NOTE_INSERT
     567  #undef EARLY_ABORT
     568  #undef USE_HEURISTIC
     569  #undef XVECREF_YVECREF_EQUAL
     570  #undef OFFSET_MAX