# Source: Python-3.11.7/Lib/difflib.py
       1  """
       2  Module difflib -- helpers for computing deltas between objects.
       3  
       4  Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
       5      Use SequenceMatcher to return list of the best "good enough" matches.
       6  
       7  Function context_diff(a, b):
       8      For two lists of strings, return a delta in context diff format.
       9  
      10  Function ndiff(a, b):
      11      Return a delta: the difference between `a` and `b` (lists of strings).
      12  
      13  Function restore(delta, which):
      14      Return one of the two sequences that generated an ndiff delta.
      15  
      16  Function unified_diff(a, b):
      17      For two lists of strings, return a delta in unified diff format.
      18  
      19  Class SequenceMatcher:
      20      A flexible class for comparing pairs of sequences of any type.
      21  
      22  Class Differ:
      23      For producing human-readable deltas from sequences of lines of text.
      24  
      25  Class HtmlDiff:
      26      For producing HTML side by side comparison with change highlights.
      27  """
      28  
      29  __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
      30             'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
      31             'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
      32  
      33  from heapq import nlargest as _nlargest
      34  from collections import namedtuple as _namedtuple
      35  from types import GenericAlias
      36  
      37  Match = _namedtuple('Match', 'a b size')
      38  
      39  def _calculate_ratio(matches, length):
      40      if length:
      41          return 2.0 * matches / length
      42      return 1.0
      43  
      44  class ESC[4;38;5;81mSequenceMatcher:
      45  
      46      """
      47      SequenceMatcher is a flexible class for comparing pairs of sequences of
      48      any type, so long as the sequence elements are hashable.  The basic
      49      algorithm predates, and is a little fancier than, an algorithm
      50      published in the late 1980's by Ratcliff and Obershelp under the
      51      hyperbolic name "gestalt pattern matching".  The basic idea is to find
      52      the longest contiguous matching subsequence that contains no "junk"
      53      elements (R-O doesn't address junk).  The same idea is then applied
      54      recursively to the pieces of the sequences to the left and to the right
      55      of the matching subsequence.  This does not yield minimal edit
      56      sequences, but does tend to yield matches that "look right" to people.
      57  
      58      SequenceMatcher tries to compute a "human-friendly diff" between two
      59      sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
      60      longest *contiguous* & junk-free matching subsequence.  That's what
      61      catches peoples' eyes.  The Windows(tm) windiff has another interesting
      62      notion, pairing up elements that appear uniquely in each sequence.
      63      That, and the method here, appear to yield more intuitive difference
      64      reports than does diff.  This method appears to be the least vulnerable
      65      to syncing up on blocks of "junk lines", though (like blank lines in
      66      ordinary text files, or maybe "<P>" lines in HTML files).  That may be
      67      because this is the only method of the 3 that has a *concept* of
      68      "junk" <wink>.
      69  
      70      Example, comparing two strings, and considering blanks to be "junk":
      71  
      72      >>> s = SequenceMatcher(lambda x: x == " ",
      73      ...                     "private Thread currentThread;",
      74      ...                     "private volatile Thread currentThread;")
      75      >>>
      76  
      77      .ratio() returns a float in [0, 1], measuring the "similarity" of the
      78      sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
      79      sequences are close matches:
      80  
      81      >>> print(round(s.ratio(), 3))
      82      0.866
      83      >>>
      84  
      85      If you're only interested in where the sequences match,
      86      .get_matching_blocks() is handy:
      87  
      88      >>> for block in s.get_matching_blocks():
      89      ...     print("a[%d] and b[%d] match for %d elements" % block)
      90      a[0] and b[0] match for 8 elements
      91      a[8] and b[17] match for 21 elements
      92      a[29] and b[38] match for 0 elements
      93  
      94      Note that the last tuple returned by .get_matching_blocks() is always a
      95      dummy, (len(a), len(b), 0), and this is the only case in which the last
      96      tuple element (number of elements matched) is 0.
      97  
      98      If you want to know how to change the first sequence into the second,
      99      use .get_opcodes():
     100  
     101      >>> for opcode in s.get_opcodes():
     102      ...     print("%6s a[%d:%d] b[%d:%d]" % opcode)
     103       equal a[0:8] b[0:8]
     104      insert a[8:8] b[8:17]
     105       equal a[8:29] b[17:38]
     106  
     107      See the Differ class for a fancy human-friendly file differencer, which
     108      uses SequenceMatcher both to compare sequences of lines, and to compare
     109      sequences of characters within similar (near-matching) lines.
     110  
     111      See also function get_close_matches() in this module, which shows how
     112      simple code building on SequenceMatcher can be used to do useful work.
     113  
     114      Timing:  Basic R-O is cubic time worst case and quadratic time expected
     115      case.  SequenceMatcher is quadratic time for the worst case and has
     116      expected-case behavior dependent in a complicated way on how many
     117      elements the sequences have in common; best case time is linear.
     118      """
     119  
     120      def __init__(self, isjunk=None, a='', b='', autojunk=True):
     121          """Construct a SequenceMatcher.
     122  
     123          Optional arg isjunk is None (the default), or a one-argument
     124          function that takes a sequence element and returns true iff the
     125          element is junk.  None is equivalent to passing "lambda x: 0", i.e.
     126          no elements are considered to be junk.  For example, pass
     127              lambda x: x in " \\t"
     128          if you're comparing lines as sequences of characters, and don't
     129          want to synch up on blanks or hard tabs.
     130  
     131          Optional arg a is the first of two sequences to be compared.  By
     132          default, an empty string.  The elements of a must be hashable.  See
     133          also .set_seqs() and .set_seq1().
     134  
     135          Optional arg b is the second of two sequences to be compared.  By
     136          default, an empty string.  The elements of b must be hashable. See
     137          also .set_seqs() and .set_seq2().
     138  
     139          Optional arg autojunk should be set to False to disable the
     140          "automatic junk heuristic" that treats popular elements as junk
     141          (see module documentation for more information).
     142          """
     143  
     144          # Members:
     145          # a
     146          #      first sequence
     147          # b
     148          #      second sequence; differences are computed as "what do
     149          #      we need to do to 'a' to change it into 'b'?"
     150          # b2j
     151          #      for x in b, b2j[x] is a list of the indices (into b)
     152          #      at which x appears; junk and popular elements do not appear
     153          # fullbcount
     154          #      for x in b, fullbcount[x] == the number of times x
     155          #      appears in b; only materialized if really needed (used
     156          #      only for computing quick_ratio())
     157          # matching_blocks
     158          #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
     159          #      ascending & non-overlapping in i and in j; terminated by
     160          #      a dummy (len(a), len(b), 0) sentinel
     161          # opcodes
     162          #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
     163          #      one of
     164          #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
     165          #          'delete'    a[i1:i2] should be deleted
     166          #          'insert'    b[j1:j2] should be inserted
     167          #          'equal'     a[i1:i2] == b[j1:j2]
     168          # isjunk
     169          #      a user-supplied function taking a sequence element and
     170          #      returning true iff the element is "junk" -- this has
     171          #      subtle but helpful effects on the algorithm, which I'll
     172          #      get around to writing up someday <0.9 wink>.
     173          #      DON'T USE!  Only __chain_b uses this.  Use "in self.bjunk".
     174          # bjunk
     175          #      the items in b for which isjunk is True.
     176          # bpopular
     177          #      nonjunk items in b treated as junk by the heuristic (if used).
     178  
     179          self.isjunk = isjunk
     180          self.a = self.b = None
     181          self.autojunk = autojunk
     182          self.set_seqs(a, b)
     183  
     184      def set_seqs(self, a, b):
     185          """Set the two sequences to be compared.
     186  
     187          >>> s = SequenceMatcher()
     188          >>> s.set_seqs("abcd", "bcde")
     189          >>> s.ratio()
     190          0.75
     191          """
     192  
     193          self.set_seq1(a)
     194          self.set_seq2(b)
     195  
     196      def set_seq1(self, a):
     197          """Set the first sequence to be compared.
     198  
     199          The second sequence to be compared is not changed.
     200  
     201          >>> s = SequenceMatcher(None, "abcd", "bcde")
     202          >>> s.ratio()
     203          0.75
     204          >>> s.set_seq1("bcde")
     205          >>> s.ratio()
     206          1.0
     207          >>>
     208  
     209          SequenceMatcher computes and caches detailed information about the
     210          second sequence, so if you want to compare one sequence S against
     211          many sequences, use .set_seq2(S) once and call .set_seq1(x)
     212          repeatedly for each of the other sequences.
     213  
     214          See also set_seqs() and set_seq2().
     215          """
     216  
     217          if a is self.a:
     218              return
     219          self.a = a
     220          self.matching_blocks = self.opcodes = None
     221  
     222      def set_seq2(self, b):
     223          """Set the second sequence to be compared.
     224  
     225          The first sequence to be compared is not changed.
     226  
     227          >>> s = SequenceMatcher(None, "abcd", "bcde")
     228          >>> s.ratio()
     229          0.75
     230          >>> s.set_seq2("abcd")
     231          >>> s.ratio()
     232          1.0
     233          >>>
     234  
     235          SequenceMatcher computes and caches detailed information about the
     236          second sequence, so if you want to compare one sequence S against
     237          many sequences, use .set_seq2(S) once and call .set_seq1(x)
     238          repeatedly for each of the other sequences.
     239  
     240          See also set_seqs() and set_seq1().
     241          """
     242  
     243          if b is self.b:
     244              return
     245          self.b = b
     246          self.matching_blocks = self.opcodes = None
     247          self.fullbcount = None
     248          self.__chain_b()
     249  
     250      # For each element x in b, set b2j[x] to a list of the indices in
     251      # b where x appears; the indices are in increasing order; note that
     252      # the number of times x appears in b is len(b2j[x]) ...
     253      # when self.isjunk is defined, junk elements don't show up in this
     254      # map at all, which stops the central find_longest_match method
     255      # from starting any matching block at a junk element ...
     256      # b2j also does not contain entries for "popular" elements, meaning
     257      # elements that account for more than 1 + 1% of the total elements, and
     258      # when the sequence is reasonably large (>= 200 elements); this can
     259      # be viewed as an adaptive notion of semi-junk, and yields an enormous
     260      # speedup when, e.g., comparing program files with hundreds of
     261      # instances of "return NULL;" ...
     262      # note that this is only called when b changes; so for cross-product
     263      # kinds of matches, it's best to call set_seq2 once, then set_seq1
     264      # repeatedly
     265  
     266      def __chain_b(self):
     267          # Because isjunk is a user-defined (not C) function, and we test
     268          # for junk a LOT, it's important to minimize the number of calls.
     269          # Before the tricks described here, __chain_b was by far the most
     270          # time-consuming routine in the whole module!  If anyone sees
     271          # Jim Roskind, thank him again for profile.py -- I never would
     272          # have guessed that.
     273          # The first trick is to build b2j ignoring the possibility
     274          # of junk.  I.e., we don't call isjunk at all yet.  Throwing
     275          # out the junk later is much cheaper than building b2j "right"
     276          # from the start.
     277          b = self.b
     278          self.b2j = b2j = {}
     279  
     280          for i, elt in enumerate(b):
     281              indices = b2j.setdefault(elt, [])
     282              indices.append(i)
     283  
     284          # Purge junk elements
     285          self.bjunk = junk = set()
     286          isjunk = self.isjunk
     287          if isjunk:
     288              for elt in b2j.keys():
     289                  if isjunk(elt):
     290                      junk.add(elt)
     291              for elt in junk: # separate loop avoids separate list of keys
     292                  del b2j[elt]
     293  
     294          # Purge popular elements that are not junk
     295          self.bpopular = popular = set()
     296          n = len(b)
     297          if self.autojunk and n >= 200:
     298              ntest = n // 100 + 1
     299              for elt, idxs in b2j.items():
     300                  if len(idxs) > ntest:
     301                      popular.add(elt)
     302              for elt in popular: # ditto; as fast for 1% deletion
     303                  del b2j[elt]
     304  
     305      def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
     306          """Find longest matching block in a[alo:ahi] and b[blo:bhi].
     307  
     308          By default it will find the longest match in the entirety of a and b.
     309  
     310          If isjunk is not defined:
     311  
     312          Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
     313              alo <= i <= i+k <= ahi
     314              blo <= j <= j+k <= bhi
     315          and for all (i',j',k') meeting those conditions,
     316              k >= k'
     317              i <= i'
     318              and if i == i', j <= j'
     319  
     320          In other words, of all maximal matching blocks, return one that
     321          starts earliest in a, and of all those maximal matching blocks that
     322          start earliest in a, return the one that starts earliest in b.
     323  
     324          >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
     325          >>> s.find_longest_match(0, 5, 0, 9)
     326          Match(a=0, b=4, size=5)
     327  
     328          If isjunk is defined, first the longest matching block is
     329          determined as above, but with the additional restriction that no
     330          junk element appears in the block.  Then that block is extended as
     331          far as possible by matching (only) junk elements on both sides.  So
     332          the resulting block never matches on junk except as identical junk
     333          happens to be adjacent to an "interesting" match.
     334  
     335          Here's the same example as before, but considering blanks to be
     336          junk.  That prevents " abcd" from matching the " abcd" at the tail
     337          end of the second sequence directly.  Instead only the "abcd" can
     338          match, and matches the leftmost "abcd" in the second sequence:
     339  
     340          >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
     341          >>> s.find_longest_match(0, 5, 0, 9)
     342          Match(a=1, b=0, size=4)
     343  
     344          If no blocks match, return (alo, blo, 0).
     345  
     346          >>> s = SequenceMatcher(None, "ab", "c")
     347          >>> s.find_longest_match(0, 2, 0, 1)
     348          Match(a=0, b=0, size=0)
     349          """
     350  
     351          # CAUTION:  stripping common prefix or suffix would be incorrect.
     352          # E.g.,
     353          #    ab
     354          #    acab
     355          # Longest matching block is "ab", but if common prefix is
     356          # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
     357          # strip, so ends up claiming that ab is changed to acab by
     358          # inserting "ca" in the middle.  That's minimal but unintuitive:
     359          # "it's obvious" that someone inserted "ac" at the front.
     360          # Windiff ends up at the same place as diff, but by pairing up
     361          # the unique 'b's and then matching the first two 'a's.
     362  
     363          a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
     364          if ahi is None:
     365              ahi = len(a)
     366          if bhi is None:
     367              bhi = len(b)
     368          besti, bestj, bestsize = alo, blo, 0
     369          # find longest junk-free match
     370          # during an iteration of the loop, j2len[j] = length of longest
     371          # junk-free match ending with a[i-1] and b[j]
     372          j2len = {}
     373          nothing = []
     374          for i in range(alo, ahi):
     375              # look at all instances of a[i] in b; note that because
     376              # b2j has no junk keys, the loop is skipped if a[i] is junk
     377              j2lenget = j2len.get
     378              newj2len = {}
     379              for j in b2j.get(a[i], nothing):
     380                  # a[i] matches b[j]
     381                  if j < blo:
     382                      continue
     383                  if j >= bhi:
     384                      break
     385                  k = newj2len[j] = j2lenget(j-1, 0) + 1
     386                  if k > bestsize:
     387                      besti, bestj, bestsize = i-k+1, j-k+1, k
     388              j2len = newj2len
     389  
     390          # Extend the best by non-junk elements on each end.  In particular,
     391          # "popular" non-junk elements aren't in b2j, which greatly speeds
     392          # the inner loop above, but also means "the best" match so far
     393          # doesn't contain any junk *or* popular non-junk elements.
     394          while besti > alo and bestj > blo and \
     395                not isbjunk(b[bestj-1]) and \
     396                a[besti-1] == b[bestj-1]:
     397              besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
     398          while besti+bestsize < ahi and bestj+bestsize < bhi and \
     399                not isbjunk(b[bestj+bestsize]) and \
     400                a[besti+bestsize] == b[bestj+bestsize]:
     401              bestsize += 1
     402  
     403          # Now that we have a wholly interesting match (albeit possibly
     404          # empty!), we may as well suck up the matching junk on each
     405          # side of it too.  Can't think of a good reason not to, and it
     406          # saves post-processing the (possibly considerable) expense of
     407          # figuring out what to do with it.  In the case of an empty
     408          # interesting match, this is clearly the right thing to do,
     409          # because no other kind of match is possible in the regions.
     410          while besti > alo and bestj > blo and \
     411                isbjunk(b[bestj-1]) and \
     412                a[besti-1] == b[bestj-1]:
     413              besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
     414          while besti+bestsize < ahi and bestj+bestsize < bhi and \
     415                isbjunk(b[bestj+bestsize]) and \
     416                a[besti+bestsize] == b[bestj+bestsize]:
     417              bestsize = bestsize + 1
     418  
     419          return Match(besti, bestj, bestsize)
     420  
     421      def get_matching_blocks(self):
     422          """Return list of triples describing matching subsequences.
     423  
     424          Each triple is of the form (i, j, n), and means that
     425          a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
     426          i and in j.  New in Python 2.5, it's also guaranteed that if
     427          (i, j, n) and (i', j', n') are adjacent triples in the list, and
     428          the second is not the last triple in the list, then i+n != i' or
     429          j+n != j'.  IOW, adjacent triples never describe adjacent equal
     430          blocks.
     431  
     432          The last triple is a dummy, (len(a), len(b), 0), and is the only
     433          triple with n==0.
     434  
     435          >>> s = SequenceMatcher(None, "abxcd", "abcd")
     436          >>> list(s.get_matching_blocks())
     437          [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
     438          """
     439  
     440          if self.matching_blocks is not None:
     441              return self.matching_blocks
     442          la, lb = len(self.a), len(self.b)
     443  
     444          # This is most naturally expressed as a recursive algorithm, but
     445          # at least one user bumped into extreme use cases that exceeded
     446          # the recursion limit on their box.  So, now we maintain a list
     447          # ('queue`) of blocks we still need to look at, and append partial
     448          # results to `matching_blocks` in a loop; the matches are sorted
     449          # at the end.
     450          queue = [(0, la, 0, lb)]
     451          matching_blocks = []
     452          while queue:
     453              alo, ahi, blo, bhi = queue.pop()
     454              i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
     455              # a[alo:i] vs b[blo:j] unknown
     456              # a[i:i+k] same as b[j:j+k]
     457              # a[i+k:ahi] vs b[j+k:bhi] unknown
     458              if k:   # if k is 0, there was no matching block
     459                  matching_blocks.append(x)
     460                  if alo < i and blo < j:
     461                      queue.append((alo, i, blo, j))
     462                  if i+k < ahi and j+k < bhi:
     463                      queue.append((i+k, ahi, j+k, bhi))
     464          matching_blocks.sort()
     465  
     466          # It's possible that we have adjacent equal blocks in the
     467          # matching_blocks list now.  Starting with 2.5, this code was added
     468          # to collapse them.
     469          i1 = j1 = k1 = 0
     470          non_adjacent = []
     471          for i2, j2, k2 in matching_blocks:
     472              # Is this block adjacent to i1, j1, k1?
     473              if i1 + k1 == i2 and j1 + k1 == j2:
     474                  # Yes, so collapse them -- this just increases the length of
     475                  # the first block by the length of the second, and the first
     476                  # block so lengthened remains the block to compare against.
     477                  k1 += k2
     478              else:
     479                  # Not adjacent.  Remember the first block (k1==0 means it's
     480                  # the dummy we started with), and make the second block the
     481                  # new block to compare against.
     482                  if k1:
     483                      non_adjacent.append((i1, j1, k1))
     484                  i1, j1, k1 = i2, j2, k2
     485          if k1:
     486              non_adjacent.append((i1, j1, k1))
     487  
     488          non_adjacent.append( (la, lb, 0) )
     489          self.matching_blocks = list(map(Match._make, non_adjacent))
     490          return self.matching_blocks
     491  
     492      def get_opcodes(self):
     493          """Return list of 5-tuples describing how to turn a into b.
     494  
     495          Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
     496          has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
     497          tuple preceding it, and likewise for j1 == the previous j2.
     498  
     499          The tags are strings, with these meanings:
     500  
     501          'replace':  a[i1:i2] should be replaced by b[j1:j2]
     502          'delete':   a[i1:i2] should be deleted.
     503                      Note that j1==j2 in this case.
     504          'insert':   b[j1:j2] should be inserted at a[i1:i1].
     505                      Note that i1==i2 in this case.
     506          'equal':    a[i1:i2] == b[j1:j2]
     507  
     508          >>> a = "qabxcd"
     509          >>> b = "abycdf"
     510          >>> s = SequenceMatcher(None, a, b)
     511          >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
     512          ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
     513          ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     514           delete a[0:1] (q) b[0:0] ()
     515            equal a[1:3] (ab) b[0:2] (ab)
     516          replace a[3:4] (x) b[2:3] (y)
     517            equal a[4:6] (cd) b[3:5] (cd)
     518           insert a[6:6] () b[5:6] (f)
     519          """
     520  
     521          if self.opcodes is not None:
     522              return self.opcodes
     523          i = j = 0
     524          self.opcodes = answer = []
     525          for ai, bj, size in self.get_matching_blocks():
     526              # invariant:  we've pumped out correct diffs to change
     527              # a[:i] into b[:j], and the next matching block is
     528              # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
     529              # out a diff to change a[i:ai] into b[j:bj], pump out
     530              # the matching block, and move (i,j) beyond the match
     531              tag = ''
     532              if i < ai and j < bj:
     533                  tag = 'replace'
     534              elif i < ai:
     535                  tag = 'delete'
     536              elif j < bj:
     537                  tag = 'insert'
     538              if tag:
     539                  answer.append( (tag, i, ai, j, bj) )
     540              i, j = ai+size, bj+size
     541              # the list of matching blocks is terminated by a
     542              # sentinel with size 0
     543              if size:
     544                  answer.append( ('equal', ai, i, bj, j) )
     545          return answer
     546  
     547      def get_grouped_opcodes(self, n=3):
     548          """ Isolate change clusters by eliminating ranges with no changes.
     549  
     550          Return a generator of groups with up to n lines of context.
     551          Each group is in the same format as returned by get_opcodes().
     552  
     553          >>> from pprint import pprint
     554          >>> a = list(map(str, range(1,40)))
     555          >>> b = a[:]
     556          >>> b[8:8] = ['i']     # Make an insertion
     557          >>> b[20] += 'x'       # Make a replacement
     558          >>> b[23:28] = []      # Make a deletion
     559          >>> b[30] += 'y'       # Make another replacement
     560          >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
     561          [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
     562           [('equal', 16, 19, 17, 20),
     563            ('replace', 19, 20, 20, 21),
     564            ('equal', 20, 22, 21, 23),
     565            ('delete', 22, 27, 23, 23),
     566            ('equal', 27, 30, 23, 26)],
     567           [('equal', 31, 34, 27, 30),
     568            ('replace', 34, 35, 30, 31),
     569            ('equal', 35, 38, 31, 34)]]
     570          """
     571  
     572          codes = self.get_opcodes()
     573          if not codes:
     574              codes = [("equal", 0, 1, 0, 1)]
     575          # Fixup leading and trailing groups if they show no changes.
     576          if codes[0][0] == 'equal':
     577              tag, i1, i2, j1, j2 = codes[0]
     578              codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
     579          if codes[-1][0] == 'equal':
     580              tag, i1, i2, j1, j2 = codes[-1]
     581              codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
     582  
     583          nn = n + n
     584          group = []
     585          for tag, i1, i2, j1, j2 in codes:
     586              # End the current group and start a new one whenever
     587              # there is a large range with no changes.
     588              if tag == 'equal' and i2-i1 > nn:
     589                  group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
     590                  yield group
     591                  group = []
     592                  i1, j1 = max(i1, i2-n), max(j1, j2-n)
     593              group.append((tag, i1, i2, j1 ,j2))
     594          if group and not (len(group)==1 and group[0][0] == 'equal'):
     595              yield group
     596  
     597      def ratio(self):
     598          """Return a measure of the sequences' similarity (float in [0,1]).
     599  
     600          Where T is the total number of elements in both sequences, and
     601          M is the number of matches, this is 2.0*M / T.
     602          Note that this is 1 if the sequences are identical, and 0 if
     603          they have nothing in common.
     604  
     605          .ratio() is expensive to compute if you haven't already computed
     606          .get_matching_blocks() or .get_opcodes(), in which case you may
     607          want to try .quick_ratio() or .real_quick_ratio() first to get an
     608          upper bound.
     609  
     610          >>> s = SequenceMatcher(None, "abcd", "bcde")
     611          >>> s.ratio()
     612          0.75
     613          >>> s.quick_ratio()
     614          0.75
     615          >>> s.real_quick_ratio()
     616          1.0
     617          """
     618  
     619          matches = sum(triple[-1] for triple in self.get_matching_blocks())
     620          return _calculate_ratio(matches, len(self.a) + len(self.b))
     621  
     622      def quick_ratio(self):
     623          """Return an upper bound on ratio() relatively quickly.
     624  
     625          This isn't defined beyond that it is an upper bound on .ratio(), and
     626          is faster to compute.
     627          """
     628  
     629          # viewing a and b as multisets, set matches to the cardinality
     630          # of their intersection; this counts the number of matches
     631          # without regard to order, so is clearly an upper bound
     632          if self.fullbcount is None:
     633              self.fullbcount = fullbcount = {}
     634              for elt in self.b:
     635                  fullbcount[elt] = fullbcount.get(elt, 0) + 1
     636          fullbcount = self.fullbcount
     637          # avail[x] is the number of times x appears in 'b' less the
     638          # number of times we've seen it in 'a' so far ... kinda
     639          avail = {}
     640          availhas, matches = avail.__contains__, 0
     641          for elt in self.a:
     642              if availhas(elt):
     643                  numb = avail[elt]
     644              else:
     645                  numb = fullbcount.get(elt, 0)
     646              avail[elt] = numb - 1
     647              if numb > 0:
     648                  matches = matches + 1
     649          return _calculate_ratio(matches, len(self.a) + len(self.b))
     650  
     651      def real_quick_ratio(self):
     652          """Return an upper bound on ratio() very quickly.
     653  
     654          This isn't defined beyond that it is an upper bound on .ratio(), and
     655          is faster to compute than either .ratio() or .quick_ratio().
     656          """
     657  
     658          la, lb = len(self.a), len(self.b)
     659          # can't have more matches than the number of elements in the
     660          # shorter sequence
     661          return _calculate_ratio(min(la, lb), la + lb)
     662  
    # Make the class subscriptable for typing (e.g. SequenceMatcher[str]);
    # subscription returns a types.GenericAlias, per the PEP 585 idiom.
    __class_getitem__ = classmethod(GenericAlias)
     664  
     665  
     666  def get_close_matches(word, possibilities, n=3, cutoff=0.6):
     667      """Use SequenceMatcher to return list of the best "good enough" matches.
     668  
     669      word is a sequence for which close matches are desired (typically a
     670      string).
     671  
     672      possibilities is a list of sequences against which to match word
     673      (typically a list of strings).
     674  
     675      Optional arg n (default 3) is the maximum number of close matches to
     676      return.  n must be > 0.
     677  
     678      Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
     679      that don't score at least that similar to word are ignored.
     680  
     681      The best (no more than n) matches among the possibilities are returned
     682      in a list, sorted by similarity score, most similar first.
     683  
     684      >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
     685      ['apple', 'ape']
     686      >>> import keyword as _keyword
     687      >>> get_close_matches("wheel", _keyword.kwlist)
     688      ['while']
     689      >>> get_close_matches("Apple", _keyword.kwlist)
     690      []
     691      >>> get_close_matches("accept", _keyword.kwlist)
     692      ['except']
     693      """
     694  
     695      if not n >  0:
     696          raise ValueError("n must be > 0: %r" % (n,))
     697      if not 0.0 <= cutoff <= 1.0:
     698          raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
     699      result = []
     700      s = SequenceMatcher()
     701      s.set_seq2(word)
     702      for x in possibilities:
     703          s.set_seq1(x)
     704          if s.real_quick_ratio() >= cutoff and \
     705             s.quick_ratio() >= cutoff and \
     706             s.ratio() >= cutoff:
     707              result.append((s.ratio(), x))
     708  
     709      # Move the best scorers to head of list
     710      result = _nlargest(n, result)
     711      # Strip scores for the best n matches
     712      return [x for score, x in result]
     713  
     714  
     715  def _keep_original_ws(s, tag_s):
     716      """Replace whitespace with the original whitespace characters in `s`"""
     717      return ''.join(
     718          c if tag_c == " " and c.isspace() else tag_c
     719          for c, tag_c in zip(s, tag_s)
     720      )
     721  
     722  
     723  
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(keepends=True)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(keepends=True)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print(''.join(result), end="")
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; the underlying SequenceMatcher class has
          an adaptive notion of "noise" lines that's better than any static
          definition the author has ever been able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writelines()
        method of a file-like object.

        Example:

        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
        ...                                'ore\ntree\nemu\n'.splitlines(True))),
        ...       end="")
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        # Line-level pass: each opcode describes a contiguous run of lines
        # that is equal, deleted, inserted, or replaced.  'replace' runs get
        # the expensive intraline treatment in _fancy_replace().
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError('unknown tag %r' % (tag,))

            yield from g

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in range(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        # Fallback for a 'replace' run with no sufficiently similar lines:
        # emit one whole side, then the other, with no '?' guide lines.
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            yield from g

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print(''.join(results), end="")
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        # (it starts just below cutoff, so a pair found by the search must
        # still reach cutoff -- checked after the loops -- to be used)
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in range(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in range(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError('unknown tag %r' % (tag,))
            yield from self._qformat(aelt, belt, atags, btags)
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        # Recurse on the pieces either side of a synch point: replace when
        # both sides are non-empty, otherwise dump whichever side survives.
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        yield from g

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print(repr(line))
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """
        # Re-insert the original whitespace so guide marks line up under
        # tabbed text; rstrip so an all-blank tag string yields no '?' line.
        atags = _keep_original_ws(aline, atags).rstrip()
        btags = _keep_original_ws(bline, btags).rstrip()

        yield "- " + aline
        if atags:
            yield f"? {atags}\n"

        yield "+ " + bline
        if btags:
            yield f"? {btags}\n"
    1025  
    1026  # With respect to junk, an earlier version of ndiff simply refused to
    1027  # *start* a match with a junk element.  The result was cases like this:
    1028  #     before: private Thread currentThread;
    1029  #     after:  private volatile Thread currentThread;
    1030  # If you consider whitespace to be junk, the longest contiguous match
    1031  # not starting with junk is "e Thread currentThread".  So ndiff reported
    1032  # that "e volatil" was inserted between the 't' and the 'e' in "private".
    1033  # While an accurate view, to people that's absurd.  The current version
    1034  # looks for matching blocks that are entirely junk-free, then extends the
    1035  # longest one of those as far as possible but only with matching junk.
    1036  # So now "currentThread" is matched, then extended to suck up the
    1037  # preceding blank; then "private" is matched, and extended to suck up the
    1038  # following blank; then "Thread" is matched; and finally ndiff reports
    1039  # that "volatile " was inserted before "Thread".  The only quibble
    1040  # remaining is that perhaps it was really the case that " volatile"
    1041  # was inserted after "private".  I can live with that <wink>.
    1042  
    1043  import re
    1044  
    1045  def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
    1046      r"""
    1047      Return True for ignorable line: iff `line` is blank or contains a single '#'.
    1048  
    1049      Examples:
    1050  
    1051      >>> IS_LINE_JUNK('\n')
    1052      True
    1053      >>> IS_LINE_JUNK('  #   \n')
    1054      True
    1055      >>> IS_LINE_JUNK('hello\n')
    1056      False
    1057      """
    1058  
    1059      return pat(line) is not None
    1060  
    1061  def IS_CHARACTER_JUNK(ch, ws=" \t"):
    1062      r"""
    1063      Return True for ignorable character: iff `ch` is a space or tab.
    1064  
    1065      Examples:
    1066  
    1067      >>> IS_CHARACTER_JUNK(' ')
    1068      True
    1069      >>> IS_CHARACTER_JUNK('\t')
    1070      True
    1071      >>> IS_CHARACTER_JUNK('\n')
    1072      False
    1073      >>> IS_CHARACTER_JUNK('x')
    1074      False
    1075      """
    1076  
    1077      return ch in ws
    1078  
    1079  
    1080  ########################################################################
    1081  ###  Unified Diff
    1082  ########################################################################
    1083  
    1084  def _format_range_unified(start, stop):
    1085      'Convert range to the "ed" format'
    1086      # Per the diff spec at http://www.unix.org/single_unix_specification/
    1087      beginning = start + 1     # lines start numbering with one
    1088      length = stop - start
    1089      if length == 1:
    1090          return '{}'.format(beginning)
    1091      if not length:
    1092          beginning -= 1        # empty ranges begin at line just before the range
    1093      return '{},{}'.format(beginning, length)
    1094  
    1095  def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
    1096                   tofiledate='', n=3, lineterm='\n'):
    1097      r"""
    1098      Compare two sequences of lines; generate the delta as a unified diff.
    1099  
    1100      Unified diffs are a compact way of showing line changes and a few
    1101      lines of context.  The number of context lines is set by 'n' which
    1102      defaults to three.
    1103  
    1104      By default, the diff control lines (those with ---, +++, or @@) are
    1105      created with a trailing newline.  This is helpful so that inputs
    1106      created from file.readlines() result in diffs that are suitable for
    1107      file.writelines() since both the inputs and outputs have trailing
    1108      newlines.
    1109  
    1110      For inputs that do not have trailing newlines, set the lineterm
    1111      argument to "" so that the output will be uniformly newline free.
    1112  
    1113      The unidiff format normally has a header for filenames and modification
    1114      times.  Any or all of these may be specified using strings for
    1115      'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    1116      The modification times are normally expressed in the ISO 8601 format.
    1117  
    1118      Example:
    1119  
    1120      >>> for line in unified_diff('one two three four'.split(),
    1121      ...             'zero one tree four'.split(), 'Original', 'Current',
    1122      ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
    1123      ...             lineterm=''):
    1124      ...     print(line)                 # doctest: +NORMALIZE_WHITESPACE
    1125      --- Original        2005-01-26 23:30:50
    1126      +++ Current         2010-04-02 10:20:52
    1127      @@ -1,4 +1,4 @@
    1128      +zero
    1129       one
    1130      -two
    1131      -three
    1132      +tree
    1133       four
    1134      """
    1135  
    1136      _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
    1137      started = False
    1138      for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
    1139          if not started:
    1140              started = True
    1141              fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
    1142              todate = '\t{}'.format(tofiledate) if tofiledate else ''
    1143              yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
    1144              yield '+++ {}{}{}'.format(tofile, todate, lineterm)
    1145  
    1146          first, last = group[0], group[-1]
    1147          file1_range = _format_range_unified(first[1], last[2])
    1148          file2_range = _format_range_unified(first[3], last[4])
    1149          yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
    1150  
    1151          for tag, i1, i2, j1, j2 in group:
    1152              if tag == 'equal':
    1153                  for line in a[i1:i2]:
    1154                      yield ' ' + line
    1155                  continue
    1156              if tag in {'replace', 'delete'}:
    1157                  for line in a[i1:i2]:
    1158                      yield '-' + line
    1159              if tag in {'replace', 'insert'}:
    1160                  for line in b[j1:j2]:
    1161                      yield '+' + line
    1162  
    1163  
    1164  ########################################################################
    1165  ###  Context Diff
    1166  ########################################################################
    1167  
    1168  def _format_range_context(start, stop):
    1169      'Convert range to the "ed" format'
    1170      # Per the diff spec at http://www.unix.org/single_unix_specification/
    1171      beginning = start + 1     # lines start numbering with one
    1172      length = stop - start
    1173      if not length:
    1174          beginning -= 1        # empty ranges begin at line just before the range
    1175      if length <= 1:
    1176          return '{}'.format(beginning)
    1177      return '{},{}'.format(beginning, beginning + length - 1)
    1178  
    1179  # See http://www.unix.org/single_unix_specification/
    1180  def context_diff(a, b, fromfile='', tofile='',
    1181                   fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    1182      r"""
    1183      Compare two sequences of lines; generate the delta as a context diff.
    1184  
    1185      Context diffs are a compact way of showing line changes and a few
    1186      lines of context.  The number of context lines is set by 'n' which
    1187      defaults to three.
    1188  
    1189      By default, the diff control lines (those with *** or ---) are
    1190      created with a trailing newline.  This is helpful so that inputs
    1191      created from file.readlines() result in diffs that are suitable for
    1192      file.writelines() since both the inputs and outputs have trailing
    1193      newlines.
    1194  
    1195      For inputs that do not have trailing newlines, set the lineterm
    1196      argument to "" so that the output will be uniformly newline free.
    1197  
    1198      The context diff format normally has a header for filenames and
    1199      modification times.  Any or all of these may be specified using
    1200      strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    1201      The modification times are normally expressed in the ISO 8601 format.
    1202      If not specified, the strings default to blanks.
    1203  
    1204      Example:
    1205  
    1206      >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
    1207      ...       'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
    1208      ...       end="")
    1209      *** Original
    1210      --- Current
    1211      ***************
    1212      *** 1,4 ****
    1213        one
    1214      ! two
    1215      ! three
    1216        four
    1217      --- 1,4 ----
    1218      + zero
    1219        one
    1220      ! tree
    1221        four
    1222      """
    1223  
    1224      _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
    1225      prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
    1226      started = False
    1227      for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
    1228          if not started:
    1229              started = True
    1230              fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
    1231              todate = '\t{}'.format(tofiledate) if tofiledate else ''
    1232              yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
    1233              yield '--- {}{}{}'.format(tofile, todate, lineterm)
    1234  
    1235          first, last = group[0], group[-1]
    1236          yield '***************' + lineterm
    1237  
    1238          file1_range = _format_range_context(first[1], last[2])
    1239          yield '*** {} ****{}'.format(file1_range, lineterm)
    1240  
    1241          if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
    1242              for tag, i1, i2, _, _ in group:
    1243                  if tag != 'insert':
    1244                      for line in a[i1:i2]:
    1245                          yield prefix[tag] + line
    1246  
    1247          file2_range = _format_range_context(first[3], last[4])
    1248          yield '--- {} ----{}'.format(file2_range, lineterm)
    1249  
    1250          if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
    1251              for tag, _, _, j1, j2 in group:
    1252                  if tag != 'delete':
    1253                      for line in b[j1:j2]:
    1254                          yield prefix[tag] + line
    1255  
    1256  def _check_types(a, b, *args):
    1257      # Checking types is weird, but the alternative is garbled output when
    1258      # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
    1259      # without this check, passing filenames as bytes results in output like
    1260      #   --- b'oldfile.txt'
    1261      #   +++ b'newfile.txt'
    1262      # because of how str.format() incorporates bytes objects.
    1263      if a and not isinstance(a[0], str):
    1264          raise TypeError('lines to compare must be str, not %s (%r)' %
    1265                          (type(a[0]).__name__, a[0]))
    1266      if b and not isinstance(b[0], str):
    1267          raise TypeError('lines to compare must be str, not %s (%r)' %
    1268                          (type(b[0]).__name__, b[0]))
    1269      for arg in args:
    1270          if not isinstance(arg, str):
    1271              raise TypeError('all arguments must be str, not: %r' % (arg,))
    1272  
    1273  def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
    1274                 fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
    1275      r"""
    1276      Compare `a` and `b`, two sequences of lines represented as bytes rather
    1277      than str. This is a wrapper for `dfunc`, which is typically either
    1278      unified_diff() or context_diff(). Inputs are losslessly converted to
    1279      strings so that `dfunc` only has to worry about strings, and encoded
    1280      back to bytes on return. This is necessary to compare files with
    1281      unknown or inconsistent encoding. All other inputs (except `n`) must be
    1282      bytes rather than str.
    1283      """
    1284      def decode(s):
    1285          try:
    1286              return s.decode('ascii', 'surrogateescape')
    1287          except AttributeError as err:
    1288              msg = ('all arguments must be bytes, not %s (%r)' %
    1289                     (type(s).__name__, s))
    1290              raise TypeError(msg) from err
    1291      a = list(map(decode, a))
    1292      b = list(map(decode, b))
    1293      fromfile = decode(fromfile)
    1294      tofile = decode(tofile)
    1295      fromfiledate = decode(fromfiledate)
    1296      tofiledate = decode(tofiledate)
    1297      lineterm = decode(lineterm)
    1298  
    1299      lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
    1300      for line in lines:
    1301          yield line.encode('ascii', 'surrogateescape')
    1302  
    1303  def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    1304      r"""
    1305      Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
    1306  
    1307      Optional keyword parameters `linejunk` and `charjunk` are for filter
    1308      functions, or can be None:
    1309  
    1310      - linejunk: A function that should accept a single string argument and
    1311        return true iff the string is junk.  The default is None, and is
    1312        recommended; the underlying SequenceMatcher class has an adaptive
    1313        notion of "noise" lines.
    1314  
    1315      - charjunk: A function that accepts a character (string of length
    1316        1), and returns true iff the character is junk. The default is
    1317        the module-level function IS_CHARACTER_JUNK, which filters out
    1318        whitespace characters (a blank or tab; note: it's a bad idea to
    1319        include newline in this!).
    1320  
    1321      Tools/scripts/ndiff.py is a command-line front-end to this function.
    1322  
    1323      Example:
    1324  
    1325      >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
    1326      ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
    1327      >>> print(''.join(diff), end="")
    1328      - one
    1329      ?  ^
    1330      + ore
    1331      ?  ^
    1332      - two
    1333      - three
    1334      ?  -
    1335      + tree
    1336      + emu
    1337      """
    1338      return Differ(linejunk, charjunk).compare(a, b)
    1339  
def _mdiff(fromlines, tolines, context=None, linejunk=None,
           charjunk=IS_CHARACTER_JUNK):
    r"""Returns generator yielding marked up from/to side by side differences.

    Arguments:
    fromlines -- list of text lines to compared to tolines
    tolines -- list of text lines to be compared to fromlines
    context -- number of context lines to display on each side of difference,
               if None, all from/to text lines will be generated.
    linejunk -- passed on to ndiff (see ndiff documentation)
    charjunk -- passed on to ndiff (see ndiff documentation)

    This function returns an iterator which returns a tuple:
    (from line tuple, to line tuple, boolean flag)

    from/to line tuple -- (line num, line text)
        line num -- integer or None (to indicate a context separation)
        line text -- original line text with following markers inserted:
            '\0+' -- marks start of added text
            '\0-' -- marks start of deleted text
            '\0^' -- marks start of changed text
            '\1' -- marks end of added/deleted/changed text

    boolean flag -- None indicates context separation, True indicates
        either "from" or "to" line contains a change, otherwise False.

    This function/iterator was originally developed to generate side by side
    file difference for making HTML pages (see HtmlDiff class for example
    usage).

    Note, this function utilizes the ndiff function to generate the side by
    side difference markup.  Optional ndiff arguments may be passed to this
    function and they in turn will be passed to ndiff.
    """
    import re

    # regular expression for finding intraline change indices
    change_re = re.compile(r'(\++|\-+|\^+)')

    # create the difference iterator to generate the differences
    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)

    def _make_line(lines, format_key, side, num_lines=[0,0]):
        """Returns line of text with user's change markup and line formatting.

        lines -- list of lines from the ndiff generator to produce a line of
                 text from.  When producing the line of text to return, the
                 lines used are removed from this list.
        format_key -- '+' return first line in list with "add" markup around
                          the entire line.
                      '-' return first line in list with "delete" markup around
                          the entire line.
                      '?' return first line in list with add/delete/change
                          intraline markup (indices obtained from second line)
                      None return first line in list with no markup
        side -- index into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to be a
                     passed parameter.  It is present as a keyword argument to
                     maintain memory of the current line numbers between calls
                     of this function.  (The mutable default is deliberate.)

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        num_lines[side] += 1
        # Handle case where no user markup is to be added, just return line of
        # text with user's line format to allow for usage of the line number.
        if format_key is None:
            return (num_lines[side],lines.pop(0)[2:])
        # Handle case of intraline changes
        if format_key == '?':
            text, markers = lines.pop(0), lines.pop(0)
            # find intraline changes (store change type and indices in tuples)
            sub_info = []
            def record_sub_info(match_object,sub_info=sub_info):
                sub_info.append([match_object.group(1)[0],match_object.span()])
                return match_object.group(1)
            change_re.sub(record_sub_info,markers)
            # process each tuple inserting our special marks that won't be
            # noticed by an xml/html escaper.  Work right-to-left so earlier
            # spans' indices stay valid as text grows.
            for key,(begin,end) in reversed(sub_info):
                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
            text = text[2:]
        # Handle case of add/delete entire line
        else:
            text = lines.pop(0)[2:]
            # if line of text is just a newline, insert a space so there is
            # something for the user to highlight and see.
            if not text:
                text = ' '
            # insert marks that won't be noticed by an xml/html escaper.
            text = '\0' + format_key + text + '\1'
        # Return line of text, first allow user's line formatter to do its
        # thing (such as adding the line number) then replace the special
        # marks with the user's change markup.
        return (num_lines[side],text)

    def _line_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from a
        differencing iterator, processes them and yields them.  When it can
        it yields both a "from" and a "to" line, otherwise it will yield one
        or the other.  In addition to yielding the lines of from/to text, a
        boolean flag is yielded to indicate if the text line(s) have
        differences in them.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        lines = []
        num_blanks_pending, num_blanks_to_yield = 0, 0
        while True:
            # Load up next 4 lines so we can look ahead, create strings which
            # are a concatenation of the first character of each of the 4 lines
            # so we can do some very readable comparisons.
            # ('X' is the exhaustion sentinel: no real ndiff line starts with it.)
            while len(lines) < 4:
                lines.append(next(diff_lines_iterator, 'X'))
            s = ''.join([line[0] for line in lines])
            if s.startswith('X'):
                # When no more lines, pump out any remaining blank lines so the
                # corresponding add/delete lines get a matching blank line so
                # all line pairs get yielded at the next level.
                num_blanks_to_yield = num_blanks_pending
            elif s.startswith('-?+?'):
                # simple intraline change
                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('--++'):
                # in delete block, add block coming: we do NOT want to get
                # caught up on blank lines yet, just process the delete line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith(('--?+', '--+', '- ')):
                # in delete block and see an intraline change or unchanged line
                # coming: yield the delete line and then blanks
                from_line,to_line = _make_line(lines,'-',0), None
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
            elif s.startswith('-+?'):
                # intraline change
                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('-?+'):
                # intraline change
                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
                continue
            elif s.startswith('-'):
                # delete FROM line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('+--'):
                # in add block, delete block coming: we do NOT want to get
                # caught up on blank lines yet, just process the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(('+ ', '+-')):
                # will be leaving an add block: yield blanks then add line
                from_line, to_line = None, _make_line(lines,'+',1)
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
            elif s.startswith('+'):
                # inside an add block, yield the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(' '):
                # unchanged text, yield it to both sides
                # (lines[:] copy: both calls must see the same first line)
                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
                continue
            # Catch up on the blank lines so when we yield the next from/to
            # pair, they are lined up.
            while(num_blanks_to_yield < 0):
                num_blanks_to_yield += 1
                yield None,('','\n'),True
            while(num_blanks_to_yield > 0):
                num_blanks_to_yield -= 1
                yield ('','\n'),None,True
            if s.startswith('X'):
                return
            else:
                yield from_line,to_line,True

    def _line_pair_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from the line
        iterator.  Its difference from that iterator is that this function
        always yields a pair of from/to text lines (with the change
        indication).  If necessary it will collect single from/to lines
        until it has a from/to pair to yield.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        line_iterator = _line_iterator()
        fromlines,tolines=[],[]
        while True:
            # Collecting lines of text until we have a from/to pair
            while (len(fromlines)==0 or len(tolines)==0):
                try:
                    from_line, to_line, found_diff = next(line_iterator)
                except StopIteration:
                    return
                if from_line is not None:
                    fromlines.append((from_line,found_diff))
                if to_line is not None:
                    tolines.append((to_line,found_diff))
            # Once we have a pair, remove them from the collection and yield it
            from_line, fromDiff = fromlines.pop(0)
            to_line, to_diff = tolines.pop(0)
            yield (from_line,to_line,fromDiff or to_diff)

    # Handle case where user does not want context differencing, just yield
    # them up without doing anything else with them.
    line_pair_iterator = _line_pair_iterator()
    if context is None:
        yield from line_pair_iterator
    # Handle case where user wants context differencing.  We must do some
    # storage of lines until we know for sure that they are to be yielded.
    else:
        context += 1
        lines_to_write = 0
        while True:
            # Store lines up until we find a difference, note use of a
            # circular queue because we only need to keep around what
            # we need for context.
            index, contextLines = 0, [None]*(context)
            found_diff = False
            while(found_diff is False):
                try:
                    from_line, to_line, found_diff = next(line_pair_iterator)
                except StopIteration:
                    return
                i = index % context
                contextLines[i] = (from_line, to_line, found_diff)
                index += 1
            # Yield lines that we have collected so far, but first yield
            # the user's separator.
            if index > context:
                yield None, None, None
                lines_to_write = context
            else:
                lines_to_write = index
                index = 0
            while(lines_to_write):
                i = index % context
                index += 1
                yield contextLines[i]
                lines_to_write -= 1
            # Now yield the context lines after the change
            lines_to_write = context-1
            try:
                while(lines_to_write):
                    from_line, to_line, found_diff = next(line_pair_iterator)
                    # If another change within the context, extend the context
                    if found_diff:
                        lines_to_write = context-1
                    else:
                        lines_to_write -= 1
                    yield from_line, to_line, found_diff
            except StopIteration:
                # Catch exception from next() and return normally
                return
    1608  
    1609  
# Skeleton of the complete XHTML document returned by HtmlDiff.make_file();
# the named %-substitutions are filled in there.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=%(charset)s" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# Default CSS for the diff table; exposed as HtmlDiff._styles so it can be
# overridden by subclassing.
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Skeleton of the side-by-side table produced by HtmlDiff.make_table();
# %(prefix)s makes the anchor ids unique per table.
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static color/navigation legend appended below the table by make_file().
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
    1665  
    1666  class ESC[4;38;5;81mHtmlDiff(ESC[4;38;5;149mobject):
    1667      """For producing HTML side by side comparison with change highlights.
    1668  
    1669      This class can be used to create an HTML table (or a complete HTML file
    1670      containing the table) showing a side by side, line by line comparison
    1671      of text with inter-line and intra-line change highlights.  The table can
    1672      be generated in either full or contextual difference mode.
    1673  
    1674      The following methods are provided for HTML generation:
    1675  
    1676      make_table -- generates HTML for a single side by side table
    1677      make_file -- generates complete HTML file with a single side by side table
    1678  
    1679      See tools/scripts/diff.py for an example usage of this class.
    1680      """
    1681  
    # Templates/styles are bound as class attributes so subclasses can
    # override them to customize the generated HTML.
    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    # Serial number used by _make_prefix() to build per-table anchor ids.
    _default_prefix = 0
    1687  
    1688      def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
    1689                   charjunk=IS_CHARACTER_JUNK):
    1690          """HtmlDiff instance initializer
    1691  
    1692          Arguments:
    1693          tabsize -- tab stop spacing, defaults to 8.
    1694          wrapcolumn -- column number where lines are broken and wrapped,
    1695              defaults to None where lines are not wrapped.
    1696          linejunk,charjunk -- keyword arguments passed into ndiff() (used by
    1697              HtmlDiff() to generate the side by side HTML differences).  See
    1698              ndiff() documentation for argument default values and descriptions.
    1699          """
    1700          self._tabsize = tabsize
    1701          self._wrapcolumn = wrapcolumn
    1702          self._linejunk = linejunk
    1703          self._charjunk = charjunk
    1704  
    1705      def make_file(self, fromlines, tolines, fromdesc='', todesc='',
    1706                    context=False, numlines=5, *, charset='utf-8'):
    1707          """Returns HTML file of side by side comparison with change highlights
    1708  
    1709          Arguments:
    1710          fromlines -- list of "from" lines
    1711          tolines -- list of "to" lines
    1712          fromdesc -- "from" file column header string
    1713          todesc -- "to" file column header string
    1714          context -- set to True for contextual differences (defaults to False
    1715              which shows full differences).
    1716          numlines -- number of context lines.  When context is set True,
    1717              controls number of lines displayed before and after the change.
    1718              When context is False, controls the number of lines to place
    1719              the "next" link anchors before the next change (so click of
    1720              "next" link jumps to just before the change).
    1721          charset -- charset of the HTML document
    1722          """
    1723  
    1724          return (self._file_template % dict(
    1725              styles=self._styles,
    1726              legend=self._legend,
    1727              table=self.make_table(fromlines, tolines, fromdesc, todesc,
    1728                                    context=context, numlines=numlines),
    1729              charset=charset
    1730          )).encode(charset, 'xmlcharrefreplace').decode(charset)
    1731  
    1732      def _tab_newline_replace(self,fromlines,tolines):
    1733          """Returns from/to line lists with tabs expanded and newlines removed.
    1734  
    1735          Instead of tab characters being replaced by the number of spaces
    1736          needed to fill in to the next tab stop, this function will fill
    1737          the space with tab characters.  This is done so that the difference
    1738          algorithms can identify changes in a file when tabs are replaced by
    1739          spaces and vice versa.  At the end of the HTML generation, the tab
    1740          characters will be replaced with a nonbreakable space.
    1741          """
    1742          def expand_tabs(line):
    1743              # hide real spaces
    1744              line = line.replace(' ','\0')
    1745              # expand tabs into spaces
    1746              line = line.expandtabs(self._tabsize)
    1747              # replace spaces from expanded tabs back into tab characters
    1748              # (we'll replace them with markup after we do differencing)
    1749              line = line.replace(' ','\t')
    1750              return line.replace('\0',' ').rstrip('\n')
    1751          fromlines = [expand_tabs(line) for line in fromlines]
    1752          tolines = [expand_tabs(line) for line in tolines]
    1753          return fromlines,tolines
    1754  
    1755      def _split_line(self,data_list,line_num,text):
    1756          """Builds list of text lines by splitting text lines at wrap point
    1757  
    1758          This function will determine if the input text line needs to be
    1759          wrapped (split) into separate lines.  If so, the first wrap point
    1760          will be determined and the first line appended to the output
    1761          text line list.  This function is used recursively to handle
    1762          the second part of the split line to further split it.
    1763          """
    1764          # if blank line or context separator, just add it to the output list
    1765          if not line_num:
    1766              data_list.append((line_num,text))
    1767              return
    1768  
    1769          # if line text doesn't need wrapping, just add it to the output list
    1770          size = len(text)
    1771          max = self._wrapcolumn
    1772          if (size <= max) or ((size -(text.count('\0')*3)) <= max):
    1773              data_list.append((line_num,text))
    1774              return
    1775  
    1776          # scan text looking for the wrap point, keeping track if the wrap
    1777          # point is inside markers
    1778          i = 0
    1779          n = 0
    1780          mark = ''
    1781          while n < max and i < size:
    1782              if text[i] == '\0':
    1783                  i += 1
    1784                  mark = text[i]
    1785                  i += 1
    1786              elif text[i] == '\1':
    1787                  i += 1
    1788                  mark = ''
    1789              else:
    1790                  i += 1
    1791                  n += 1
    1792  
    1793          # wrap point is inside text, break it up into separate lines
    1794          line1 = text[:i]
    1795          line2 = text[i:]
    1796  
    1797          # if wrap point is inside markers, place end marker at end of first
    1798          # line and start marker at beginning of second line because each
    1799          # line will have its own table tag markup around it.
    1800          if mark:
    1801              line1 = line1 + '\1'
    1802              line2 = '\0' + mark + line2
    1803  
    1804          # tack on first line onto the output list
    1805          data_list.append((line_num,line1))
    1806  
    1807          # use this routine again to wrap the remaining text
    1808          self._split_line(data_list,'>',line2)
    1809  
    1810      def _line_wrapper(self,diffs):
    1811          """Returns iterator that splits (wraps) mdiff text lines"""
    1812  
    1813          # pull from/to data and flags from mdiff iterator
    1814          for fromdata,todata,flag in diffs:
    1815              # check for context separators and pass them through
    1816              if flag is None:
    1817                  yield fromdata,todata,flag
    1818                  continue
    1819              (fromline,fromtext),(toline,totext) = fromdata,todata
    1820              # for each from/to line split it at the wrap column to form
    1821              # list of text lines.
    1822              fromlist,tolist = [],[]
    1823              self._split_line(fromlist,fromline,fromtext)
    1824              self._split_line(tolist,toline,totext)
    1825              # yield from/to line in pairs inserting blank lines as
    1826              # necessary when one side has more wrapped lines
    1827              while fromlist or tolist:
    1828                  if fromlist:
    1829                      fromdata = fromlist.pop(0)
    1830                  else:
    1831                      fromdata = ('',' ')
    1832                  if tolist:
    1833                      todata = tolist.pop(0)
    1834                  else:
    1835                      todata = ('',' ')
    1836                  yield fromdata,todata,flag
    1837  
    1838      def _collect_lines(self,diffs):
    1839          """Collects mdiff output into separate lists
    1840  
    1841          Before storing the mdiff from/to data into a list, it is converted
    1842          into a single line of text with HTML markup.
    1843          """
    1844  
    1845          fromlist,tolist,flaglist = [],[],[]
    1846          # pull from/to data and flags from mdiff style iterator
    1847          for fromdata,todata,flag in diffs:
    1848              try:
    1849                  # store HTML markup of the lines into the lists
    1850                  fromlist.append(self._format_line(0,flag,*fromdata))
    1851                  tolist.append(self._format_line(1,flag,*todata))
    1852              except TypeError:
    1853                  # exceptions occur for lines where context separators go
    1854                  fromlist.append(None)
    1855                  tolist.append(None)
    1856              flaglist.append(flag)
    1857          return fromlist,tolist,flaglist
    1858  
    1859      def _format_line(self,side,flag,linenum,text):
    1860          """Returns HTML markup of "from" / "to" text lines
    1861  
    1862          side -- 0 or 1 indicating "from" or "to" text
    1863          flag -- indicates if difference on line
    1864          linenum -- line number (used for line number column)
    1865          text -- line text to be marked up
    1866          """
    1867          try:
    1868              linenum = '%d' % linenum
    1869              id = ' id="%s%s"' % (self._prefix[side],linenum)
    1870          except TypeError:
    1871              # handle blank lines where linenum is '>' or ''
    1872              id = ''
    1873          # replace those things that would get confused with HTML symbols
    1874          text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
    1875  
    1876          # make space non-breakable so they don't get compressed or line wrapped
    1877          text = text.replace(' ','&nbsp;').rstrip()
    1878  
    1879          return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
    1880                 % (id,linenum,text)
    1881  
    1882      def _make_prefix(self):
    1883          """Create unique anchor prefixes"""
    1884  
    1885          # Generate a unique anchor prefix so multiple tables
    1886          # can exist on the same HTML page without conflicts.
    1887          fromprefix = "from%d_" % HtmlDiff._default_prefix
    1888          toprefix = "to%d_" % HtmlDiff._default_prefix
    1889          HtmlDiff._default_prefix += 1
    1890          # store prefixes so line format method has access
    1891          self._prefix = [fromprefix,toprefix]
    1892  
    1893      def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
    1894          """Makes list of "next" links"""
    1895  
    1896          # all anchor names will be generated using the unique "to" prefix
    1897          toprefix = self._prefix[1]
    1898  
    1899          # process change flags, generating middle column of next anchors/links
    1900          next_id = ['']*len(flaglist)
    1901          next_href = ['']*len(flaglist)
    1902          num_chg, in_change = 0, False
    1903          last = 0
    1904          for i,flag in enumerate(flaglist):
    1905              if flag:
    1906                  if not in_change:
    1907                      in_change = True
    1908                      last = i
    1909                      # at the beginning of a change, drop an anchor a few lines
    1910                      # (the context lines) before the change for the previous
    1911                      # link
    1912                      i = max([0,i-numlines])
    1913                      next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
    1914                      # at the beginning of a change, drop a link to the next
    1915                      # change
    1916                      num_chg += 1
    1917                      next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
    1918                           toprefix,num_chg)
    1919              else:
    1920                  in_change = False
    1921          # check for cases where there is no content to avoid exceptions
    1922          if not flaglist:
    1923              flaglist = [False]
    1924              next_id = ['']
    1925              next_href = ['']
    1926              last = 0
    1927              if context:
    1928                  fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
    1929                  tolist = fromlist
    1930              else:
    1931                  fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
    1932          # if not a change on first line, drop a link
    1933          if not flaglist[0]:
    1934              next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
    1935          # redo the last link to link to the top
    1936          next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
    1937  
    1938          return fromlist,tolist,flaglist,next_href,next_id
    1939  
    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                   numlines=5):
        """Return an HTML table of a side by side comparison with change
        highlights.

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is True, this is
            the number of lines shown before and after each change.  When
            context is False, it controls how many lines above a change the
            "next" anchor is placed (so a "next" click lands just before
            the change).
        """
        # anchor prefixes must be unique per table so several tables can
        # coexist on the same page without conflict
        self._make_prefix()

        # expand tabs/newlines now, before markup insertion makes the
        # column arithmetic harder
        fromlines, tolines = self._tab_newline_replace(fromlines, tolines)

        # iterator producing side by side from/to data, contextual or full
        diffs = _mdiff(fromlines, tolines,
                       numlines if context else None,
                       linejunk=self._linejunk,
                       charjunk=self._charjunk)

        # optionally wrap lines that exceed the configured width
        if self._wrapcolumn:
            diffs = self._line_wrapper(diffs)

        # materialize formatted from/to lines and per-row change flags
        fromlist, tolist, flaglist = self._collect_lines(diffs)

        # build the middle column of "next" anchors/links
        fromlist, tolist, flaglist, next_href, next_id = self._convert_flags(
            fromlist, tolist, flaglist, context, numlines)

        row_fmt = ('            <tr><td class="diff_next"%s>%s</td>%s'
                   '<td class="diff_next">%s</td>%s</tr>\n')
        rows = []
        for i, flag in enumerate(flaglist):
            if flag is None:
                # _mdiff yields None on separator lines; skip the bogus one
                # generated for the first line
                if i > 0:
                    rows.append('        </tbody>        \n        <tbody>\n')
            else:
                rows.append(row_fmt % (next_id[i], next_href[i], fromlist[i],
                                       next_href[i], tolist[i]))

        if fromdesc or todesc:
            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % todesc)
        else:
            header_row = ''

        table = self._table_template % dict(
            data_rows=''.join(rows),
            header_row=header_row,
            prefix=self._prefix[1])

        # swap the \0+/\0-/\0^ ... \1 change markers from _mdiff for spans,
        # and tabs for non-breaking spaces
        for marker, markup in (('\0+', '<span class="diff_add">'),
                               ('\0-', '<span class="diff_sub">'),
                               ('\0^', '<span class="diff_chg">'),
                               ('\1', '</span>'),
                               ('\t', '&nbsp;')):
            table = table.replace(marker, markup)
        return table
    2016  
# NOTE(review): 're' appears to be needed only while the module body above
# runs (the import itself is earlier in the file); delete it so it does not
# linger in difflib's namespace.
del re
    2018  
    2019  def restore(delta, which):
    2020      r"""
    2021      Generate one of the two sequences that generated a delta.
    2022  
    2023      Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    2024      lines originating from file 1 or 2 (parameter `which`), stripping off line
    2025      prefixes.
    2026  
    2027      Examples:
    2028  
    2029      >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
    2030      ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
    2031      >>> diff = list(diff)
    2032      >>> print(''.join(restore(diff, 1)), end="")
    2033      one
    2034      two
    2035      three
    2036      >>> print(''.join(restore(diff, 2)), end="")
    2037      ore
    2038      tree
    2039      emu
    2040      """
    2041      try:
    2042          tag = {1: "- ", 2: "+ "}[int(which)]
    2043      except KeyError:
    2044          raise ValueError('unknown delta choice (must be 1 or 2): %r'
    2045                             % which) from None
    2046      prefixes = ("  ", tag)
    2047      for line in delta:
    2048          if line[:2] in prefixes:
    2049              yield line[2:]
    2050  
    2051  def _test():
    2052      import doctest, difflib
    2053      return doctest.testmod(difflib)
    2054  
# Running this file directly executes the module's doctest examples.
if __name__ == "__main__":
    _test()