1  """Text wrapping and filling.
       2  """
       3  
       4  # Copyright (C) 1999-2001 Gregory P. Ward.
       5  # Copyright (C) 2002, 2003 Python Software Foundation.
       6  # Written by Greg Ward <gward@python.net>
       7  
       8  import re
       9  
# Public names exported by a star-import of this module.
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters (tab, newline, vertical tab, form feed, carriage
# return and space).  The main reason for doing this is that some Unicode
# spaces (like \u00a0) are non-breaking whitespaces and must therefore
# not be treated as places where text may be split.
_whitespace = '\t\n\x0b\x0c\r '
      16  
      17  class ESC[4;38;5;81mTextWrapper:
      18      """
      19      Object for wrapping/filling text.  The public interface consists of
      20      the wrap() and fill() methods; the other methods are just there for
      21      subclasses to override in order to tweak the default behaviour.
      22      If you want to completely replace the main wrapping algorithm,
      23      you'll probably have to override _wrap_chunks().
      24  
      25      Several instance attributes control various aspects of wrapping:
      26        width (default: 70)
      27          the maximum width of wrapped lines (unless break_long_words
      28          is false)
      29        initial_indent (default: "")
      30          string that will be prepended to the first line of wrapped
      31          output.  Counts towards the line's width.
      32        subsequent_indent (default: "")
      33          string that will be prepended to all lines save the first
      34          of wrapped output; also counts towards each line's width.
      35        expand_tabs (default: true)
      36          Expand tabs in input text to spaces before further processing.
      37          Each tab will become 0 .. 'tabsize' spaces, depending on its position
      38          in its line.  If false, each tab is treated as a single character.
      39        tabsize (default: 8)
      40          Expand tabs in input text to 0 .. 'tabsize' spaces, unless
      41          'expand_tabs' is false.
      42        replace_whitespace (default: true)
      43          Replace all whitespace characters in the input text by spaces
      44          after tab expansion.  Note that if expand_tabs is false and
      45          replace_whitespace is true, every tab will be converted to a
      46          single space!
      47        fix_sentence_endings (default: false)
      48          Ensure that sentence-ending punctuation is always followed
      49          by two spaces.  Off by default because the algorithm is
      50          (unavoidably) imperfect.
      51        break_long_words (default: true)
      52          Break words longer than 'width'.  If false, those words will not
      53          be broken, and some lines might be longer than 'width'.
      54        break_on_hyphens (default: true)
      55          Allow breaking hyphenated words. If true, wrapping will occur
      56          preferably on whitespaces and right after hyphens part of
      57          compound words.
      58        drop_whitespace (default: true)
      59          Drop leading and trailing whitespace from lines.
      60        max_lines (default: None)
      61          Truncate wrapped lines.
      62        placeholder (default: ' [...]')
      63          Append to the last line of truncated text.
      64      """
      65  
      66      unicode_whitespace_trans = dict.fromkeys(map(ord, _whitespace), ord(' '))
      67  
      68      # This funky little regex is just the trick for splitting
      69      # text up into word-wrappable chunks.  E.g.
      70      #   "Hello there -- you goof-ball, use the -b option!"
      71      # splits into
      72      #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
      73      # (after stripping out empty strings).
      74      word_punct = r'[\w!"\'&.,?]'
      75      letter = r'[^\d\W]'
      76      whitespace = r'[%s]' % re.escape(_whitespace)
      77      nowhitespace = '[^' + whitespace[1:]
      78      wordsep_re = re.compile(r'''
      79          ( # any whitespace
      80            %(ws)s+
      81          | # em-dash between words
      82            (?<=%(wp)s) -{2,} (?=\w)
      83          | # word, possibly hyphenated
      84            %(nws)s+? (?:
      85              # hyphenated word
      86                -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
      87                (?= %(lt)s -? %(lt)s)
      88              | # end of word
      89                (?=%(ws)s|\Z)
      90              | # em-dash
      91                (?<=%(wp)s) (?=-{2,}\w)
      92              )
      93          )''' % {'wp': word_punct, 'lt': letter,
      94                  'ws': whitespace, 'nws': nowhitespace},
      95          re.VERBOSE)
      96      del word_punct, letter, nowhitespace
      97  
      98      # This less funky little regex just split on recognized spaces. E.g.
      99      #   "Hello there -- you goof-ball, use the -b option!"
     100      # splits into
     101      #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
     102      wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
     103      del whitespace
     104  
     105      # XXX this is not locale- or charset-aware -- string.lowercase
     106      # is US-ASCII only (and therefore English-only)
     107      sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
     108                                   r'[\.\!\?]'          # sentence-ending punct.
     109                                   r'[\"\']?'           # optional end-of-quote
     110                                   r'\Z')               # end of chunk
     111  
     112      def __init__(self,
     113                   width=70,
     114                   initial_indent="",
     115                   subsequent_indent="",
     116                   expand_tabs=True,
     117                   replace_whitespace=True,
     118                   fix_sentence_endings=False,
     119                   break_long_words=True,
     120                   drop_whitespace=True,
     121                   break_on_hyphens=True,
     122                   tabsize=8,
     123                   *,
     124                   max_lines=None,
     125                   placeholder=' [...]'):
     126          self.width = width
     127          self.initial_indent = initial_indent
     128          self.subsequent_indent = subsequent_indent
     129          self.expand_tabs = expand_tabs
     130          self.replace_whitespace = replace_whitespace
     131          self.fix_sentence_endings = fix_sentence_endings
     132          self.break_long_words = break_long_words
     133          self.drop_whitespace = drop_whitespace
     134          self.break_on_hyphens = break_on_hyphens
     135          self.tabsize = tabsize
     136          self.max_lines = max_lines
     137          self.placeholder = placeholder
     138  
     139  
     140      # -- Private methods -----------------------------------------------
     141      # (possibly useful for subclasses to override)
     142  
     143      def _munge_whitespace(self, text):
     144          """_munge_whitespace(text : string) -> string
     145  
     146          Munge whitespace in text: expand tabs and convert all other
     147          whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
     148          becomes " foo    bar  baz".
     149          """
     150          if self.expand_tabs:
     151              text = text.expandtabs(self.tabsize)
     152          if self.replace_whitespace:
     153              text = text.translate(self.unicode_whitespace_trans)
     154          return text
     155  
     156  
     157      def _split(self, text):
     158          """_split(text : string) -> [string]
     159  
     160          Split the text to wrap into indivisible chunks.  Chunks are
     161          not quite the same as words; see _wrap_chunks() for full
     162          details.  As an example, the text
     163            Look, goof-ball -- use the -b option!
     164          breaks into the following chunks:
     165            'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
     166            'use', ' ', 'the', ' ', '-b', ' ', 'option!'
     167          if break_on_hyphens is True, or in:
     168            'Look,', ' ', 'goof-ball', ' ', '--', ' ',
     169            'use', ' ', 'the', ' ', '-b', ' ', option!'
     170          otherwise.
     171          """
     172          if self.break_on_hyphens is True:
     173              chunks = self.wordsep_re.split(text)
     174          else:
     175              chunks = self.wordsep_simple_re.split(text)
     176          chunks = [c for c in chunks if c]
     177          return chunks
     178  
     179      def _fix_sentence_endings(self, chunks):
     180          """_fix_sentence_endings(chunks : [string])
     181  
     182          Correct for sentence endings buried in 'chunks'.  Eg. when the
     183          original text contains "... foo.\\nBar ...", munge_whitespace()
     184          and split() will convert that to [..., "foo.", " ", "Bar", ...]
     185          which has one too few spaces; this method simply changes the one
     186          space to two.
     187          """
     188          i = 0
     189          patsearch = self.sentence_end_re.search
     190          while i < len(chunks)-1:
     191              if chunks[i+1] == " " and patsearch(chunks[i]):
     192                  chunks[i+1] = "  "
     193                  i += 2
     194              else:
     195                  i += 1
     196  
     197      def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
     198          """_handle_long_word(chunks : [string],
     199                               cur_line : [string],
     200                               cur_len : int, width : int)
     201  
     202          Handle a chunk of text (most likely a word, not whitespace) that
     203          is too long to fit in any line.
     204          """
     205          # Figure out when indent is larger than the specified width, and make
     206          # sure at least one character is stripped off on every pass
     207          if width < 1:
     208              space_left = 1
     209          else:
     210              space_left = width - cur_len
     211  
     212          # If we're allowed to break long words, then do so: put as much
     213          # of the next chunk onto the current line as will fit.
     214          if self.break_long_words:
     215              end = space_left
     216              chunk = reversed_chunks[-1]
     217              if self.break_on_hyphens and len(chunk) > space_left:
     218                  # break after last hyphen, but only if there are
     219                  # non-hyphens before it
     220                  hyphen = chunk.rfind('-', 0, space_left)
     221                  if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]):
     222                      end = hyphen + 1
     223              cur_line.append(chunk[:end])
     224              reversed_chunks[-1] = chunk[end:]
     225  
     226          # Otherwise, we have to preserve the long word intact.  Only add
     227          # it to the current line if there's nothing already there --
     228          # that minimizes how much we violate the width constraint.
     229          elif not cur_line:
     230              cur_line.append(reversed_chunks.pop())
     231  
     232          # If we're not allowed to break long words, and there's already
     233          # text on the current line, do nothing.  Next time through the
     234          # main loop of _wrap_chunks(), we'll wind up here again, but
     235          # cur_len will be zero, so the next line will be entirely
     236          # devoted to the long word that we can't handle right now.
     237  
     238      def _wrap_chunks(self, chunks):
     239          """_wrap_chunks(chunks : [string]) -> [string]
     240  
     241          Wrap a sequence of text chunks and return a list of lines of
     242          length 'self.width' or less.  (If 'break_long_words' is false,
     243          some lines may be longer than this.)  Chunks correspond roughly
     244          to words and the whitespace between them: each chunk is
     245          indivisible (modulo 'break_long_words'), but a line break can
     246          come between any two chunks.  Chunks should not have internal
     247          whitespace; ie. a chunk is either all whitespace or a "word".
     248          Whitespace chunks will be removed from the beginning and end of
     249          lines, but apart from that whitespace is preserved.
     250          """
     251          lines = []
     252          if self.width <= 0:
     253              raise ValueError("invalid width %r (must be > 0)" % self.width)
     254          if self.max_lines is not None:
     255              if self.max_lines > 1:
     256                  indent = self.subsequent_indent
     257              else:
     258                  indent = self.initial_indent
     259              if len(indent) + len(self.placeholder.lstrip()) > self.width:
     260                  raise ValueError("placeholder too large for max width")
     261  
     262          # Arrange in reverse order so items can be efficiently popped
     263          # from a stack of chucks.
     264          chunks.reverse()
     265  
     266          while chunks:
     267  
     268              # Start the list of chunks that will make up the current line.
     269              # cur_len is just the length of all the chunks in cur_line.
     270              cur_line = []
     271              cur_len = 0
     272  
     273              # Figure out which static string will prefix this line.
     274              if lines:
     275                  indent = self.subsequent_indent
     276              else:
     277                  indent = self.initial_indent
     278  
     279              # Maximum width for this line.
     280              width = self.width - len(indent)
     281  
     282              # First chunk on line is whitespace -- drop it, unless this
     283              # is the very beginning of the text (ie. no lines started yet).
     284              if self.drop_whitespace and chunks[-1].strip() == '' and lines:
     285                  del chunks[-1]
     286  
     287              while chunks:
     288                  l = len(chunks[-1])
     289  
     290                  # Can at least squeeze this chunk onto the current line.
     291                  if cur_len + l <= width:
     292                      cur_line.append(chunks.pop())
     293                      cur_len += l
     294  
     295                  # Nope, this line is full.
     296                  else:
     297                      break
     298  
     299              # The current line is full, and the next chunk is too big to
     300              # fit on *any* line (not just this one).
     301              if chunks and len(chunks[-1]) > width:
     302                  self._handle_long_word(chunks, cur_line, cur_len, width)
     303                  cur_len = sum(map(len, cur_line))
     304  
     305              # If the last chunk on this line is all whitespace, drop it.
     306              if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
     307                  cur_len -= len(cur_line[-1])
     308                  del cur_line[-1]
     309  
     310              if cur_line:
     311                  if (self.max_lines is None or
     312                      len(lines) + 1 < self.max_lines or
     313                      (not chunks or
     314                       self.drop_whitespace and
     315                       len(chunks) == 1 and
     316                       not chunks[0].strip()) and cur_len <= width):
     317                      # Convert current line back to a string and store it in
     318                      # list of all lines (return value).
     319                      lines.append(indent + ''.join(cur_line))
     320                  else:
     321                      while cur_line:
     322                          if (cur_line[-1].strip() and
     323                              cur_len + len(self.placeholder) <= width):
     324                              cur_line.append(self.placeholder)
     325                              lines.append(indent + ''.join(cur_line))
     326                              break
     327                          cur_len -= len(cur_line[-1])
     328                          del cur_line[-1]
     329                      else:
     330                          if lines:
     331                              prev_line = lines[-1].rstrip()
     332                              if (len(prev_line) + len(self.placeholder) <=
     333                                      self.width):
     334                                  lines[-1] = prev_line + self.placeholder
     335                                  break
     336                          lines.append(indent + self.placeholder.lstrip())
     337                      break
     338  
     339          return lines
     340  
     341      def _split_chunks(self, text):
     342          text = self._munge_whitespace(text)
     343          return self._split(text)
     344  
     345      # -- Public interface ----------------------------------------------
     346  
     347      def wrap(self, text):
     348          """wrap(text : string) -> [string]
     349  
     350          Reformat the single paragraph in 'text' so it fits in lines of
     351          no more than 'self.width' columns, and return a list of wrapped
     352          lines.  Tabs in 'text' are expanded with string.expandtabs(),
     353          and all other whitespace characters (including newline) are
     354          converted to space.
     355          """
     356          chunks = self._split_chunks(text)
     357          if self.fix_sentence_endings:
     358              self._fix_sentence_endings(chunks)
     359          return self._wrap_chunks(chunks)
     360  
     361      def fill(self, text):
     362          """fill(text : string) -> string
     363  
     364          Reformat the single paragraph in 'text' to fit in lines of no
     365          more than 'self.width' columns, and return a new string
     366          containing the entire wrapped paragraph.
     367          """
     368          return "\n".join(self.wrap(text))
     369  
     370  
     371  # -- Convenience interface ---------------------------------------------
     372  
     373  def wrap(text, width=70, **kwargs):
     374      """Wrap a single paragraph of text, returning a list of wrapped lines.
     375  
     376      Reformat the single paragraph in 'text' so it fits in lines of no
     377      more than 'width' columns, and return a list of wrapped lines.  By
     378      default, tabs in 'text' are expanded with string.expandtabs(), and
     379      all other whitespace characters (including newline) are converted to
     380      space.  See TextWrapper class for available keyword args to customize
     381      wrapping behaviour.
     382      """
     383      w = TextWrapper(width=width, **kwargs)
     384      return w.wrap(text)
     385  
     386  def fill(text, width=70, **kwargs):
     387      """Fill a single paragraph of text, returning a new string.
     388  
     389      Reformat the single paragraph in 'text' to fit in lines of no more
     390      than 'width' columns, and return a new string containing the entire
     391      wrapped paragraph.  As with wrap(), tabs are expanded and other
     392      whitespace characters converted to space.  See TextWrapper class for
     393      available keyword args to customize wrapping behaviour.
     394      """
     395      w = TextWrapper(width=width, **kwargs)
     396      return w.fill(text)
     397  
     398  def shorten(text, width, **kwargs):
     399      """Collapse and truncate the given text to fit in the given width.
     400  
     401      The text first has its whitespace collapsed.  If it then fits in
     402      the *width*, it is returned as is.  Otherwise, as many words
     403      as possible are joined and then the placeholder is appended::
     404  
     405          >>> textwrap.shorten("Hello  world!", width=12)
     406          'Hello world!'
     407          >>> textwrap.shorten("Hello  world!", width=11)
     408          'Hello [...]'
     409      """
     410      w = TextWrapper(width=width, max_lines=1, **kwargs)
     411      return w.fill(' '.join(text.strip().split()))
     412  
     413  
     414  # -- Loosely related functionality -------------------------------------
     415  
     416  _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
     417  _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
     418  
     419  def dedent(text):
     420      """Remove any common leading whitespace from every line in `text`.
     421  
     422      This can be used to make triple-quoted strings line up with the left
     423      edge of the display, while still presenting them in the source code
     424      in indented form.
     425  
     426      Note that tabs and spaces are both treated as whitespace, but they
     427      are not equal: the lines "  hello" and "\\thello" are
     428      considered to have no common leading whitespace.
     429  
     430      Entirely blank lines are normalized to a newline character.
     431      """
     432      # Look for the longest leading string of spaces and tabs common to
     433      # all lines.
     434      margin = None
     435      text = _whitespace_only_re.sub('', text)
     436      indents = _leading_whitespace_re.findall(text)
     437      for indent in indents:
     438          if margin is None:
     439              margin = indent
     440  
     441          # Current line more deeply indented than previous winner:
     442          # no change (previous winner is still on top).
     443          elif indent.startswith(margin):
     444              pass
     445  
     446          # Current line consistent with and no deeper than previous winner:
     447          # it's the new winner.
     448          elif margin.startswith(indent):
     449              margin = indent
     450  
     451          # Find the largest common whitespace between current line and previous
     452          # winner.
     453          else:
     454              for i, (x, y) in enumerate(zip(margin, indent)):
     455                  if x != y:
     456                      margin = margin[:i]
     457                      break
     458  
     459      # sanity check (testing/debugging only)
     460      if 0 and margin:
     461          for line in text.split("\n"):
     462              assert not line or line.startswith(margin), \
     463                     "line = %r, margin = %r" % (line, margin)
     464  
     465      if margin:
     466          text = re.sub(r'(?m)^' + margin, '', text)
     467      return text
     468  
     469  
     470  def indent(text, prefix, predicate=None):
     471      """Adds 'prefix' to the beginning of selected lines in 'text'.
     472  
     473      If 'predicate' is provided, 'prefix' will only be added to the lines
     474      where 'predicate(line)' is True. If 'predicate' is not provided,
     475      it will default to adding 'prefix' to all non-empty lines that do not
     476      consist solely of whitespace characters.
     477      """
     478      if predicate is None:
     479          def predicate(line):
     480              return line.strip()
     481  
     482      def prefixed_lines():
     483          for line in text.splitlines(True):
     484              yield (prefix + line if predicate(line) else line)
     485      return ''.join(prefixed_lines())
     486  
     487  
if __name__ == "__main__":
    # Manual smoke test, run only when the module is executed as a script.
    # The first line of the sample has no indentation, so the common margin
    # is empty and dedent() returns the text unchanged.
    # Other inputs to try by hand:
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent("  \thello there\n  \t  how are you?"))
    print(dedent("Hello there.\n  This is indented."))