1 """Text wrapping and filling.
2 """
3
4 # Copyright (C) 1999-2001 Gregory P. Ward.
5 # Copyright (C) 2002, 2003 Python Software Foundation.
6 # Written by Greg Ward <gward@python.net>
7
8 import re
9
# Names exported by a star-import of this module; everything else
# (underscore-prefixed helpers, compiled regexes) is left out.
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
11
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
_whitespace = '\t\n\x0b\x0c\r '


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 0 .. 'tabsize' spaces, depending on its
        position in its line.  If false, each tab is treated as a
        single character.
      tabsize (default: 8)
        Expand tabs in input text to 0 .. 'tabsize' spaces, unless
        'expand_tabs' is false.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
      max_lines (default: None)
        Truncate wrapped lines.
      placeholder (default: ' [...]')
        Append to the last line of truncated text.
    """

    # Translation table mapping every recognized whitespace character
    # to a plain space (used by _munge_whitespace()).
    unicode_whitespace_trans = dict.fromkeys(map(ord, _whitespace), ord(' '))

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    word_punct = r'[\w!"\'&.,?]'
    letter = r'[^\d\W]'
    whitespace = r'[%s]' % re.escape(_whitespace)
    nowhitespace = '[^' + whitespace[1:]
    wordsep_re = re.compile(r'''
        ( # any whitespace
          %(ws)s+
        | # em-dash between words
          (?<=%(wp)s) -{2,} (?=\w)
        | # word, possibly hyphenated
          %(nws)s+? (?:
            # hyphenated word
              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
              (?= %(lt)s -? %(lt)s)
            | # end of word
              (?=%(ws)s|\Z)
            | # em-dash
              (?<=%(wp)s) (?=-{2,}\w)
            )
        )''' % {'wp': word_punct, 'lt': letter,
                'ws': whitespace, 'nws': nowhitespace},
        re.VERBOSE)
    # The pattern fragments are only scaffolding; keep them out of the
    # class namespace.
    del word_punct, letter, nowhitespace

    # This less funky little regex just splits on recognized spaces.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
    del whitespace

    # XXX this is not locale- or charset-aware -- the lowercase class
    # below is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z')               # end of chunk

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True,
                 tabsize=8,
                 *,
                 max_lines=None,
                 placeholder=' [...]'):
        """Store the wrapping options as same-named instance attributes.

        No validation happens here; 'width' and 'placeholder' are
        checked when _wrap_chunks() runs.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens
        self.tabsize = tabsize
        self.max_lines = max_lines
        self.placeholder = placeholder

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs (if 'expand_tabs') and
        convert all other whitespace characters to spaces (if
        'replace_whitespace').  Eg. with the default options
        "foo\\tbar\\n\\nbaz" becomes "foo     bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs(self.tabsize)
        if self.replace_whitespace:
            text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is true, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Plain truthiness (not "is True") so that this agrees with
        # _handle_long_word(), which also tests break_on_hyphens by
        # truth value.
        if self.break_on_hyphens:
            chunks = self.wordsep_re.split(text)
        else:
            chunks = self.wordsep_simple_re.split(text)
        # re.split() with a capturing group yields empty strings
        # between adjacent matches; discard them.
        chunks = [c for c in chunks if c]
        return chunks

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is
        returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                # Widen the single separating space to two spaces.
                chunks[i+1] = "  "
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'reversed_chunks' is the stack
        of pending chunks (next chunk last); both it and 'cur_line' are
        modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            end = space_left
            chunk = reversed_chunks[-1]
            if self.break_on_hyphens and len(chunk) > space_left:
                # break after last hyphen, but only if there are
                # non-hyphens before it
                hyphen = chunk.rfind('-', 0, space_left)
                if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]):
                    end = hyphen + 1
            cur_line.append(chunk[:end])
            reversed_chunks[-1] = chunk[end:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        The 'chunks' list is reversed in place and consumed as lines
        are built.  Raises ValueError if 'width' is not positive, or if
        'max_lines' is set and the placeholder cannot fit on a line.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)
        if self.max_lines is not None:
            # The placeholder lands on the last permitted line; pick the
            # indent that line would carry.
            if self.max_lines > 1:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            if len(indent) + len(self.placeholder.lstrip()) > self.width:
                raise ValueError("placeholder too large for max width")

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)
                cur_len = sum(map(len, cur_line))

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                cur_len -= len(cur_line[-1])
                del cur_line[-1]

            if cur_line:
                if (self.max_lines is None or
                    len(lines) + 1 < self.max_lines or
                    (not chunks or
                     self.drop_whitespace and
                     len(chunks) == 1 and
                     not chunks[0].strip()) and cur_len <= width):
                    # Convert current line back to a string and store it in
                    # list of all lines (return value).
                    lines.append(indent + ''.join(cur_line))
                else:
                    # This is the last permitted line but text remains:
                    # peel chunks off the end until the placeholder fits
                    # after a non-whitespace chunk.
                    while cur_line:
                        if (cur_line[-1].strip() and
                            cur_len + len(self.placeholder) <= width):
                            cur_line.append(self.placeholder)
                            lines.append(indent + ''.join(cur_line))
                            break
                        cur_len -= len(cur_line[-1])
                        del cur_line[-1]
                    else:
                        # Nothing on this line can carry the placeholder:
                        # try the previous line, else emit the placeholder
                        # alone.
                        if lines:
                            prev_line = lines[-1].rstrip()
                            if (len(prev_line) + len(self.placeholder) <=
                                    self.width):
                                lines[-1] = prev_line + self.placeholder
                                break
                        lines.append(indent + self.placeholder.lstrip())
                    break

        return lines

    def _split_chunks(self, text):
        """_split_chunks(text : string) -> [string]

        Normalize whitespace with _munge_whitespace(), then split the
        result into chunks with _split().
        """
        text = self._munge_whitespace(text)
        return self._split(text)

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        chunks = self._split_chunks(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
369
370
371 # -- Convenience interface ---------------------------------------------
372
def wrap(text, width=70, **kwargs):
    """Wrap one paragraph and return the resulting lines as a list.

    The paragraph in 'text' is reflowed so that no line is wider than
    'width' columns.  A TextWrapper is built from 'width' plus any extra
    keyword arguments, and its wrap() method does the work, so the
    TextWrapper defaults apply: tabs are expanded and every other
    whitespace character (newlines included) becomes a space.  See the
    TextWrapper class for the keyword arguments that customize wrapping.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
385
def fill(text, width=70, **kwargs):
    """Fill one paragraph and return it as a single new string.

    The paragraph in 'text' is reflowed so that no line is wider than
    'width' columns.  A TextWrapper is built from 'width' plus any extra
    keyword arguments, and its fill() method does the work.  As with
    wrap(), tabs are expanded and other whitespace characters are
    converted to spaces by default.  See the TextWrapper class for the
    keyword arguments that customize wrapping.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
397
def shorten(text, width, **kwargs):
    """Collapse whitespace in 'text' and truncate it to fit in 'width'.

    Runs of whitespace are first collapsed to single spaces (leading and
    trailing whitespace is removed).  The collapsed text is then filled
    by a TextWrapper limited to a single line, so text that does not fit
    is cut at a word boundary and ends with the placeholder::

        >>> textwrap.shorten("Hello  world!", width=12)
        'Hello world!'
        >>> textwrap.shorten("Hello  world!", width=11)
        'Hello [...]'
    """
    words = text.strip().split()
    collapsed = ' '.join(words)
    wrapper = TextWrapper(width=width, max_lines=1, **kwargs)
    return wrapper.fill(collapsed)
412
413
414 # -- Loosely related functionality -------------------------------------
415
# Lines that contain nothing but spaces and tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the run of leading spaces/tabs on every line that has at
# least one non-blank character.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Strip the leading whitespace shared by every line of `text`.

    Handy for triple-quoted strings that are indented in the source
    code but should start at the left edge when displayed.

    Spaces and tabs both count as whitespace but are never treated as
    interchangeable: the lines "  hello" and "\\thello" have no common
    leading whitespace, so nothing is removed from them.

    Lines made up solely of spaces and tabs are emptied first, so they
    come back as a bare newline and do not affect the margin.
    """
    blanked = _whitespace_only_re.sub('', text)

    # Shrink 'common' to the longest prefix shared with each line's
    # leading whitespace in turn.
    common = None
    for lead in _leading_whitespace_re.findall(blanked):
        if common is None:
            # First non-blank line sets the initial candidate.
            common = lead
            continue
        limit = min(len(common), len(lead))
        same = 0
        while same < limit and common[same] == lead[same]:
            same += 1
        common = common[:same]

    if not common:
        # No non-blank lines, or no shared indentation at all.
        return blanked
    return re.sub(r'(?m)^' + common, '', blanked)
468
469
def indent(text, prefix, predicate=None):
    """Return 'text' with 'prefix' put in front of selected lines.

    'text' is split into lines with their line endings kept.  A line
    gets the prefix when 'predicate(line)' is true.  When no predicate
    is supplied, the prefix goes on every line that still has content
    after stripping whitespace, so empty and whitespace-only lines are
    left untouched.
    """
    def _has_content(line):
        return line.strip()

    wants_prefix = _has_content if predicate is None else predicate
    pieces = [
        prefix + line if wants_prefix(line) else line
        for line in text.splitlines(True)
    ]
    return ''.join(pieces)
486
487
if __name__ == "__main__":
    # Quick manual demo of dedent() when the file is run as a script.
    # Other inputs worth trying by hand:
    #   dedent("\tfoo\n\tbar")
    #   dedent(" \thello there\n \t how are you?")
    _demo_text = "Hello there.\n This is indented."
    print(dedent(_demo_text))