1 #! /usr/bin/env python3
2 # -*- coding: iso-8859-1 -*-
3 # Originally written by Barry Warsaw <barry@python.org>
4 #
5 # Minimally patched to make it even more xgettext compatible
6 # by Peter Funk <pf@artcom-gmbh.de>
7 #
# 2002-11-22 Jürgen Hermann <jh@web.de>
9 # Added checks that _() only contains string literals, and
10 # command line args are resolved to module lists, i.e. you
11 # can now pass a filename, a module or package name, or a
12 # directory (including globbing chars, important for Win32).
13 # Made docstring fit in 80 chars wide displays using pydoc.
14 #
15
# for selftesting
# Bind _() to fintl.gettext when the optional fintl module can be imported;
# otherwise fall back to an identity function so that the _() markers used
# throughout this file still work.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s
22
23 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
24
25 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
26 internationalization of C programs. Most of these tools are independent of
27 the programming language and can be used from within Python programs.
28 Martin von Loewis' work[1] helps considerably in this regard.
29
30 There's one problem though; xgettext is the program that scans source code
31 looking for message strings, but it groks only C (or C++). Python
32 introduces a few wrinkles, such as dual quoting characters, triple quoted
33 strings, and raw strings. xgettext understands none of this.
34
35 Enter pygettext, which uses Python's standard tokenize module to scan
36 Python source code, generating .pot files identical to what GNU xgettext[2]
37 generates for C and C++ code. From there, the standard GNU tools can be
38 used.
39
40 A word about marking Python strings as candidates for translation. GNU
41 xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
42 and gettext_noop. But those can be a lot of text to include all over your
43 code. C and C++ have a trick: they use the C preprocessor. Most
44 internationalized C source includes a #define for gettext() to _() so that
45 what has to be written in the source is much less. Thus these are both
46 translatable strings:
47
48 gettext("Translatable String")
49 _("Translatable String")
50
51 Python of course has no preprocessor so this doesn't work so well. Thus,
52 pygettext searches only for _() by default, but see the -k/--keyword flag
53 below for how to augment this.
54
55 [1] https://www.python.org/workshops/1997-10/proceedings/loewis.html
56 [2] https://www.gnu.org/software/gettext/gettext.html
57
58 NOTE: pygettext attempts to be option and feature compatible with GNU
59 xgettext where ever possible. However some options are still missing or are
60 not fully implemented. Also, xgettext's use of command line switches with
61 option arguments is broken, and in these cases, pygettext just defines
62 additional switches.
63
64 Usage: pygettext [options] inputfile ...
65
66 Options:
67
68 -a
69 --extract-all
70 Extract all strings.
71
72 -d name
73 --default-domain=name
74 Rename the default output file from messages.pot to name.pot.
75
76 -E
77 --escape
78 Replace non-ASCII characters with octal escape sequences.
79
80 -D
81 --docstrings
82 Extract module, class, method, and function docstrings. These do
83 not need to be wrapped in _() markers, and in fact cannot be for
84 Python to consider them docstrings. (See also the -X option).
85
86 -h
87 --help
88 Print this help message and exit.
89
90 -k word
91 --keyword=word
92 Keywords to look for in addition to the default set, which are:
93 %(DEFAULTKEYWORDS)s
94
95 You can have multiple -k flags on the command line.
96
97 -K
98 --no-default-keywords
99 Disable the default set of keywords (see above). Any keywords
100 explicitly added with the -k/--keyword option are still recognized.
101
102 --no-location
103 Do not write filename/lineno location comments.
104
105 -n
106 --add-location
107 Write filename/lineno location comments indicating where each
108 extracted string is found in the source. These lines appear before
109 each msgid. The style of comments is controlled by the -S/--style
110 option. This is the default.
111
112 -o filename
113 --output=filename
114 Rename the default output file from messages.pot to filename. If
115 filename is `-' then the output is sent to standard out.
116
117 -p dir
118 --output-dir=dir
119 Output files will be placed in directory dir.
120
121 -S stylename
122 --style stylename
123 Specify which style to use for location comments. Two styles are
124 supported:
125
126 Solaris # File: filename, line: line-number
127 GNU #: filename:line
128
129 The style name is case insensitive. GNU style is the default.
130
131 -v
132 --verbose
133 Print the names of the files being processed.
134
135 -V
136 --version
137 Print the version of pygettext and exit.
138
139 -w columns
140 --width=columns
141 Set width of output to columns.
142
143 -x filename
144 --exclude-file=filename
145 Specify a file that contains a list of strings that are not be
146 extracted from the input files. Each string to be excluded must
147 appear on a line by itself in the file.
148
149 -X filename
150 --no-docstrings=filename
151 Specify a file that contains a list of files (one per line) that
152 should not have their docstrings extracted. This is only useful in
153 conjunction with the -D option above.
154
155 If `inputfile' is -, standard input is read.
156 """)
157
158 import os
159 import importlib.machinery
160 import importlib.util
161 import sys
162 import glob
163 import time
164 import getopt
165 import ast
166 import token
167 import tokenize
168
# Version string printed by -V/--version and written into the
# "Generated-By" line of the .pot header.
__version__ = '1.5'

# Keywords recognized by default.  DEFAULTKEYWORDS is the comma-separated
# form that is interpolated into the help text.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used to join the string pieces collected inside one keyword call.
EMPTYSTRING = ''
175
176
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.  The %(time)s, %(charset)s, %(encoding)s and %(version)s placeholders
# are filled in by TokenEater.write() when the header is printed.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
197
198
def usage(code, msg=''):
    """Print the help text to stderr and terminate with exit status *code*.

    The help text is the module-level ``__doc__`` interpolated with the
    module globals.  When *msg* is truthy it is printed to stderr after the
    help text.  This function does not return: it always calls sys.exit().
    """
    help_text = __doc__ % globals()
    print(help_text, file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)
204
205
def make_escapes(pass_nonascii):
    """Build the global escape table and select the global escape function.

    When *pass_nonascii* is true, non-ASCII characters are allowed to pass
    through unchanged, so that e.g. 'msgid "Höhe"' does not turn into
    'msgid "H\\366he"': the table covers codes 0..127 and ``escape`` is bound
    to escape_ascii.  Otherwise the table covers all 256 byte values and
    ``escape`` is bound to escape_nonascii.

    In both cases every code outside the printable range 32..126 maps to a
    three-digit octal escape, except that backslash, tab, carriage return,
    newline and the double quote map to their C-style escapes.
    """
    global escapes, escape
    if pass_nonascii:
        table_size, escape = 128, escape_ascii
    else:
        table_size, escape = 256, escape_nonascii
    # Printable ASCII stands for itself; everything else becomes \ooo.
    escapes = [
        chr(code) if 32 <= code < 127 else r"\%03o" % code
        for code in range(table_size)
    ]
    # Characters with a dedicated C-style escape override the defaults above.
    special_cases = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for char, replacement in special_cases:
        escapes[ord(char)] = replacement
225
226
def escape_ascii(s, encoding):
    """Escape only the ASCII characters of *s*; pass the rest through.

    Characters with a code point below 128 are replaced by their entry in
    the global ``escapes`` table; all other characters are kept unchanged.
    *encoding* is accepted for signature parity with escape_nonascii and is
    not used.
    """
    pieces = []
    for char in s:
        code = ord(char)
        pieces.append(escapes[code] if code < 128 else char)
    return ''.join(pieces)
229
def escape_nonascii(s, encoding):
    """Encode *s* with *encoding* and escape the result byte by byte.

    Each byte of the encoded string is replaced by its entry in the global
    ``escapes`` table and the pieces are joined into one str.
    """
    raw = s.encode(encoding)
    return ''.join([escapes[byte] for byte in raw])
232
233
def is_literal_string(s):
    """Return True if the token text *s* looks like a plain string literal.

    A plain literal starts with a quote character, or with a single ``r``,
    ``R``, ``u`` or ``U`` prefix immediately followed by a quote character.
    Anything else (other prefixes such as ``b`` or ``f``, or text too short
    to hold a prefix and a quote) yields False instead of raising
    IndexError.
    """
    quotes = ('"', "'")
    # Slicing instead of indexing keeps the empty string from raising.
    if s[:1] in quotes:
        return True
    return len(s) > 1 and s[0] in 'rRuU' and s[1] in quotes
236
237
def safe_eval(s):
    """Evaluate the string-literal source text *s* and return its value.

    The expression is evaluated with an empty ``__builtins__`` mapping and
    empty locals, so no builtin names are reachable by name.
    """
    # NOTE(review): this is still eval(); emptying __builtins__ is not a real
    # sandbox.  Callers in this file only pass text that is_literal_string()
    # accepted -- keep it that way, or consider ast.literal_eval().
    restricted_globals = {'__builtins__': {}}
    return eval(s, restricted_globals, {})
241
242
def normalize(s, encoding):
    """Render the message *s* as a quoted .po-file string.

    This converts the various Python string types into a format that is
    appropriate for .po files, namely much closer to C style.  A message
    without newlines becomes a single quoted string.  A multi-line message
    becomes an empty first string followed by one quoted string per line,
    each but the last ending in an escaped newline.  Escaping is delegated
    to the global ``escape`` function, which receives *encoding*.
    """
    parts = s.split('\n')
    if len(parts) == 1:
        return '"' + escape(s, encoding) + '"'
    if not parts[-1]:
        # The message ends with a newline: drop the empty trailing piece and
        # put the newline back on the last real line so that it gets escaped.
        parts.pop()
        parts[-1] += '\n'
    escaped = [escape(part, encoding) for part in parts]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
258
259
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'.

    Returns True as soon as one element of 'set' is found in 'str', and
    False when none is found (including when 'set' is empty).
    """
    # any() stops at the first hit instead of building the whole list of
    # results and relying on True == 1.
    return any(c in str for c in set)
263
264
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Resolution order when *name* is not an existing path:

    * if it contains glob characters, it is expanded with glob.glob() and
      every match is resolved recursively;
    * otherwise it is looked up as a module name with
      importlib.util.find_spec() and the spec's origin is used as the path.

    An existing directory yields every file below it whose extension is the
    first entry of importlib.machinery.SOURCE_SUFFIXES, skipping directories
    named 'CVS'.  An existing file yields a one-element list.  Anything that
    cannot be resolved yields an empty list.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            matches = []
            for path in glob.glob(name):
                matches.extend(getFilesForName(path))
            return matches

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            # ImportError: a parent package could not be imported.
            # ValueError: find_spec() rejected the name.
            spec = None
        # find_spec() returns None (it does not raise) for an unknown module.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # get extension for python source files
        py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        # find all python files in directory
        sources = []
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories; pruning dirs in place is
            # how os.walk() is told to skip a subtree
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to the result
            sources.extend(
                os.path.join(root, filename) for filename in files
                if os.path.splitext(filename)[1] == py_ext
            )
        return sources

    if os.path.exists(name):
        # a single file
        return [name]

    return []
307
308
class TokenEater:
    """State machine that consumes tokenize tokens and collects messages.

    An instance is called once per token (see __call__).  The current state
    is a bound method stored in ``self.__state``; each state method looks at
    one token and may switch to another state.  Collected messages are kept
    in ``self.__messages``, a dict mapping each message string to a dict of
    ``(filename, lineno) -> isdocstring`` entries, and are written out in
    .pot format by write().

    The *options* object must provide the attributes read here: docstrings,
    nodocstrings, keywords, toexclude, writelocations, locationstyle, width
    and the GNU / SOLARIS constants.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        """Feed one token (the five fields of a tokenize token) to the
        current state.  Only the type, the text and the start row are used.
        """
        # dispatch
        self.__state(ttype, tstring, stup[0])

    def __warn_call(self, template, call, tstring, lineno):
        """Print a warning about a keyword call found inside an f-string.

        *template* is a %-format string using the keys 'file', 'lineno' and
        'source_segment'; the source segment falls back to the whole token
        text when ast cannot provide the segment of *call*.
        """
        print(template % {
            'source_segment': ast.get_source_segment(tstring, call) or tstring,
            'file': self.__curfile,
            'lineno': lineno
        }, file=sys.stderr)

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for docstrings, keywords and f-string calls."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
                    return
                # any other token means the module has no docstring
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen
            return
        if ttype == tokenize.STRING:
            # Only f-strings are of interest here: look for keyword calls
            # inside their replacement fields.
            maybe_fstring = ast.parse(tstring, mode='eval').body
            if not isinstance(maybe_fstring, ast.JoinedStr):
                return
            for value in maybe_fstring.values:
                if not isinstance(value, ast.FormattedValue):
                    continue
                for call in ast.walk(value):
                    if not isinstance(call, ast.Call):
                        continue
                    func = call.func
                    if isinstance(func, ast.Name):
                        func_name = func.id
                    elif isinstance(func, ast.Attribute):
                        func_name = func.attr
                    else:
                        continue

                    if func_name not in opts.keywords:
                        continue
                    if len(call.args) != 1:
                        self.__warn_call(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected amount of'
                            ' positional arguments in gettext call: %(source_segment)s'
                            ), call, tstring, lineno)
                        continue
                    if call.keywords:
                        self.__warn_call(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
                            ' in gettext call: %(source_segment)s'
                            ), call, tstring, lineno)
                        continue
                    arg = call.args[0]
                    if not isinstance(arg, ast.Constant):
                        self.__warn_call(_(
                            '*** %(file)s:%(lineno)s: Seen unexpected argument type'
                            ' in gettext call: %(source_segment)s'
                            ), call, tstring, lineno)
                        continue
                    if isinstance(arg.value, str):
                        self.__addentry(arg.value, lineno)

    def __suiteseen(self, ttype, tstring, lineno):
        """After 'class' or 'def': wait for the colon that ends the header."""
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """After a class/def header: record the docstring if there is one."""
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # NL must be skipped as well: tokenize emits it after every
            # comment-only or blank line, so without it a comment before the
            # docstring would end the search.
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword name: only an opening parenthesis starts a call."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): collect string literals until ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
                           tokenize.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record *msg* at (current file, *lineno*) unless it is excluded.

        *lineno* defaults to the line of the opening parenthesis remembered
        by __keywordseen.
        """
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file: remember its name and re-arm the
        module-docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to *fp* in .pot form.

        The charset in the header and the encoding passed to normalize() are
        taken from ``fp.encoding``, falling back to 'UTF-8' when it is unset.
        """
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)
514
515
def main():
    """Command-line entry point: parse options, scan inputs, write the .pot.

    Options are read from sys.argv with getopt.  Each positional argument is
    expanded with getFilesForName() ('-' means standard input), every
    resulting file is tokenized and fed to a TokenEater, and the collected
    messages are written to the output file ('-' means standard output).
    Invalid options end the program through usage(); an unreadable exclude
    file ends it with exit status 1.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # takes a filename, like its short form -X
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0

        def __init__(self):
            # mutable containers are created per instance rather than being
            # shared through the class
            self.keywords = []
            self.nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    # Strip only the line terminator; slicing off the last
                    # character would damage a final line that has none.
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                # One string per line.  The line terminator is removed so
                # that an entry can actually equal an extracted message.
                options.toexclude = [line.rstrip('\n') for line in fp]
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            # tokenize.tokenize() needs a readline that returns bytes
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                for _token in tokenize.tokenize(fp.readline):
                    eater(*_token)
            except tokenize.TokenError as e:
                # report the problem and carry on with the next file
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
674
675
# Run the command-line tool when this file is executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # The two expressions below run after main() returns and their values are
    # discarded; presumably they only serve as extra input when pygettext is
    # run on its own source (see "for selftesting" at the top) -- TODO confirm.
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')