1 """Internationalization and localization support.
2
3 This module provides internationalization (I18N) and localization (L10N)
4 support for your Python programs by providing an interface to the GNU gettext
5 message catalog library.
6
7 I18N refers to the operation by which a program is made aware of multiple
8 languages. L10N refers to the adaptation of your program, once
9 internationalized, to the local language and cultural habits.
10
11 """
12
13 # This module represents the integration of work, contributions, feedback, and
14 # suggestions from the following people:
15 #
16 # Martin von Loewis, who wrote the initial implementation of the underlying
17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
18 # gettext.py implementation.
19 #
20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
21 # which also included a pure-Python implementation to read .mo files if
22 # intlmodule wasn't available.
23 #
24 # James Henstridge, who also wrote a gettext.py module, which has some
25 # interesting, but currently unsupported experimental features: the notion of
26 # a Catalog class and instances, and the ability to add to a catalog file via
27 # a Python API.
28 #
29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
30 # and conformed all C and Python code to Python's coding standards.
31 #
32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
33 # module.
34 #
35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
36 #
37 # TODO:
38 # - Lazy loading of .mo files. Currently the entire catalog is loaded into
39 # memory, but that's probably bad for large translated programs. Instead,
40 # the lexical sort of original strings in GNU .mo files should be exploited
41 # to do binary searches and lazy initializations. Or you might want to use
42 # the undocumented double-hash algorithm for .mo files with hash tables, but
43 # you'll need to study the GNU gettext code to do this.
44 #
45 # - Support Solaris .mo file formats. Unfortunately, we've been unable to
46 # find this format documented anywhere.
47
48
49 import operator
50 import os
51 import re
52 import sys
53
54
# Names exported by a star-import of this module.
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'bindtextdomain', 'find', 'translation', 'install',
           'textdomain', 'dgettext', 'dngettext', 'gettext',
           'ngettext', 'pgettext', 'dpgettext', 'npgettext',
           'dnpgettext'
           ]

# Locale directory used by find() and bindtextdomain() when the caller does
# not supply one: <sys.base_prefix>/share/locale.
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
63
64 # Expression parsing for plural form selection.
65 #
66 # The gettext library supports a small subset of C syntax. The only
67 # incompatible difference is that integer literals starting with zero are
68 # decimal.
69 #
70 # https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
71 # http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
72
# Tokenizer for plural-form expressions.  Each alternative is a named group;
# _tokenize() dispatches on the name of the group that matched.  INVALID is
# listed last so it only matches text that none of the other groups accept.
_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)
85
86
def _tokenize(plural):
    """Yield the tokens of the plural expression *plural*, then ``''``.

    Whitespace is skipped.  The first piece of text that is not a legal
    token raises ValueError.  The trailing empty string marks the end of the
    input for the parser.
    """
    for match in _token_pattern.finditer(plural):
        group_name = match.lastgroup
        if group_name == 'INVALID':
            raise ValueError('invalid token in plural form: %s'
                             % match.group(group_name))
        if group_name != 'WHITESPACES':
            yield match.group(group_name)
    yield ''
97
98
def _error(value):
    """Build (but do not raise) the ValueError for a parse failure at *value*.

    A false *value* (the empty end-of-input token) yields the "unexpected
    end" message; anything else is reported as an unexpected token.
    """
    if not value:
        return ValueError('unexpected end of plural form')
    return ValueError('unexpected token in plural form: %s' % value)
104
105
# Map every supported binary operator to its precedence level.  The groups
# are listed from the loosest-binding operators (level 1) to the
# tightest-binding ones (level 6); _parse() relies on levels 3 and 4 being
# the equality and relational comparisons.
_binary_ops = {
    operator_token: level
    for level, group in enumerate(
        (
            ('||',),
            ('&&',),
            ('==', '!='),
            ('<', '>', '<=', '>='),
            ('+', '-'),
            ('*', '/', '%'),
        ),
        1,
    )
    for operator_token in group
}
# C operators whose Python spelling differs; all others are emitted as-is.
_c2py_ops = {
    '||': 'or',
    '&&': 'and',
    '/': '//',
}
116
117
def _parse(tokens, priority=-1):
    """Translate one (sub)expression from *tokens* into Python source text.

    *tokens* is an iterator of token strings; the empty string is treated as
    end of input.  *priority* is the lowest operator level (see _binary_ops)
    this call may consume; a lower-level operator is left for the caller.
    The default of -1 places no restriction, and 0 is passed for the middle
    operand of ``?:``.

    Returns a ``(source, nexttok)`` pair: the Python expression text built so
    far and the first token that was not consumed.

    Raises ValueError for an unbalanced parenthesis or an unexpected token.
    """
    result = ''
    nexttok = next(tokens)
    # Any number of leading '!' become Python 'not ' prefixes.
    while nexttok == '!':
        result += 'not '
        nexttok = next(tokens)

    # Primary operand: parenthesised sub-expression, the variable n, or a
    # decimal integer literal.
    if nexttok == '(':
        sub, nexttok = _parse(tokens)
        result = '%s(%s)' % (result, sub)
        if nexttok != ')':
            raise ValueError('unbalanced parenthesis in plural form')
    elif nexttok == 'n':
        result = '%s%s' % (result, nexttok)
    else:
        try:
            # Base 10 is forced, so a leading zero does not mean octal.
            value = int(nexttok, 10)
        except ValueError:
            raise _error(nexttok) from None
        result = '%s%d' % (result, value)
    nexttok = next(tokens)

    # j holds the level of the previous operator handled by this call; 100
    # is a placeholder that matches no real level.
    j = 100
    while nexttok in _binary_ops:
        i = _binary_ops[nexttok]
        if i < priority:
            # Binds more loosely than this call may handle: leave it to the
            # caller.
            break
        # Break chained comparisons
        if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
            result = '(%s)' % result
        # Replace some C operators by their Python equivalents
        op = _c2py_ops.get(nexttok, nexttok)
        # The right operand may only contain tighter-binding operators.
        right, nexttok = _parse(tokens, i + 1)
        result = '%s %s %s' % (result, op, right)
        j = i
    if j == priority == 4:  # '<', '>', '<=', '>='
        # A relational expression used as the right operand of an equality
        # operator is wrapped so Python does not chain the comparisons.
        result = '(%s)' % result

    # Conditional operator: only handled when this call is allowed to consume
    # the loosest-binding constructs.
    if nexttok == '?' and priority <= 0:
        if_true, nexttok = _parse(tokens, 0)
        if nexttok != ':':
            raise _error(nexttok)
        if_false, nexttok = _parse(tokens)
        # C's "cond ? a : b" becomes Python's "a if cond else b".
        result = '%s if %s else %s' % (if_true, result, if_false)
        if priority == 0:
            # A conditional nested in another conditional's middle operand
            # is parenthesised.
            result = '(%s)' % result

    return result, nexttok
166
167
def _as_int(n):
    """Check a non-int plural count and return it unchanged.

    A value that round() rejects raises TypeError.  A value that round()
    accepts is returned as-is after a DeprecationWarning is issued; the
    warning's stack level is raised past every calling frame whose module
    name equals this module's.
    """
    try:
        round(n)
    except TypeError:
        raise TypeError('Plural value must be an integer, got %s' %
                        (n.__class__.__name__,)) from None

    import warnings
    stacklevel = 2
    caller = sys._getframe(1)
    while True:
        outer = caller.f_back
        if outer is None or caller.f_globals.get('__name__') != __name__:
            break
        stacklevel += 1
        caller = outer
    message = 'Plural value must be an integer, got %s' % (
        n.__class__.__name__,)
    warnings.warn(message, DeprecationWarning, stacklevel)
    return n
186
187
def c2py(plural):
    """Compile the C plural-form expression *plural* into a Python function.

    The returned function takes the count ``n`` and returns the plural index
    as an int.  ValueError is raised when the expression is longer than 1000
    characters, cannot be parsed, or nests parentheses more than 20 deep.
    """
    if len(plural) > 1000:
        raise ValueError('plural form expression is too long')
    try:
        expression, leftover = _parse(_tokenize(plural))
        if leftover:
            # Something follows a complete expression.
            raise _error(leftover)

        nesting = 0
        for char in expression:
            if char == ')':
                nesting -= 1
            elif char == '(':
                nesting += 1
                if nesting > 20:
                    # Python compiler limit is about 90.
                    # The most complex example has 2.
                    raise ValueError('plural form expression is too complex')

        namespace = {'_as_int': _as_int, '__name__': __name__}
        exec('''if True:
            def func(n):
                if not isinstance(n, int):
                    n = _as_int(n)
                return int(%s)
            ''' % expression, namespace)
        return namespace['func']
    except RecursionError:
        # Recursion error can be raised in _parse() or exec().
        raise ValueError('plural form expression is too complex')
222
223
def _expand_lang(loc):
    """Return the locale names to try for *loc*, most specific first.

    *loc* is normalized with locale.normalize() and split into language,
    territory (``_XX``), codeset (``.ENC``) and modifier (``@mod``).  The
    result lists every combination of the components that are present,
    starting with the full name and ending with the bare language.
    """
    import locale
    CODESET, TERRITORY, MODIFIER = 1 << 0, 1 << 1, 1 << 2

    remainder = locale.normalize(loc)
    # Peel the components off from the right; each keeps its separator.
    remainder, at_sign, tail = remainder.partition('@')
    modifier = at_sign + tail
    remainder, dot, tail = remainder.partition('.')
    codeset = dot + tail
    language, underscore, tail = remainder.partition('_')
    territory = underscore + tail

    present = 0
    if at_sign:
        present |= MODIFIER
    if dot:
        present |= CODESET
    if underscore:
        present |= TERRITORY

    variants = []
    for combo in range(present, -1, -1):
        if combo & ~present:
            # Needs a component this locale does not have.
            continue
        name = language
        if combo & TERRITORY:
            name += territory
        if combo & CODESET:
            name += codeset
        if combo & MODIFIER:
            name += modifier
        variants.append(name)
    return variants
264
265
class NullTranslations:
    """Translation object that hands messages back untranslated.

    Every lookup is delegated to the fallback translation when one has been
    added; otherwise the original message (or the singular/plural msgid
    selected by ``n == 1``) is returned.
    """

    def __init__(self, fp=None):
        """Set up empty state and parse *fp* when it is given."""
        self._info = {}
        self._charset = None
        self._fallback = None
        if fp is None:
            return
        self._parse(fp)

    def _parse(self, fp):
        """Hook for subclasses that read a catalog; does nothing here."""

    def add_fallback(self, fallback):
        """Append *fallback* to the end of the fallback chain."""
        if not self._fallback:
            self._fallback = fallback
        else:
            self._fallback.add_fallback(fallback)

    def gettext(self, message):
        """Return the fallback's translation of *message*, or *message*."""
        fallback = self._fallback
        return fallback.gettext(message) if fallback else message

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural lookup, or pick a msgid by *n*."""
        fallback = self._fallback
        if fallback:
            return fallback.ngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def pgettext(self, context, message):
        """Context-aware variant of gettext()."""
        fallback = self._fallback
        return fallback.pgettext(context, message) if fallback else message

    def npgettext(self, context, msgid1, msgid2, n):
        """Context-aware variant of ngettext()."""
        fallback = self._fallback
        if fallback:
            return fallback.npgettext(context, msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def info(self):
        """Return the catalog metadata dictionary."""
        return self._info

    def charset(self):
        """Return the catalog charset, or None when none was recorded."""
        return self._charset

    def install(self, names=None):
        """Bind ``_`` in builtins to self.gettext.

        Each entry of *names* that is one of 'gettext', 'ngettext',
        'npgettext' or 'pgettext' is additionally installed in builtins
        under its own name; other entries are ignored.
        """
        import builtins
        namespace = vars(builtins)
        namespace['_'] = self.gettext
        if names is None:
            return
        installable = {'gettext', 'ngettext', 'npgettext', 'pgettext'}
        for name in installable.intersection(names):
            namespace[name] = getattr(self, name)
322
323
class GNUTranslations(NullTranslations):
    """Translations loaded from a GNU gettext binary catalog (.mo file).

    Parsing fills ``self._catalog``: a singular entry is keyed by its msgid
    string, a plural entry by ``(msgid1, plural_index)``.  ``self.plural``
    maps a count to a plural index; it defaults to the germanic rule and is
    replaced when the catalog carries a usable Plural-Forms header.
    """

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    # The encoding of a msgctxt and a msgid in a .mo file is
    # msgctxt + "\x04" + msgid (gettext version >= 0.15)
    CONTEXT = "%s\x04%s"

    # Acceptable .mo versions
    VERSIONS = (0, 1)

    def _get_versions(self, version):
        """Returns a tuple of major version, minor version"""
        return (version >> 16, version & 0xffff)

    def _parse(self, fp):
        """Read the .mo data from *fp* into the catalog.

        Override this method to support alternative .mo formats.

        Raises OSError for a bad magic number, an unsupported major version,
        or an entry that points outside the file.  A Content-Type header
        without a ``charset=`` parameter, or a Plural-Forms header without a
        ``plural=`` field, is recorded in the info dictionary but otherwise
        ignored, leaving the charset and plural rule at their defaults.
        """
        # Delay struct import for speeding up gettext import when .mo files
        # are not used.
        from struct import unpack
        filename = getattr(fp, 'name', '')
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1)  # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # The header is five 32-bit words: the magic number, which also
        # tells us the byte order, followed by the version, the message
        # count and the offsets of the two index tables.
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise OSError(0, 'Bad magic number', filename)

        major_version, _minor_version = self._get_versions(version)

        if major_version not in self.VERSIONS:
            raise OSError(0, 'Bad version number ' + str(major_version), filename)

        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(msgcount):
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise OSError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split(b'\n'):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    # Skip over comment lines:
                    if item.startswith('#-#-#-#-#') and item.endswith('#-#-#-#-#'):
                        continue
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # Continuation of the previous header's value.
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        # A Content-Type without a charset parameter keeps
                        # the default instead of raising IndexError.
                        charset_parts = v.split('charset=')
                        if len(charset_parts) > 1:
                            self._charset = charset_parts[1]
                    elif k == 'plural-forms':
                        # Expected shape: "nplurals=N; plural=EXPR;".  A
                        # header lacking either piece keeps the default rule
                        # instead of raising IndexError.
                        fields = v.split(';')
                        if len(fields) > 1:
                            plural_parts = fields[1].split('plural=')
                            if len(plural_parts) > 1:
                                self.plural = c2py(plural_parts[1])
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms: only the singular msgid (the text before the
                # first NUL) is part of the key, so any further NULs in the
                # key are tolerated.
                msgid1 = str(msg.split(b'\x00', 1)[0], charset)
                for index, form in enumerate(tmsg.split(b'\x00')):
                    catalog[(msgid1, index)] = str(form, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of *message*.

        A message stored only as a plural entry is looked up with the plural
        index for a count of 1.  Misses go to the fallback, if any, and
        otherwise return *message* unchanged.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            tmsg = self._catalog.get((message, self.plural(1)), missing)
        if tmsg is not missing:
            return tmsg
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural form of *msgid1* selected by the count *n*.

        Misses go to the fallback, if any; otherwise *msgid1* is returned
        when ``n == 1`` and *msgid2* for any other count.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg

    def pgettext(self, context, message):
        """Like gettext(), with the lookup key qualified by *context*."""
        ctxt_msg_id = self.CONTEXT % (context, message)
        missing = object()
        tmsg = self._catalog.get(ctxt_msg_id, missing)
        if tmsg is missing:
            tmsg = self._catalog.get((ctxt_msg_id, self.plural(1)), missing)
        if tmsg is not missing:
            return tmsg
        if self._fallback:
            return self._fallback.pgettext(context, message)
        return message

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), with the lookup key qualified by *context*."""
        ctxt_msg_id = self.CONTEXT % (context, msgid1)
        try:
            tmsg = self._catalog[ctxt_msg_id, self.plural(n)]
        except KeyError:
            if self._fallback:
                return self._fallback.npgettext(context, msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg
476
477
478 # Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Locate the .mo file(s) for *domain*.

    *localedir* defaults to the module's default locale directory.  When
    *languages* is None it is taken from the first non-empty variable among
    LANGUAGE, LC_ALL, LC_MESSAGES and LANG (split on ':'), with 'C' appended
    if absent.  Every language is expanded with _expand_lang() and the
    candidates are probed in order, stopping at 'C'.

    Returns the first existing path, or None, when *all* is false; returns
    the list of every existing path when *all* is true.
    """
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        env_values = (os.environ.get(name)
                      for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'))
        chosen = next((value for value in env_values if value), None)
        languages = chosen.split(':') if chosen else []
        if 'C' not in languages:
            languages.append('C')

    # Normalize and expand the languages, dropping duplicates but keeping
    # the order of first appearance.
    candidates = []
    for language in languages:
        for variant in _expand_lang(language):
            if variant not in candidates:
                candidates.append(variant)

    found = []
    for language in candidates:
        if language == 'C':
            break
        mofile = os.path.join(localedir, language, 'LC_MESSAGES',
                              '%s.mo' % domain)
        if not os.path.exists(mofile):
            continue
        if not all:
            return mofile
        found.append(mofile)
    return found if all else None
513
514
# Cache of parsed catalogs used by translation(); each key is a
# (translation class, absolute .mo file path) pair and each value is the
# translation object built from that file.
_translations = {}
517
518
def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False):
    """Return a translation object for *domain*.

    All .mo files reported by find() are loaded with *class_* (default
    GNUTranslations).  The first becomes the result and the others are
    chained to it as fallbacks.  When no file is found, a NullTranslations
    instance is returned if *fallback* is true; otherwise FileNotFoundError
    is raised.
    """
    translation_class = GNUTranslations if class_ is None else class_
    mofiles = find(domain, localedir, languages, all=True)
    if not mofiles:
        if not fallback:
            from errno import ENOENT
            raise FileNotFoundError(ENOENT,
                                    'No translation file found for domain',
                                    domain)
        return NullTranslations()

    # Delay copy import for speeding up gettext import when .mo files
    # are not used.
    import copy
    head = None
    for path in mofiles:
        # Avoid opening, reading, and parsing the .mo file after it's been
        # done once.
        cache_key = (translation_class, os.path.abspath(path))
        cached = _translations.get(cache_key)
        if cached is None:
            with open(path, 'rb') as fp:
                cached = _translations.setdefault(cache_key,
                                                  translation_class(fp))
        # Copy the translation object to allow setting fallbacks and
        # output charset.  All other instance data is shared with the
        # cached object.
        instance = copy.copy(cached)
        if head is None:
            head = instance
        else:
            head.add_fallback(instance)
    return head
551
552
def install(domain, localedir=None, *, names=None):
    """Install the translation for *domain* into builtins.

    The translation is obtained with ``fallback=True`` and its install()
    method is called with *names*.
    """
    translation(domain, localedir, fallback=True).install(names)
556
557
# a mapping b/w domains and locale directories; written by bindtextdomain()
# and read by the d*gettext() functions
_localedirs = {}
# current global domain, `messages' used for compatibility w/ GNU gettext;
# changed by textdomain()
_current_domain = 'messages'
562
563
def textdomain(domain=None):
    """Set the global domain when *domain* is given; return the current one."""
    global _current_domain
    if domain is None:
        return _current_domain
    _current_domain = domain
    return domain
569
570
def bindtextdomain(domain, localedir=None):
    """Bind *domain* to *localedir* when given; return the bound directory.

    A domain with no binding reports the module's default locale directory.
    """
    if localedir is None:
        return _localedirs.get(domain, _default_localedir)
    _localedirs[domain] = localedir
    return localedir
576
577
def dgettext(domain, message):
    """Translate *message* in *domain*; return it unchanged on OSError."""
    try:
        catalog = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return message
    else:
        return catalog.gettext(message)
584
585
def dngettext(domain, msgid1, msgid2, n):
    """Plural lookup in *domain*.

    When the translation cannot be loaded (OSError), *msgid1* is returned
    for ``n == 1`` and *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.ngettext(msgid1, msgid2, n)
595
596
def dpgettext(domain, context, message):
    """Context-aware lookup in *domain*; return *message* on OSError."""
    try:
        catalog = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return message
    else:
        return catalog.pgettext(context, message)
603
604
def dnpgettext(domain, context, msgid1, msgid2, n):
    """Context-aware plural lookup in *domain*.

    When the translation cannot be loaded (OSError), *msgid1* is returned
    for ``n == 1`` and *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.npgettext(context, msgid1, msgid2, n)
614
615
def gettext(message):
    """Translate *message* in the current global domain."""
    domain = _current_domain
    return dgettext(domain, message)
618
619
def ngettext(msgid1, msgid2, n):
    """Plural lookup in the current global domain."""
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)
622
623
def pgettext(context, message):
    """Context-aware lookup in the current global domain."""
    domain = _current_domain
    return dpgettext(domain, context, message)
626
627
def npgettext(context, msgid1, msgid2, n):
    """Context-aware plural lookup in the current global domain."""
    domain = _current_domain
    return dnpgettext(domain, context, msgid1, msgid2, n)
630
631
632 # dcgettext() has been deemed unnecessary and is not implemented.
633
# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

# Catalog is simply another name for translation().
Catalog = translation