1 """Internationalization and localization support.
2
3 This module provides internationalization (I18N) and localization (L10N)
4 support for your Python programs by providing an interface to the GNU gettext
5 message catalog library.
6
7 I18N refers to the operation by which a program is made aware of multiple
8 languages. L10N refers to the adaptation of your program, once
9 internationalized, to the local language and cultural habits.
10
11 """
12
13 # This module represents the integration of work, contributions, feedback, and
14 # suggestions from the following people:
15 #
16 # Martin von Loewis, who wrote the initial implementation of the underlying
17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
18 # gettext.py implementation.
19 #
20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
21 # which also included a pure-Python implementation to read .mo files if
22 # intlmodule wasn't available.
23 #
24 # James Henstridge, who also wrote a gettext.py module, which has some
25 # interesting, but currently unsupported experimental features: the notion of
26 # a Catalog class and instances, and the ability to add to a catalog file via
27 # a Python API.
28 #
29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
30 # and conformed all C and Python code to Python's coding standards.
31 #
32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
33 # module.
34 #
35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
36 #
37 # TODO:
38 # - Lazy loading of .mo files. Currently the entire catalog is loaded into
39 # memory, but that's probably bad for large translated programs. Instead,
40 # the lexical sort of original strings in GNU .mo files should be exploited
41 # to do binary searches and lazy initializations. Or you might want to use
42 # the undocumented double-hash algorithm for .mo files with hash tables, but
43 # you'll need to study the GNU gettext code to do this.
44 #
45 # - Support Solaris .mo file formats. Unfortunately, we've been unable to
46 # find this format documented anywhere.
47
48
49 import os
50 import re
51 import sys
52
53
# Names exported by ``from gettext import *``.
__all__ = [
    'NullTranslations', 'GNUTranslations', 'Catalog',
    'bindtextdomain', 'find', 'translation', 'install',
    'textdomain', 'dgettext', 'dngettext', 'gettext',
    'ngettext', 'pgettext', 'dpgettext', 'npgettext',
    'dnpgettext',
]

# Catalog directory used whenever a caller does not name one.
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
62
63 # Expression parsing for plural form selection.
64 #
65 # The gettext library supports a small subset of C syntax. The only
66 # incompatible difference is that integer literals starting with zero are
67 # decimal.
68 #
69 # https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
70 # http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
71
# Lexer for plural-form expressions.  The alternatives are tried from top to
# bottom, so whatever the grammar does not recognise ends up in INVALID.
_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)


def _tokenize(plural):
    """Yield the tokens of the expression *plural* as strings.

    Whitespace is skipped.  After the last real token a single empty string
    is yielded as an end-of-input marker.  Raises ValueError as soon as a
    token outside the supported grammar is met.
    """
    for match in _token_pattern.finditer(plural):
        group = match.lastgroup
        if group == 'WHITESPACES':
            continue
        text = match.group(group)
        if group == 'INVALID':
            raise ValueError('invalid token in plural form: %s' % text)
        yield text
    yield ''
96
97
def _error(value):
    """Return (without raising) a ValueError for a parse failure at *value*.

    *value* is the offending token; an empty string means the input ended
    where more tokens were expected.
    """
    if not value:
        return ValueError('unexpected end of plural form')
    return ValueError('unexpected token in plural form: %s' % value)
103
104
# Binary operators of the C subset mapped to their precedence level.  Levels
# start at 1 for the loosest-binding group and grow with binding strength.
_binary_ops = {
    op: level
    for level, group in enumerate((
        ('||',),
        ('&&',),
        ('==', '!='),
        ('<', '>', '<=', '>='),
        ('+', '-'),
        ('*', '/', '%'),
    ), start=1)
    for op in group
}
# C operators whose Python spelling differs; all others are emitted as-is.
_c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}
115
116
def _parse(tokens, priority=-1):
    """Translate one C sub-expression read from *tokens* into Python source.

    *tokens* is an iterator of token strings terminated by an empty string.
    *priority* is the precedence level (see _binary_ops) below which binary
    operators are left for the caller to handle.  The value 0 is used for
    the middle operand of ``?:``; the default -1 is used for a whole
    expression, a parenthesised group and the else-branch of ``?:``.

    Returns ``(source, next_token)``, where next_token is the first token
    that was not consumed.  Raises ValueError on malformed input.
    """
    prefix = ''
    tok = next(tokens)
    # Any number of leading logical negations.
    while tok == '!':
        prefix += 'not '
        tok = next(tokens)

    # Primary operand: parenthesised group, the variable n, or a number.
    if tok == '(':
        inner, tok = _parse(tokens)
        expr = '%s(%s)' % (prefix, inner)
        if tok != ')':
            raise ValueError('unbalanced parenthesis in plural form')
    elif tok == 'n':
        expr = '%s%s' % (prefix, tok)
    else:
        try:
            number = int(tok, 10)
        except ValueError:
            raise _error(tok) from None
        expr = '%s%d' % (prefix, number)
    tok = next(tokens)

    # Level of the previously handled operator; 100 means "none yet".
    prev_level = 100
    while tok in _binary_ops:
        level = _binary_ops[tok]
        if level < priority:
            break
        # Break chained comparisons: C evaluates ``a < b < c`` as
        # ``(a < b) < c``, whereas Python would chain the operators.
        if level in (3, 4) and prev_level in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
            expr = '(%s)' % expr
        # Replace some C operators by their Python equivalents
        py_op = _c2py_ops.get(tok, tok)
        rhs, tok = _parse(tokens, level + 1)
        expr = '%s %s %s' % (expr, py_op, rhs)
        prev_level = level
    if prev_level == priority == 4:  # '<', '>', '<=', '>='
        expr = '(%s)' % expr

    # Conditional operator; only recognised at the loosest priorities.
    if tok == '?' and priority <= 0:
        when_true, tok = _parse(tokens, 0)
        if tok != ':':
            raise _error(tok)
        when_false, tok = _parse(tokens)
        expr = '%s if %s else %s' % (when_true, expr, when_false)
        if priority == 0:
            expr = '(%s)' % expr

    return expr, tok
165
166
def _as_int(n):
    """Check a non-int plural count *n* and return it unchanged.

    The functions generated by c2py() call this when the count they receive
    is not an int.  A value that round() accepts is tolerated with a
    DeprecationWarning; anything else raises TypeError.

    The warning is attributed to the first caller outside this module, so
    it points at the application code however many gettext layers (the
    generated plural function, translation methods, module-level wrappers)
    lie in between.
    """
    try:
        round(n)
    except TypeError:
        raise TypeError('Plural value must be an integer, got %s' %
                        (n.__class__.__name__,)) from None

    import warnings
    getframe = getattr(sys, '_getframe', None)
    if getframe is None:
        # No frame introspection available: assume the common call chain
        # user code -> translation method -> plural function -> _as_int.
        stacklevel = 4
    else:
        stacklevel = 2
        frame = getframe(1)
        # Skip frames that belong to this module.  The plural functions
        # built by c2py() run in a private namespace without __name__, but
        # that namespace holds a reference to this very function.
        while frame.f_back is not None and (
                frame.f_globals.get('__name__') == __name__
                or frame.f_globals.get('_as_int') is _as_int):
            stacklevel += 1
            frame = frame.f_back
    warnings.warn('Plural value must be an integer, got %s' %
                  (n.__class__.__name__,),
                  DeprecationWarning, stacklevel)
    return n
178
179
def c2py(plural):
    """Compile a C plural-form expression into a Python function.

    *plural* is the expression text as used for plural forms in PO files.
    The returned function takes the count ``n`` and returns the plural form
    index as an int; a count that is not an int is first passed through
    _as_int().

    Raises ValueError if the expression is longer than 1000 characters,
    cannot be parsed, or is too deeply nested.
    """

    if len(plural) > 1000:
        raise ValueError('plural form expression is too long')
    try:
        source, leftover = _parse(_tokenize(plural))
        if leftover:
            raise _error(leftover)

        # Reject deeply nested expressions before handing them to the
        # compiler.
        nesting = 0
        for char in source:
            if char == '(':
                nesting += 1
                if nesting > 20:
                    # Python compiler limit is about 90.
                    # The most complex example has 2.
                    raise ValueError('plural form expression is too complex')
            elif char == ')':
                nesting -= 1

        namespace = {'_as_int': _as_int}
        exec('''if True:
            def func(n):
                if not isinstance(n, int):
                    n = _as_int(n)
                return int(%s)
            ''' % source, namespace)
        return namespace['func']
    except RecursionError:
        # Recursion error can be raised in _parse() or exec().
        raise ValueError('plural form expression is too complex')
214
215
def _expand_lang(loc):
    """Return the catalog names to try for locale *loc*, most specific first.

    *loc* is normalized with locale.normalize() and split into language,
    ``_territory``, ``.codeset`` and ``@modifier``.  The result lists every
    combination of the language with a subset of the components that are
    actually present, ending with the bare language.
    """
    import locale
    loc = locale.normalize(loc)

    CODESET, TERRITORY, MODIFIER = 1 << 0, 1 << 1, 1 << 2

    def split_off(text, delimiter):
        # -> (text before the first delimiter, delimiter plus the rest);
        # the second item is '' when the delimiter does not occur.
        head, sep, tail = text.partition(delimiter)
        return head, sep + tail

    # Peel the components off from right to left.
    loc, modifier = split_off(loc, '@')
    loc, codeset = split_off(loc, '.')
    language, territory = split_off(loc, '_')

    mask = 0
    if modifier:
        mask |= MODIFIER
    if codeset:
        mask |= CODESET
    if territory:
        mask |= TERRITORY

    candidates = []
    # Counting down puts the richest combinations first.
    for combo in range(mask, -1, -1):
        if combo & ~mask:
            # This combination needs a component the locale does not have.
            continue
        name = language
        if combo & TERRITORY:
            name += territory
        if combo & CODESET:
            name += codeset
        if combo & MODIFIER:
            name += modifier
        candidates.append(name)
    return candidates
256
257
class NullTranslations:
    """Translation object that has no catalog of its own.

    Every lookup is delegated to the fallback translation if one has been
    added; otherwise the untranslated message is returned (for plural
    lookups, msgid1 when ``n == 1`` and msgid2 otherwise).
    """

    def __init__(self, fp=None):
        """Initialise empty state and, if *fp* is given, call _parse(fp)."""
        self._info = {}
        self._charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Hook for subclasses to load a catalog from *fp*; no-op here."""

    def add_fallback(self, fallback):
        """Append *fallback* to the end of the fallback chain."""
        if not self._fallback:
            self._fallback = fallback
            return
        self._fallback.add_fallback(fallback)

    def gettext(self, message):
        """Return the fallback's translation of *message*, else *message*."""
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Plural lookup: delegate to the fallback or pick by ``n == 1``."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def pgettext(self, context, message):
        """Like gettext(), with the lookup qualified by *context*."""
        if self._fallback:
            return self._fallback.pgettext(context, message)
        return message

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), with the lookup qualified by *context*."""
        if self._fallback:
            return self._fallback.npgettext(context, msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def info(self):
        """Return the metadata dictionary (the internal object, not a copy)."""
        return self._info

    def charset(self):
        """Return the catalog's character set, or None if none is known."""
        return self._charset

    def install(self, names=None):
        """Bind gettext() to ``_`` in the builtins namespace.

        *names*, if given, is an iterable of method names; those among
        'gettext', 'ngettext', 'npgettext' and 'pgettext' are installed in
        builtins as well, and all other entries are ignored.
        """
        import builtins
        namespace = builtins.__dict__
        namespace['_'] = self.gettext
        if names is None:
            return
        exportable = {'gettext', 'ngettext', 'npgettext', 'pgettext'}
        for name in exportable.intersection(names):
            namespace[name] = getattr(self, name)
314
315
class GNUTranslations(NullTranslations):
    """Message catalog read from a GNU gettext binary ``.mo`` file.

    Singular entries are stored in ``self._catalog`` under the msgid and
    plural entries under ``(msgid1, form_index)``.  ``self.plural`` maps a
    count to a form index; it is the Germanic rule (``n != 1``) unless the
    catalog's Plural-Forms header supplies another expression.
    """

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    # The encoding of a msgctxt and a msgid in a .mo file is
    # msgctxt + "\x04" + msgid (gettext version >= 0.15)
    CONTEXT = "%s\x04%s"

    # Acceptable .mo versions
    VERSIONS = (0, 1)

    def _get_versions(self, version):
        """Returns a tuple of major version, minor version"""
        return (version >> 16, version & 0xffff)

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp* (a binary file object) and fills the catalog, the
        metadata returned by info(), the charset and the plural rule.

        Raises OSError if the magic number or major version is not
        recognised, or if the header, an index table entry or a string lies
        outside the file.  Raises ValueError if the Plural-Forms expression
        is rejected by c2py().
        """
        # Delay struct import for speeding up gettext import when .mo files
        # are not used.
        from struct import unpack
        filename = getattr(fp, 'name', '')
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1)  # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # The header consists of five 32-bit words: the magic number, the
        # version, the message count and the offsets of the two index
        # tables.  A file too short for them is reported as OSError, which
        # callers already handle, rather than leaking struct.error.
        if buflen < 4:
            raise OSError(0, 'Bad magic number', filename)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            byteorder = '<'
        elif magic == self.BE_MAGIC:
            byteorder = '>'
        else:
            raise OSError(0, 'Bad magic number', filename)
        if buflen < 20:
            raise OSError(0, 'File is corrupt', filename)
        version, msgcount, masteridx, transidx = unpack(byteorder + '4I',
                                                        buf[4:20])
        ii = byteorder + 'II'

        major_version, minor_version = self._get_versions(version)

        if major_version not in self.VERSIONS:
            raise OSError(0, 'Bad version number ' + str(major_version), filename)

        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(msgcount):
            # Each index entry is a (length, offset) pair of 32-bit words.
            if masteridx + 8 > buflen or transidx + 8 > buflen:
                raise OSError(0, 'File is corrupt', filename)
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise OSError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split(b'\n'):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    # Skip over comment lines:
                    if item.startswith('#-#-#-#-#') and item.endswith('#-#-#-#-#'):
                        continue
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # Continuation line of the previous header field.
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        # A Content-Type without a charset= parameter
                        # leaves the current charset untouched instead of
                        # raising IndexError.
                        pieces = v.split('charset=')
                        if len(pieces) > 1:
                            self._charset = pieces[1]
                    elif k == 'plural-forms':
                        # Expected shape: "nplurals=N; plural=EXPR;".  A
                        # header without that shape keeps the default rule
                        # instead of raising IndexError.
                        fields = v.split(';')
                        if len(fields) > 1:
                            pieces = fields[1].split('plural=')
                            if len(pieces) > 1:
                                self.plural = c2py(pieces[1])
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms: only msgid1 is used as the catalog key.
                msgid1, msgid2 = msg.split(b'\x00')
                forms = tmsg.split(b'\x00')
                msgid1 = str(msgid1, charset)
                for index, form in enumerate(forms):
                    catalog[(msgid1, index)] = str(form, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of *message*.

        If there is no singular entry, the plural entry for a count of 1 is
        tried, then the fallback; failing all, *message* itself is returned.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            tmsg = self._catalog.get((message, self.plural(1)), missing)
        if tmsg is not missing:
            return tmsg
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural form of *msgid1* selected by the count *n*.

        Without a catalog entry the fallback is asked; failing that,
        *msgid1* is returned when ``n == 1`` and *msgid2* otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg

    def pgettext(self, context, message):
        """Like gettext(), looking *message* up within *context*."""
        ctxt_msg_id = self.CONTEXT % (context, message)
        missing = object()
        tmsg = self._catalog.get(ctxt_msg_id, missing)
        if tmsg is missing:
            tmsg = self._catalog.get((ctxt_msg_id, self.plural(1)), missing)
        if tmsg is not missing:
            return tmsg
        if self._fallback:
            return self._fallback.pgettext(context, message)
        return message

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), looking *msgid1* up within *context*."""
        ctxt_msg_id = self.CONTEXT % (context, msgid1)
        try:
            tmsg = self._catalog[ctxt_msg_id, self.plural(n)]
        except KeyError:
            if self._fallback:
                return self._fallback.npgettext(context, msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg
468
469
470 # Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Locate the catalog file(s) for *domain*.

    Candidates have the form ``<localedir>/<lang>/LC_MESSAGES/<domain>.mo``.
    *localedir* defaults to the module's default locale directory.
    *languages* defaults to the colon-separated value of the first
    non-empty variable among LANGUAGE, LC_ALL, LC_MESSAGES and LANG,
    followed by 'C'.  Every language is expanded with _expand_lang(), and
    the search ends when the 'C' locale is reached.

    Returns the first existing path, or None if there is none.  If *all* is
    true, returns the list of all existing paths instead (possibly empty).
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        variables = ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG')
        setting = next(
            (value for value in map(os.environ.get, variables) if value),
            None)
        languages = setting.split(':') if setting else []
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages, dropping duplicates while
    # keeping the order of first appearance
    nelangs = list(dict.fromkeys(
        nelang for lang in languages for nelang in _expand_lang(lang)))
    # select a language
    found = []
    for lang in nelangs:
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if not os.path.exists(mofile):
            continue
        if not all:
            return mofile
        found.append(mofile)
    return found if all else None
505
506
# Cache of parsed catalogs, keyed by (translation class, absolute .mo path).
_translations = {}


def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False):
    """Return a translation object for *domain*.

    All catalogs located by find() are loaded with *class_* (GNUTranslations
    by default) and chained, the first one found acting as the primary
    catalog and the others as its fallbacks.

    If no catalog is found, a NullTranslations instance is returned when
    *fallback* is true; otherwise FileNotFoundError is raised.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=True)
    if not mofiles:
        if fallback:
            return NullTranslations()
        from errno import ENOENT
        raise FileNotFoundError(ENOENT,
                                'No translation file found for domain', domain)
    # Delay copy import for speeding up gettext import when .mo files
    # are not used.
    import copy
    chain = None
    for mofile in mofiles:
        cache_key = (class_, os.path.abspath(mofile))
        # Avoid opening, reading, and parsing the .mo file after it's been
        # done once.
        cached = _translations.get(cache_key)
        if cached is None:
            with open(mofile, 'rb') as fp:
                cached = _translations.setdefault(cache_key, class_(fp))
        # Copy the translation object to allow setting fallbacks and
        # output charset.  All other instance data is shared with the
        # cached object.
        clone = copy.copy(cached)
        if chain is None:
            chain = clone
        else:
            chain.add_fallback(clone)
    return chain
543
544
def install(domain, localedir=None, *, names=None):
    """Look up the translation for *domain* and install it in builtins.

    The lookup uses ``fallback=True``, so a missing catalog installs a
    NullTranslations object instead of raising.  *names* is passed on to
    the translation object's install() method.
    """
    translation(domain, localedir, fallback=True).install(names)
548
549
# Locale directory bound to each domain by bindtextdomain().
_localedirs = {}
# Current global domain; `messages' is used for compatibility with GNU
# gettext.
_current_domain = 'messages'


def textdomain(domain=None):
    """Return the current global domain, first setting it if *domain* is
    not None.
    """
    global _current_domain
    if domain is None:
        return _current_domain
    _current_domain = domain
    return _current_domain
561
562
def bindtextdomain(domain, localedir=None):
    """Return the locale directory bound to *domain*.

    If *localedir* is not None, it is first recorded as the directory for
    *domain*.  A domain without a binding yields the default directory.
    """
    if localedir is None:
        return _localedirs.get(domain, _default_localedir)
    _localedirs[domain] = localedir
    return _localedirs.get(domain, _default_localedir)
568
569
def dgettext(domain, message):
    """Return the translation of *message* from the catalog of *domain*.

    *message* is returned unchanged if loading the catalog raises OSError.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return message
    else:
        return catalog.gettext(message)
576
577
def dngettext(domain, msgid1, msgid2, n):
    """Return the plural translation for count *n* from *domain*.

    If loading the catalog raises OSError, *msgid1* is returned when
    ``n == 1`` and *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.ngettext(msgid1, msgid2, n)
587
588
def dpgettext(domain, context, message):
    """Return the translation of *message* within *context* from *domain*.

    *message* is returned unchanged if loading the catalog raises OSError.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return message
    else:
        return catalog.pgettext(context, message)
595
596
def dnpgettext(domain, context, msgid1, msgid2, n):
    """Return the plural translation for count *n* within *context* from
    *domain*.

    If loading the catalog raises OSError, *msgid1* is returned when
    ``n == 1`` and *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.npgettext(context, msgid1, msgid2, n)
606
607
def gettext(message):
    """Translate *message* using the current global domain."""
    domain = _current_domain
    return dgettext(domain, message)
610
611
def ngettext(msgid1, msgid2, n):
    """Plural translation for count *n* using the current global domain."""
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)
614
615
def pgettext(context, message):
    """Translate *message* within *context* using the current global
    domain.
    """
    domain = _current_domain
    return dpgettext(domain, context, message)
618
619
def npgettext(context, msgid1, msgid2, n):
    """Plural translation for count *n* within *context* using the current
    global domain.
    """
    domain = _current_domain
    return dnpgettext(domain, context, msgid1, msgid2, n)
622
623
624 # dcgettext() has been deemed unnecessary and is not implemented.
625
# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print(_('Hello World'))

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

# Catalog is simply another name for translation().
Catalog = translation