1  """Internationalization and localization support.
       2  
       3  This module provides internationalization (I18N) and localization (L10N)
       4  support for your Python programs by providing an interface to the GNU gettext
       5  message catalog library.
       6  
       7  I18N refers to the operation by which a program is made aware of multiple
       8  languages.  L10N refers to the adaptation of your program, once
       9  internationalized, to the local language and cultural habits.
      10  
      11  """
      12  
      13  # This module represents the integration of work, contributions, feedback, and
      14  # suggestions from the following people:
      15  #
      16  # Martin von Loewis, who wrote the initial implementation of the underlying
      17  # C-based libintlmodule (later renamed _gettext), along with a skeletal
      18  # gettext.py implementation.
      19  #
      20  # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
      21  # which also included a pure-Python implementation to read .mo files if
      22  # intlmodule wasn't available.
      23  #
      24  # James Henstridge, who also wrote a gettext.py module, which has some
      25  # interesting, but currently unsupported experimental features: the notion of
      26  # a Catalog class and instances, and the ability to add to a catalog file via
      27  # a Python API.
      28  #
      29  # Barry Warsaw integrated these modules, wrote the .install() API and code,
      30  # and conformed all C and Python code to Python's coding standards.
      31  #
      32  # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
      33  # module.
      34  #
      35  # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
      36  #
      37  # TODO:
      38  # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
      39  #   memory, but that's probably bad for large translated programs.  Instead,
      40  #   the lexical sort of original strings in GNU .mo files should be exploited
      41  #   to do binary searches and lazy initializations.  Or you might want to use
      42  #   the undocumented double-hash algorithm for .mo files with hash tables, but
      43  #   you'll need to study the GNU gettext code to do this.
      44  #
      45  # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
      46  #   find this format documented anywhere.
      47  
      48  
      49  import operator
      50  import os
      51  import re
      52  import sys
      53  
      54  
      55  __all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
      56             'bindtextdomain', 'find', 'translation', 'install',
      57             'textdomain', 'dgettext', 'dngettext', 'gettext',
      58             'ngettext', 'pgettext', 'dpgettext', 'npgettext',
      59             'dnpgettext'
      60             ]
      61  
      62  _default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
      63  
      64  # Expression parsing for plural form selection.
      65  #
      66  # The gettext library supports a small subset of C syntax.  The only
      67  # incompatible difference is that integer literals starting with zero are
      68  # decimal.
      69  #
      70  # https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
      71  # http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
      72  
# Tokenizer for plural-form expressions.  Every alternative is a named group,
# so the name of the group that matched (``match.lastgroup``) tells the caller
# which kind of token was found.  INVALID is listed last and therefore only
# matches text that none of the preceding alternatives accepted.
_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)
      85  
      86  
      87  def _tokenize(plural):
      88      for mo in re.finditer(_token_pattern, plural):
      89          kind = mo.lastgroup
      90          if kind == 'WHITESPACES':
      91              continue
      92          value = mo.group(kind)
      93          if kind == 'INVALID':
      94              raise ValueError('invalid token in plural form: %s' % value)
      95          yield value
      96      yield ''
      97  
      98  
      99  def _error(value):
     100      if value:
     101          return ValueError('unexpected token in plural form: %s' % value)
     102      else:
     103          return ValueError('unexpected end of plural form')
     104  
     105  
     106  _binary_ops = (
     107      ('||',),
     108      ('&&',),
     109      ('==', '!='),
     110      ('<', '>', '<=', '>='),
     111      ('+', '-'),
     112      ('*', '/', '%'),
     113  )
     114  _binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
     115  _c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}
     116  
     117  
     118  def _parse(tokens, priority=-1):
     119      result = ''
     120      nexttok = next(tokens)
     121      while nexttok == '!':
     122          result += 'not '
     123          nexttok = next(tokens)
     124  
     125      if nexttok == '(':
     126          sub, nexttok = _parse(tokens)
     127          result = '%s(%s)' % (result, sub)
     128          if nexttok != ')':
     129              raise ValueError('unbalanced parenthesis in plural form')
     130      elif nexttok == 'n':
     131          result = '%s%s' % (result, nexttok)
     132      else:
     133          try:
     134              value = int(nexttok, 10)
     135          except ValueError:
     136              raise _error(nexttok) from None
     137          result = '%s%d' % (result, value)
     138      nexttok = next(tokens)
     139  
     140      j = 100
     141      while nexttok in _binary_ops:
     142          i = _binary_ops[nexttok]
     143          if i < priority:
     144              break
     145          # Break chained comparisons
     146          if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
     147              result = '(%s)' % result
     148          # Replace some C operators by their Python equivalents
     149          op = _c2py_ops.get(nexttok, nexttok)
     150          right, nexttok = _parse(tokens, i + 1)
     151          result = '%s %s %s' % (result, op, right)
     152          j = i
     153      if j == priority == 4:  # '<', '>', '<=', '>='
     154          result = '(%s)' % result
     155  
     156      if nexttok == '?' and priority <= 0:
     157          if_true, nexttok = _parse(tokens, 0)
     158          if nexttok != ':':
     159              raise _error(nexttok)
     160          if_false, nexttok = _parse(tokens)
     161          result = '%s if %s else %s' % (if_true, result, if_false)
     162          if priority == 0:
     163              result = '(%s)' % result
     164  
     165      return result, nexttok
     166  
     167  
     168  def _as_int(n):
     169      try:
     170          round(n)
     171      except TypeError:
     172          raise TypeError('Plural value must be an integer, got %s' %
     173                          (n.__class__.__name__,)) from None
     174  
     175      import warnings
     176      frame = sys._getframe(1)
     177      stacklevel = 2
     178      while frame.f_back is not None and frame.f_globals.get('__name__') == __name__:
     179          stacklevel += 1
     180          frame = frame.f_back
     181      warnings.warn('Plural value must be an integer, got %s' %
     182                    (n.__class__.__name__,),
     183                    DeprecationWarning,
     184                    stacklevel)
     185      return n
     186  
     187  
     188  def c2py(plural):
     189      """Gets a C expression as used in PO files for plural forms and returns a
     190      Python function that implements an equivalent expression.
     191      """
     192  
     193      if len(plural) > 1000:
     194          raise ValueError('plural form expression is too long')
     195      try:
     196          result, nexttok = _parse(_tokenize(plural))
     197          if nexttok:
     198              raise _error(nexttok)
     199  
     200          depth = 0
     201          for c in result:
     202              if c == '(':
     203                  depth += 1
     204                  if depth > 20:
     205                      # Python compiler limit is about 90.
     206                      # The most complex example has 2.
     207                      raise ValueError('plural form expression is too complex')
     208              elif c == ')':
     209                  depth -= 1
     210  
     211          ns = {'_as_int': _as_int, '__name__': __name__}
     212          exec('''if True:
     213              def func(n):
     214                  if not isinstance(n, int):
     215                      n = _as_int(n)
     216                  return int(%s)
     217              ''' % result, ns)
     218          return ns['func']
     219      except RecursionError:
     220          # Recursion error can be raised in _parse() or exec().
     221          raise ValueError('plural form expression is too complex')
     222  
     223  
     224  def _expand_lang(loc):
     225      import locale
     226      loc = locale.normalize(loc)
     227      COMPONENT_CODESET   = 1 << 0
     228      COMPONENT_TERRITORY = 1 << 1
     229      COMPONENT_MODIFIER  = 1 << 2
     230      # split up the locale into its base components
     231      mask = 0
     232      pos = loc.find('@')
     233      if pos >= 0:
     234          modifier = loc[pos:]
     235          loc = loc[:pos]
     236          mask |= COMPONENT_MODIFIER
     237      else:
     238          modifier = ''
     239      pos = loc.find('.')
     240      if pos >= 0:
     241          codeset = loc[pos:]
     242          loc = loc[:pos]
     243          mask |= COMPONENT_CODESET
     244      else:
     245          codeset = ''
     246      pos = loc.find('_')
     247      if pos >= 0:
     248          territory = loc[pos:]
     249          loc = loc[:pos]
     250          mask |= COMPONENT_TERRITORY
     251      else:
     252          territory = ''
     253      language = loc
     254      ret = []
     255      for i in range(mask+1):
     256          if not (i & ~mask):  # if all components for this combo exist ...
     257              val = language
     258              if i & COMPONENT_TERRITORY: val += territory
     259              if i & COMPONENT_CODESET:   val += codeset
     260              if i & COMPONENT_MODIFIER:  val += modifier
     261              ret.append(val)
     262      ret.reverse()
     263      return ret
     264  
     265  
     266  class ESC[4;38;5;81mNullTranslations:
     267      def __init__(self, fp=None):
     268          self._info = {}
     269          self._charset = None
     270          self._fallback = None
     271          if fp is not None:
     272              self._parse(fp)
     273  
     274      def _parse(self, fp):
     275          pass
     276  
     277      def add_fallback(self, fallback):
     278          if self._fallback:
     279              self._fallback.add_fallback(fallback)
     280          else:
     281              self._fallback = fallback
     282  
     283      def gettext(self, message):
     284          if self._fallback:
     285              return self._fallback.gettext(message)
     286          return message
     287  
     288      def ngettext(self, msgid1, msgid2, n):
     289          if self._fallback:
     290              return self._fallback.ngettext(msgid1, msgid2, n)
     291          if n == 1:
     292              return msgid1
     293          else:
     294              return msgid2
     295  
     296      def pgettext(self, context, message):
     297          if self._fallback:
     298              return self._fallback.pgettext(context, message)
     299          return message
     300  
     301      def npgettext(self, context, msgid1, msgid2, n):
     302          if self._fallback:
     303              return self._fallback.npgettext(context, msgid1, msgid2, n)
     304          if n == 1:
     305              return msgid1
     306          else:
     307              return msgid2
     308  
     309      def info(self):
     310          return self._info
     311  
     312      def charset(self):
     313          return self._charset
     314  
     315      def install(self, names=None):
     316          import builtins
     317          builtins.__dict__['_'] = self.gettext
     318          if names is not None:
     319              allowed = {'gettext', 'ngettext', 'npgettext', 'pgettext'}
     320              for name in allowed & set(names):
     321                  builtins.__dict__[name] = getattr(self, name)
     322  
     323  
     324  class ESC[4;38;5;81mGNUTranslations(ESC[4;38;5;149mNullTranslations):
     325      # Magic number of .mo files
     326      LE_MAGIC = 0x950412de
     327      BE_MAGIC = 0xde120495
     328  
     329      # The encoding of a msgctxt and a msgid in a .mo file is
     330      # msgctxt + "\x04" + msgid (gettext version >= 0.15)
     331      CONTEXT = "%s\x04%s"
     332  
     333      # Acceptable .mo versions
     334      VERSIONS = (0, 1)
     335  
     336      def _get_versions(self, version):
     337          """Returns a tuple of major version, minor version"""
     338          return (version >> 16, version & 0xffff)
     339  
     340      def _parse(self, fp):
     341          """Override this method to support alternative .mo formats."""
     342          # Delay struct import for speeding up gettext import when .mo files
     343          # are not used.
     344          from struct import unpack
     345          filename = getattr(fp, 'name', '')
     346          # Parse the .mo file header, which consists of 5 little endian 32
     347          # bit words.
     348          self._catalog = catalog = {}
     349          self.plural = lambda n: int(n != 1) # germanic plural by default
     350          buf = fp.read()
     351          buflen = len(buf)
     352          # Are we big endian or little endian?
     353          magic = unpack('<I', buf[:4])[0]
     354          if magic == self.LE_MAGIC:
     355              version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
     356              ii = '<II'
     357          elif magic == self.BE_MAGIC:
     358              version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
     359              ii = '>II'
     360          else:
     361              raise OSError(0, 'Bad magic number', filename)
     362  
     363          major_version, minor_version = self._get_versions(version)
     364  
     365          if major_version not in self.VERSIONS:
     366              raise OSError(0, 'Bad version number ' + str(major_version), filename)
     367  
     368          # Now put all messages from the .mo file buffer into the catalog
     369          # dictionary.
     370          for i in range(0, msgcount):
     371              mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
     372              mend = moff + mlen
     373              tlen, toff = unpack(ii, buf[transidx:transidx+8])
     374              tend = toff + tlen
     375              if mend < buflen and tend < buflen:
     376                  msg = buf[moff:mend]
     377                  tmsg = buf[toff:tend]
     378              else:
     379                  raise OSError(0, 'File is corrupt', filename)
     380              # See if we're looking at GNU .mo conventions for metadata
     381              if mlen == 0:
     382                  # Catalog description
     383                  lastk = None
     384                  for b_item in tmsg.split(b'\n'):
     385                      item = b_item.decode().strip()
     386                      if not item:
     387                          continue
     388                      # Skip over comment lines:
     389                      if item.startswith('#-#-#-#-#') and item.endswith('#-#-#-#-#'):
     390                          continue
     391                      k = v = None
     392                      if ':' in item:
     393                          k, v = item.split(':', 1)
     394                          k = k.strip().lower()
     395                          v = v.strip()
     396                          self._info[k] = v
     397                          lastk = k
     398                      elif lastk:
     399                          self._info[lastk] += '\n' + item
     400                      if k == 'content-type':
     401                          self._charset = v.split('charset=')[1]
     402                      elif k == 'plural-forms':
     403                          v = v.split(';')
     404                          plural = v[1].split('plural=')[1]
     405                          self.plural = c2py(plural)
     406              # Note: we unconditionally convert both msgids and msgstrs to
     407              # Unicode using the character encoding specified in the charset
     408              # parameter of the Content-Type header.  The gettext documentation
     409              # strongly encourages msgids to be us-ascii, but some applications
     410              # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
     411              # traditional gettext applications, the msgid conversion will
     412              # cause no problems since us-ascii should always be a subset of
     413              # the charset encoding.  We may want to fall back to 8-bit msgids
     414              # if the Unicode conversion fails.
     415              charset = self._charset or 'ascii'
     416              if b'\x00' in msg:
     417                  # Plural forms
     418                  msgid1, msgid2 = msg.split(b'\x00')
     419                  tmsg = tmsg.split(b'\x00')
     420                  msgid1 = str(msgid1, charset)
     421                  for i, x in enumerate(tmsg):
     422                      catalog[(msgid1, i)] = str(x, charset)
     423              else:
     424                  catalog[str(msg, charset)] = str(tmsg, charset)
     425              # advance to next entry in the seek tables
     426              masteridx += 8
     427              transidx += 8
     428  
     429      def gettext(self, message):
     430          missing = object()
     431          tmsg = self._catalog.get(message, missing)
     432          if tmsg is missing:
     433              tmsg = self._catalog.get((message, self.plural(1)), missing)
     434          if tmsg is not missing:
     435              return tmsg
     436          if self._fallback:
     437              return self._fallback.gettext(message)
     438          return message
     439  
     440      def ngettext(self, msgid1, msgid2, n):
     441          try:
     442              tmsg = self._catalog[(msgid1, self.plural(n))]
     443          except KeyError:
     444              if self._fallback:
     445                  return self._fallback.ngettext(msgid1, msgid2, n)
     446              if n == 1:
     447                  tmsg = msgid1
     448              else:
     449                  tmsg = msgid2
     450          return tmsg
     451  
     452      def pgettext(self, context, message):
     453          ctxt_msg_id = self.CONTEXT % (context, message)
     454          missing = object()
     455          tmsg = self._catalog.get(ctxt_msg_id, missing)
     456          if tmsg is missing:
     457              tmsg = self._catalog.get((ctxt_msg_id, self.plural(1)), missing)
     458          if tmsg is not missing:
     459              return tmsg
     460          if self._fallback:
     461              return self._fallback.pgettext(context, message)
     462          return message
     463  
     464      def npgettext(self, context, msgid1, msgid2, n):
     465          ctxt_msg_id = self.CONTEXT % (context, msgid1)
     466          try:
     467              tmsg = self._catalog[ctxt_msg_id, self.plural(n)]
     468          except KeyError:
     469              if self._fallback:
     470                  return self._fallback.npgettext(context, msgid1, msgid2, n)
     471              if n == 1:
     472                  tmsg = msgid1
     473              else:
     474                  tmsg = msgid2
     475          return tmsg
     476  
     477  
     478  # Locate a .mo file using the gettext strategy
     479  def find(domain, localedir=None, languages=None, all=False):
     480      # Get some reasonable defaults for arguments that were not supplied
     481      if localedir is None:
     482          localedir = _default_localedir
     483      if languages is None:
     484          languages = []
     485          for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
     486              val = os.environ.get(envar)
     487              if val:
     488                  languages = val.split(':')
     489                  break
     490          if 'C' not in languages:
     491              languages.append('C')
     492      # now normalize and expand the languages
     493      nelangs = []
     494      for lang in languages:
     495          for nelang in _expand_lang(lang):
     496              if nelang not in nelangs:
     497                  nelangs.append(nelang)
     498      # select a language
     499      if all:
     500          result = []
     501      else:
     502          result = None
     503      for lang in nelangs:
     504          if lang == 'C':
     505              break
     506          mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
     507          if os.path.exists(mofile):
     508              if all:
     509                  result.append(mofile)
     510              else:
     511                  return mofile
     512      return result
     513  
     514  
     515  # a mapping between absolute .mo file path and Translation object
     516  _translations = {}
     517  
     518  
     519  def translation(domain, localedir=None, languages=None,
     520                  class_=None, fallback=False):
     521      if class_ is None:
     522          class_ = GNUTranslations
     523      mofiles = find(domain, localedir, languages, all=True)
     524      if not mofiles:
     525          if fallback:
     526              return NullTranslations()
     527          from errno import ENOENT
     528          raise FileNotFoundError(ENOENT,
     529                                  'No translation file found for domain', domain)
     530      # Avoid opening, reading, and parsing the .mo file after it's been done
     531      # once.
     532      result = None
     533      for mofile in mofiles:
     534          key = (class_, os.path.abspath(mofile))
     535          t = _translations.get(key)
     536          if t is None:
     537              with open(mofile, 'rb') as fp:
     538                  t = _translations.setdefault(key, class_(fp))
     539          # Copy the translation object to allow setting fallbacks and
     540          # output charset. All other instance data is shared with the
     541          # cached object.
     542          # Delay copy import for speeding up gettext import when .mo files
     543          # are not used.
     544          import copy
     545          t = copy.copy(t)
     546          if result is None:
     547              result = t
     548          else:
     549              result.add_fallback(t)
     550      return result
     551  
     552  
     553  def install(domain, localedir=None, *, names=None):
     554      t = translation(domain, localedir, fallback=True)
     555      t.install(names)
     556  
     557  
     558  # a mapping b/w domains and locale directories
     559  _localedirs = {}
     560  # current global domain, `messages' used for compatibility w/ GNU gettext
     561  _current_domain = 'messages'
     562  
     563  
     564  def textdomain(domain=None):
     565      global _current_domain
     566      if domain is not None:
     567          _current_domain = domain
     568      return _current_domain
     569  
     570  
     571  def bindtextdomain(domain, localedir=None):
     572      global _localedirs
     573      if localedir is not None:
     574          _localedirs[domain] = localedir
     575      return _localedirs.get(domain, _default_localedir)
     576  
     577  
     578  def dgettext(domain, message):
     579      try:
     580          t = translation(domain, _localedirs.get(domain, None))
     581      except OSError:
     582          return message
     583      return t.gettext(message)
     584  
     585  
     586  def dngettext(domain, msgid1, msgid2, n):
     587      try:
     588          t = translation(domain, _localedirs.get(domain, None))
     589      except OSError:
     590          if n == 1:
     591              return msgid1
     592          else:
     593              return msgid2
     594      return t.ngettext(msgid1, msgid2, n)
     595  
     596  
     597  def dpgettext(domain, context, message):
     598      try:
     599          t = translation(domain, _localedirs.get(domain, None))
     600      except OSError:
     601          return message
     602      return t.pgettext(context, message)
     603  
     604  
     605  def dnpgettext(domain, context, msgid1, msgid2, n):
     606      try:
     607          t = translation(domain, _localedirs.get(domain, None))
     608      except OSError:
     609          if n == 1:
     610              return msgid1
     611          else:
     612              return msgid2
     613      return t.npgettext(context, msgid1, msgid2, n)
     614  
     615  
     616  def gettext(message):
     617      return dgettext(_current_domain, message)
     618  
     619  
     620  def ngettext(msgid1, msgid2, n):
     621      return dngettext(_current_domain, msgid1, msgid2, n)
     622  
     623  
     624  def pgettext(context, message):
     625      return dpgettext(_current_domain, context, message)
     626  
     627  
     628  def npgettext(context, msgid1, msgid2, n):
     629      return dnpgettext(_current_domain, context, msgid1, msgid2, n)
     630  
     631  
     632  # dcgettext() has been deemed unnecessary and is not implemented.
     633  
     634  # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
     635  # was:
     636  #
     637  #    import gettext
     638  #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
     639  #    _ = cat.gettext
     640  #    print _('Hello World')
     641  
# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.
     645  
# Alias: Catalog is the very same callable as translation().
Catalog = translation