1  """Internationalization and localization support.
       2  
       3  This module provides internationalization (I18N) and localization (L10N)
       4  support for your Python programs by providing an interface to the GNU gettext
       5  message catalog library.
       6  
       7  I18N refers to the operation by which a program is made aware of multiple
       8  languages.  L10N refers to the adaptation of your program, once
       9  internationalized, to the local language and cultural habits.
      10  
      11  """
      12  
      13  # This module represents the integration of work, contributions, feedback, and
      14  # suggestions from the following people:
      15  #
      16  # Martin von Loewis, who wrote the initial implementation of the underlying
      17  # C-based libintlmodule (later renamed _gettext), along with a skeletal
      18  # gettext.py implementation.
      19  #
      20  # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
      21  # which also included a pure-Python implementation to read .mo files if
      22  # intlmodule wasn't available.
      23  #
      24  # James Henstridge, who also wrote a gettext.py module, which has some
      25  # interesting, but currently unsupported experimental features: the notion of
      26  # a Catalog class and instances, and the ability to add to a catalog file via
      27  # a Python API.
      28  #
      29  # Barry Warsaw integrated these modules, wrote the .install() API and code,
      30  # and conformed all C and Python code to Python's coding standards.
      31  #
      32  # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
      33  # module.
      34  #
      35  # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
      36  #
      37  # TODO:
      38  # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
      39  #   memory, but that's probably bad for large translated programs.  Instead,
      40  #   the lexical sort of original strings in GNU .mo files should be exploited
      41  #   to do binary searches and lazy initializations.  Or you might want to use
      42  #   the undocumented double-hash algorithm for .mo files with hash tables, but
      43  #   you'll need to study the GNU gettext code to do this.
      44  #
      45  # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
      46  #   find this format documented anywhere.
      47  
      48  
      49  import os
      50  import re
      51  import sys
      52  
      53  
# Public names of this module, i.e. what ``from <module> import *`` exports.
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'bindtextdomain', 'find', 'translation', 'install',
           'textdomain', 'dgettext', 'dngettext', 'gettext',
           'ngettext', 'pgettext', 'dpgettext', 'npgettext',
           'dnpgettext'
           ]

# Locale directory used when a caller does not supply one:
# <sys.base_prefix>/share/locale.
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
      62  
      63  # Expression parsing for plural form selection.
      64  #
      65  # The gettext library supports a small subset of C syntax.  The only
      66  # incompatible difference is that integer literals starting with zero are
      67  # decimal.
      68  #
      69  # https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
      70  # http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
      71  
# Tokenizer for plural-form expressions.  Each alternative is a named group,
# so the name of the group that matched tells what kind of token was found.
# INVALID is listed last and acts as the catch-all for anything the earlier
# alternatives did not accept.
_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)
      84  
      85  
      86  def _tokenize(plural):
      87      for mo in re.finditer(_token_pattern, plural):
      88          kind = mo.lastgroup
      89          if kind == 'WHITESPACES':
      90              continue
      91          value = mo.group(kind)
      92          if kind == 'INVALID':
      93              raise ValueError('invalid token in plural form: %s' % value)
      94          yield value
      95      yield ''
      96  
      97  
      98  def _error(value):
      99      if value:
     100          return ValueError('unexpected token in plural form: %s' % value)
     101      else:
     102          return ValueError('unexpected end of plural form')
     103  
     104  
     105  _binary_ops = (
     106      ('||',),
     107      ('&&',),
     108      ('==', '!='),
     109      ('<', '>', '<=', '>='),
     110      ('+', '-'),
     111      ('*', '/', '%'),
     112  )
     113  _binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
     114  _c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}
     115  
     116  
def _parse(tokens, priority=-1):
    """Translate one C (sub)expression read from *tokens* into Python source.

    tokens is an iterator of token strings, such as the one produced by
    _tokenize().  priority is the lowest binary-operator level (see
    _binary_ops) this call is allowed to consume; looser operators are left
    for the caller.  The conditional operator ``?:`` is only consumed when
    priority <= 0.

    Returns a ``(python_source, next_token)`` tuple, where next_token is the
    first token that was read but does not belong to this expression.

    Raises ValueError for an unexpected token, an unexpected end of the
    expression or an unbalanced parenthesis.
    """
    result = ''
    nexttok = next(tokens)
    # Every leading C '!' becomes a Python 'not ' prefix of the operand.
    while nexttok == '!':
        result += 'not '
        nexttok = next(tokens)

    if nexttok == '(':
        # Parenthesized sub-expression: parsed with the default priority, so
        # every operator, including ?:, is accepted inside.
        sub, nexttok = _parse(tokens)
        result = '%s(%s)' % (result, sub)
        if nexttok != ')':
            raise ValueError('unbalanced parenthesis in plural form')
    elif nexttok == 'n':
        result = '%s%s' % (result, nexttok)
    else:
        # Anything else must be an integer literal.  It is always read as
        # decimal and re-emitted with %d, which drops leading zeros.
        try:
            value = int(nexttok, 10)
        except ValueError:
            raise _error(nexttok) from None
        result = '%s%d' % (result, value)
    nexttok = next(tokens)

    # j holds the level of the previous operator handled by this loop; 100
    # is a start value that matches no real level.
    j = 100
    while nexttok in _binary_ops:
        i = _binary_ops[nexttok]
        if i < priority:
            # Binds looser than this call may handle: leave it to the caller.
            break
        # Break chained comparisons
        if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
            result = '(%s)' % result
        # Replace some C operators by their Python equivalents
        op = _c2py_ops.get(nexttok, nexttok)
        # The right operand may only contain tighter-binding operators, so
        # operators of the same level group from left to right.
        right, nexttok = _parse(tokens, i + 1)
        result = '%s %s %s' % (result, op, right)
        j = i
    # priority == 4 is what a level-3 operator ('==', '!=') passes for its
    # right operand; if that operand ended with a level-4 comparison, wrap it
    # so the two comparisons are not joined into one chain.
    if j == priority == 4:  # '<', '>', '<=', '>='
        result = '(%s)' % result

    if nexttok == '?' and priority <= 0:
        # C "cond ? a : b" becomes Python "a if cond else b".
        if_true, nexttok = _parse(tokens, 0)
        if nexttok != ':':
            raise _error(nexttok)
        if_false, nexttok = _parse(tokens)
        result = '%s if %s else %s' % (if_true, result, if_false)
        # priority 0 is only passed for the middle operand of an enclosing
        # ?:, where the nested conditional is parenthesized.
        if priority == 0:
            result = '(%s)' % result

    return result, nexttok
     165  
     166  
     167  def _as_int(n):
     168      try:
     169          i = round(n)
     170      except TypeError:
     171          raise TypeError('Plural value must be an integer, got %s' %
     172                          (n.__class__.__name__,)) from None
     173      import warnings
     174      warnings.warn('Plural value must be an integer, got %s' %
     175                    (n.__class__.__name__,),
     176                    DeprecationWarning, 4)
     177      return n
     178  
     179  
     180  def c2py(plural):
     181      """Gets a C expression as used in PO files for plural forms and returns a
     182      Python function that implements an equivalent expression.
     183      """
     184  
     185      if len(plural) > 1000:
     186          raise ValueError('plural form expression is too long')
     187      try:
     188          result, nexttok = _parse(_tokenize(plural))
     189          if nexttok:
     190              raise _error(nexttok)
     191  
     192          depth = 0
     193          for c in result:
     194              if c == '(':
     195                  depth += 1
     196                  if depth > 20:
     197                      # Python compiler limit is about 90.
     198                      # The most complex example has 2.
     199                      raise ValueError('plural form expression is too complex')
     200              elif c == ')':
     201                  depth -= 1
     202  
     203          ns = {'_as_int': _as_int}
     204          exec('''if True:
     205              def func(n):
     206                  if not isinstance(n, int):
     207                      n = _as_int(n)
     208                  return int(%s)
     209              ''' % result, ns)
     210          return ns['func']
     211      except RecursionError:
     212          # Recursion error can be raised in _parse() or exec().
     213          raise ValueError('plural form expression is too complex')
     214  
     215  
     216  def _expand_lang(loc):
     217      import locale
     218      loc = locale.normalize(loc)
     219      COMPONENT_CODESET   = 1 << 0
     220      COMPONENT_TERRITORY = 1 << 1
     221      COMPONENT_MODIFIER  = 1 << 2
     222      # split up the locale into its base components
     223      mask = 0
     224      pos = loc.find('@')
     225      if pos >= 0:
     226          modifier = loc[pos:]
     227          loc = loc[:pos]
     228          mask |= COMPONENT_MODIFIER
     229      else:
     230          modifier = ''
     231      pos = loc.find('.')
     232      if pos >= 0:
     233          codeset = loc[pos:]
     234          loc = loc[:pos]
     235          mask |= COMPONENT_CODESET
     236      else:
     237          codeset = ''
     238      pos = loc.find('_')
     239      if pos >= 0:
     240          territory = loc[pos:]
     241          loc = loc[:pos]
     242          mask |= COMPONENT_TERRITORY
     243      else:
     244          territory = ''
     245      language = loc
     246      ret = []
     247      for i in range(mask+1):
     248          if not (i & ~mask):  # if all components for this combo exist ...
     249              val = language
     250              if i & COMPONENT_TERRITORY: val += territory
     251              if i & COMPONENT_CODESET:   val += codeset
     252              if i & COMPONENT_MODIFIER:  val += modifier
     253              ret.append(val)
     254      ret.reverse()
     255      return ret
     256  
     257  
     258  class ESC[4;38;5;81mNullTranslations:
     259      def __init__(self, fp=None):
     260          self._info = {}
     261          self._charset = None
     262          self._fallback = None
     263          if fp is not None:
     264              self._parse(fp)
     265  
     266      def _parse(self, fp):
     267          pass
     268  
     269      def add_fallback(self, fallback):
     270          if self._fallback:
     271              self._fallback.add_fallback(fallback)
     272          else:
     273              self._fallback = fallback
     274  
     275      def gettext(self, message):
     276          if self._fallback:
     277              return self._fallback.gettext(message)
     278          return message
     279  
     280      def ngettext(self, msgid1, msgid2, n):
     281          if self._fallback:
     282              return self._fallback.ngettext(msgid1, msgid2, n)
     283          if n == 1:
     284              return msgid1
     285          else:
     286              return msgid2
     287  
     288      def pgettext(self, context, message):
     289          if self._fallback:
     290              return self._fallback.pgettext(context, message)
     291          return message
     292  
     293      def npgettext(self, context, msgid1, msgid2, n):
     294          if self._fallback:
     295              return self._fallback.npgettext(context, msgid1, msgid2, n)
     296          if n == 1:
     297              return msgid1
     298          else:
     299              return msgid2
     300  
     301      def info(self):
     302          return self._info
     303  
     304      def charset(self):
     305          return self._charset
     306  
     307      def install(self, names=None):
     308          import builtins
     309          builtins.__dict__['_'] = self.gettext
     310          if names is not None:
     311              allowed = {'gettext', 'ngettext', 'npgettext', 'pgettext'}
     312              for name in allowed & set(names):
     313                  builtins.__dict__[name] = getattr(self, name)
     314  
     315  
     316  class ESC[4;38;5;81mGNUTranslations(ESC[4;38;5;149mNullTranslations):
     317      # Magic number of .mo files
     318      LE_MAGIC = 0x950412de
     319      BE_MAGIC = 0xde120495
     320  
     321      # The encoding of a msgctxt and a msgid in a .mo file is
     322      # msgctxt + "\x04" + msgid (gettext version >= 0.15)
     323      CONTEXT = "%s\x04%s"
     324  
     325      # Acceptable .mo versions
     326      VERSIONS = (0, 1)
     327  
     328      def _get_versions(self, version):
     329          """Returns a tuple of major version, minor version"""
     330          return (version >> 16, version & 0xffff)
     331  
     332      def _parse(self, fp):
     333          """Override this method to support alternative .mo formats."""
     334          # Delay struct import for speeding up gettext import when .mo files
     335          # are not used.
     336          from struct import unpack
     337          filename = getattr(fp, 'name', '')
     338          # Parse the .mo file header, which consists of 5 little endian 32
     339          # bit words.
     340          self._catalog = catalog = {}
     341          self.plural = lambda n: int(n != 1) # germanic plural by default
     342          buf = fp.read()
     343          buflen = len(buf)
     344          # Are we big endian or little endian?
     345          magic = unpack('<I', buf[:4])[0]
     346          if magic == self.LE_MAGIC:
     347              version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
     348              ii = '<II'
     349          elif magic == self.BE_MAGIC:
     350              version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
     351              ii = '>II'
     352          else:
     353              raise OSError(0, 'Bad magic number', filename)
     354  
     355          major_version, minor_version = self._get_versions(version)
     356  
     357          if major_version not in self.VERSIONS:
     358              raise OSError(0, 'Bad version number ' + str(major_version), filename)
     359  
     360          # Now put all messages from the .mo file buffer into the catalog
     361          # dictionary.
     362          for i in range(0, msgcount):
     363              mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
     364              mend = moff + mlen
     365              tlen, toff = unpack(ii, buf[transidx:transidx+8])
     366              tend = toff + tlen
     367              if mend < buflen and tend < buflen:
     368                  msg = buf[moff:mend]
     369                  tmsg = buf[toff:tend]
     370              else:
     371                  raise OSError(0, 'File is corrupt', filename)
     372              # See if we're looking at GNU .mo conventions for metadata
     373              if mlen == 0:
     374                  # Catalog description
     375                  lastk = None
     376                  for b_item in tmsg.split(b'\n'):
     377                      item = b_item.decode().strip()
     378                      if not item:
     379                          continue
     380                      # Skip over comment lines:
     381                      if item.startswith('#-#-#-#-#') and item.endswith('#-#-#-#-#'):
     382                          continue
     383                      k = v = None
     384                      if ':' in item:
     385                          k, v = item.split(':', 1)
     386                          k = k.strip().lower()
     387                          v = v.strip()
     388                          self._info[k] = v
     389                          lastk = k
     390                      elif lastk:
     391                          self._info[lastk] += '\n' + item
     392                      if k == 'content-type':
     393                          self._charset = v.split('charset=')[1]
     394                      elif k == 'plural-forms':
     395                          v = v.split(';')
     396                          plural = v[1].split('plural=')[1]
     397                          self.plural = c2py(plural)
     398              # Note: we unconditionally convert both msgids and msgstrs to
     399              # Unicode using the character encoding specified in the charset
     400              # parameter of the Content-Type header.  The gettext documentation
     401              # strongly encourages msgids to be us-ascii, but some applications
     402              # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
     403              # traditional gettext applications, the msgid conversion will
     404              # cause no problems since us-ascii should always be a subset of
     405              # the charset encoding.  We may want to fall back to 8-bit msgids
     406              # if the Unicode conversion fails.
     407              charset = self._charset or 'ascii'
     408              if b'\x00' in msg:
     409                  # Plural forms
     410                  msgid1, msgid2 = msg.split(b'\x00')
     411                  tmsg = tmsg.split(b'\x00')
     412                  msgid1 = str(msgid1, charset)
     413                  for i, x in enumerate(tmsg):
     414                      catalog[(msgid1, i)] = str(x, charset)
     415              else:
     416                  catalog[str(msg, charset)] = str(tmsg, charset)
     417              # advance to next entry in the seek tables
     418              masteridx += 8
     419              transidx += 8
     420  
     421      def gettext(self, message):
     422          missing = object()
     423          tmsg = self._catalog.get(message, missing)
     424          if tmsg is missing:
     425              tmsg = self._catalog.get((message, self.plural(1)), missing)
     426          if tmsg is not missing:
     427              return tmsg
     428          if self._fallback:
     429              return self._fallback.gettext(message)
     430          return message
     431  
     432      def ngettext(self, msgid1, msgid2, n):
     433          try:
     434              tmsg = self._catalog[(msgid1, self.plural(n))]
     435          except KeyError:
     436              if self._fallback:
     437                  return self._fallback.ngettext(msgid1, msgid2, n)
     438              if n == 1:
     439                  tmsg = msgid1
     440              else:
     441                  tmsg = msgid2
     442          return tmsg
     443  
     444      def pgettext(self, context, message):
     445          ctxt_msg_id = self.CONTEXT % (context, message)
     446          missing = object()
     447          tmsg = self._catalog.get(ctxt_msg_id, missing)
     448          if tmsg is missing:
     449              tmsg = self._catalog.get((ctxt_msg_id, self.plural(1)), missing)
     450          if tmsg is not missing:
     451              return tmsg
     452          if self._fallback:
     453              return self._fallback.pgettext(context, message)
     454          return message
     455  
     456      def npgettext(self, context, msgid1, msgid2, n):
     457          ctxt_msg_id = self.CONTEXT % (context, msgid1)
     458          try:
     459              tmsg = self._catalog[ctxt_msg_id, self.plural(n)]
     460          except KeyError:
     461              if self._fallback:
     462                  return self._fallback.npgettext(context, msgid1, msgid2, n)
     463              if n == 1:
     464                  tmsg = msgid1
     465              else:
     466                  tmsg = msgid2
     467          return tmsg
     468  
     469  
     470  # Locate a .mo file using the gettext strategy
     471  def find(domain, localedir=None, languages=None, all=False):
     472      # Get some reasonable defaults for arguments that were not supplied
     473      if localedir is None:
     474          localedir = _default_localedir
     475      if languages is None:
     476          languages = []
     477          for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
     478              val = os.environ.get(envar)
     479              if val:
     480                  languages = val.split(':')
     481                  break
     482          if 'C' not in languages:
     483              languages.append('C')
     484      # now normalize and expand the languages
     485      nelangs = []
     486      for lang in languages:
     487          for nelang in _expand_lang(lang):
     488              if nelang not in nelangs:
     489                  nelangs.append(nelang)
     490      # select a language
     491      if all:
     492          result = []
     493      else:
     494          result = None
     495      for lang in nelangs:
     496          if lang == 'C':
     497              break
     498          mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
     499          if os.path.exists(mofile):
     500              if all:
     501                  result.append(mofile)
     502              else:
     503                  return mofile
     504      return result
     505  
     506  
# Cache of parsed catalogs.  translation() stores each translation object
# here under the key (translation class, absolute .mo file path).
_translations = {}
     509  
     510  
     511  def translation(domain, localedir=None, languages=None,
     512                  class_=None, fallback=False):
     513      if class_ is None:
     514          class_ = GNUTranslations
     515      mofiles = find(domain, localedir, languages, all=True)
     516      if not mofiles:
     517          if fallback:
     518              return NullTranslations()
     519          from errno import ENOENT
     520          raise FileNotFoundError(ENOENT,
     521                                  'No translation file found for domain', domain)
     522      # Avoid opening, reading, and parsing the .mo file after it's been done
     523      # once.
     524      result = None
     525      for mofile in mofiles:
     526          key = (class_, os.path.abspath(mofile))
     527          t = _translations.get(key)
     528          if t is None:
     529              with open(mofile, 'rb') as fp:
     530                  t = _translations.setdefault(key, class_(fp))
     531          # Copy the translation object to allow setting fallbacks and
     532          # output charset. All other instance data is shared with the
     533          # cached object.
     534          # Delay copy import for speeding up gettext import when .mo files
     535          # are not used.
     536          import copy
     537          t = copy.copy(t)
     538          if result is None:
     539              result = t
     540          else:
     541              result.add_fallback(t)
     542      return result
     543  
     544  
def install(domain, localedir=None, *, names=None):
    """Look up the translation for *domain* and call its install() method.

    The translation is obtained with translation(domain, localedir,
    fallback=True), so a missing catalog does not raise.  *names* is passed
    on to the translation object's install().
    """
    t = translation(domain, localedir, fallback=True)
    t.install(names)
     548  
     549  
# Maps a domain name to the locale directory registered for it with
# bindtextdomain().
_localedirs = {}
# Domain used by the module-level gettext(), ngettext(), pgettext() and
# npgettext(); changed with textdomain().  The initial value `messages' is
# used for compatibility with GNU gettext.
_current_domain = 'messages'
     554  
     555  
     556  def textdomain(domain=None):
     557      global _current_domain
     558      if domain is not None:
     559          _current_domain = domain
     560      return _current_domain
     561  
     562  
     563  def bindtextdomain(domain, localedir=None):
     564      global _localedirs
     565      if localedir is not None:
     566          _localedirs[domain] = localedir
     567      return _localedirs.get(domain, _default_localedir)
     568  
     569  
     570  def dgettext(domain, message):
     571      try:
     572          t = translation(domain, _localedirs.get(domain, None))
     573      except OSError:
     574          return message
     575      return t.gettext(message)
     576  
     577  
     578  def dngettext(domain, msgid1, msgid2, n):
     579      try:
     580          t = translation(domain, _localedirs.get(domain, None))
     581      except OSError:
     582          if n == 1:
     583              return msgid1
     584          else:
     585              return msgid2
     586      return t.ngettext(msgid1, msgid2, n)
     587  
     588  
     589  def dpgettext(domain, context, message):
     590      try:
     591          t = translation(domain, _localedirs.get(domain, None))
     592      except OSError:
     593          return message
     594      return t.pgettext(context, message)
     595  
     596  
     597  def dnpgettext(domain, context, msgid1, msgid2, n):
     598      try:
     599          t = translation(domain, _localedirs.get(domain, None))
     600      except OSError:
     601          if n == 1:
     602              return msgid1
     603          else:
     604              return msgid2
     605      return t.npgettext(context, msgid1, msgid2, n)
     606  
     607  
def gettext(message):
    """Translate *message* using the current global domain.

    Equivalent to dgettext() with the domain set by textdomain().
    """
    return dgettext(_current_domain, message)
     610  
     611  
def ngettext(msgid1, msgid2, n):
    """Plural-aware translation using the current global domain.

    Equivalent to dngettext() with the domain set by textdomain().
    """
    return dngettext(_current_domain, msgid1, msgid2, n)
     614  
     615  
def pgettext(context, message):
    """Translate *message* within *context* using the current global domain.

    Equivalent to dpgettext() with the domain set by textdomain().
    """
    return dpgettext(_current_domain, context, message)
     618  
     619  
def npgettext(context, msgid1, msgid2, n):
    """Plural-aware translation within *context* using the current global domain.

    Equivalent to dnpgettext() with the domain set by textdomain().
    """
    return dnpgettext(_current_domain, context, msgid1, msgid2, n)
     622  
     623  
     624  # dcgettext() has been deemed unnecessary and is not implemented.
     625  
     626  # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
     627  # was:
     628  #
     629  #    import gettext
     630  #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
     631  #    _ = cat.gettext
     632  #    print _('Hello World')
     633  
# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.
     637  
# Alias: Catalog is the same callable as translation().
Catalog = translation