1  #! /usr/bin/env python3
       2  # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
       3  
       4  """Generate binary message catalog from textual translation description.
       5  
       6  This program converts a textual Uniforum-style message catalog (.po file) into
       7  a binary GNU catalog (.mo file).  This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation.  It handles
message contexts (msgctxt) and plural forms (msgid_plural with msgstr[N]).
      10  
      11  Usage: msgfmt.py [OPTIONS] filename.po
      12  
      13  Options:
      14      -o file
      15      --output-file=file
      16          Specify the output file to write to.  If omitted, output will go to a
      17          file named filename.mo (based off the input file name).
      18  
      19      -h
      20      --help
      21          Print this message and exit.
      22  
      23      -V
      24      --version
      25          Display version information and exit.
      26  """
      27  
      28  import os
      29  import sys
      30  import ast
      31  import getopt
      32  import struct
      33  import array
      34  from email.parser import HeaderParser
      35  
# Version string printed by the -V/--version option.
__version__ = "1.2"

# Catalog being built: maps each message key (the msgid bytes, prefixed with
# the context and a b"\x04" separator when a msgctxt is present) to its
# translated bytes.  Filled by add() and read by generate().
MESSAGES = {}
      39  
      40  
      41  def usage(code, msg=''):
      42      print(__doc__, file=sys.stderr)
      43      if msg:
      44          print(msg, file=sys.stderr)
      45      sys.exit(code)
      46  
      47  
      48  def add(ctxt, id, str, fuzzy):
      49      "Add a non-fuzzy translation to the dictionary."
      50      global MESSAGES
      51      if not fuzzy and str:
      52          if ctxt is None:
      53              MESSAGES[id] = str
      54          else:
      55              MESSAGES[b"%b\x04%b" % (ctxt, id)] = str
      56  
      57  
      58  def generate():
      59      "Return the generated output."
      60      global MESSAGES
      61      # the keys are sorted in the .mo file
      62      keys = sorted(MESSAGES.keys())
      63      offsets = []
      64      ids = strs = b''
      65      for id in keys:
      66          # For each string, we need size and file offset.  Each string is NUL
      67          # terminated; the NUL does not count into the size.
      68          offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
      69          ids += id + b'\0'
      70          strs += MESSAGES[id] + b'\0'
      71      output = ''
      72      # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
      73      # the keys start right after the index tables.
      74      # translated string.
      75      keystart = 7*4+16*len(keys)
      76      # and the values start after the keys
      77      valuestart = keystart + len(ids)
      78      koffsets = []
      79      voffsets = []
      80      # The string table first has the list of keys, then the list of values.
      81      # Each entry has first the size of the string, then the file offset.
      82      for o1, l1, o2, l2 in offsets:
      83          koffsets += [l1, o1+keystart]
      84          voffsets += [l2, o2+valuestart]
      85      offsets = koffsets + voffsets
      86      output = struct.pack("Iiiiiii",
      87                           0x950412de,       # Magic
      88                           0,                 # Version
      89                           len(keys),         # # of entries
      90                           7*4,               # start of key index
      91                           7*4+len(keys)*8,   # start of value index
      92                           0, 0)              # size and offset of hash table
      93      output += array.array("i", offsets).tobytes()
      94      output += ids
      95      output += strs
      96      return output
      97  
      98  
      99  def make(filename, outfile):
     100      ID = 1
     101      STR = 2
     102      CTXT = 3
     103  
     104      # Compute .mo name from .po name and arguments
     105      if filename.endswith('.po'):
     106          infile = filename
     107      else:
     108          infile = filename + '.po'
     109      if outfile is None:
     110          outfile = os.path.splitext(infile)[0] + '.mo'
     111  
     112      try:
     113          with open(infile, 'rb') as f:
     114              lines = f.readlines()
     115      except IOError as msg:
     116          print(msg, file=sys.stderr)
     117          sys.exit(1)
     118  
     119      section = msgctxt = None
     120      fuzzy = 0
     121  
     122      # Start off assuming Latin-1, so everything decodes without failure,
     123      # until we know the exact encoding
     124      encoding = 'latin-1'
     125  
     126      # Parse the catalog
     127      lno = 0
     128      for l in lines:
     129          l = l.decode(encoding)
     130          lno += 1
     131          # If we get a comment line after a msgstr, this is a new entry
     132          if l[0] == '#' and section == STR:
     133              add(msgctxt, msgid, msgstr, fuzzy)
     134              section = msgctxt = None
     135              fuzzy = 0
     136          # Record a fuzzy mark
     137          if l[:2] == '#,' and 'fuzzy' in l:
     138              fuzzy = 1
     139          # Skip comments
     140          if l[0] == '#':
     141              continue
     142          # Now we are in a msgid or msgctxt section, output previous section
     143          if l.startswith('msgctxt'):
     144              if section == STR:
     145                  add(msgctxt, msgid, msgstr, fuzzy)
     146              section = CTXT
     147              l = l[7:]
     148              msgctxt = b''
     149          elif l.startswith('msgid') and not l.startswith('msgid_plural'):
     150              if section == STR:
     151                  add(msgctxt, msgid, msgstr, fuzzy)
     152                  if not msgid:
     153                      # See whether there is an encoding declaration
     154                      p = HeaderParser()
     155                      charset = p.parsestr(msgstr.decode(encoding)).get_content_charset()
     156                      if charset:
     157                          encoding = charset
     158              section = ID
     159              l = l[5:]
     160              msgid = msgstr = b''
     161              is_plural = False
     162          # This is a message with plural forms
     163          elif l.startswith('msgid_plural'):
     164              if section != ID:
     165                  print('msgid_plural not preceded by msgid on %s:%d' % (infile, lno),
     166                        file=sys.stderr)
     167                  sys.exit(1)
     168              l = l[12:]
     169              msgid += b'\0' # separator of singular and plural
     170              is_plural = True
     171          # Now we are in a msgstr section
     172          elif l.startswith('msgstr'):
     173              section = STR
     174              if l.startswith('msgstr['):
     175                  if not is_plural:
     176                      print('plural without msgid_plural on %s:%d' % (infile, lno),
     177                            file=sys.stderr)
     178                      sys.exit(1)
     179                  l = l.split(']', 1)[1]
     180                  if msgstr:
     181                      msgstr += b'\0' # Separator of the various plural forms
     182              else:
     183                  if is_plural:
     184                      print('indexed msgstr required for plural on  %s:%d' % (infile, lno),
     185                            file=sys.stderr)
     186                      sys.exit(1)
     187                  l = l[6:]
     188          # Skip empty lines
     189          l = l.strip()
     190          if not l:
     191              continue
     192          l = ast.literal_eval(l)
     193          if section == CTXT:
     194              msgctxt += l.encode(encoding)
     195          elif section == ID:
     196              msgid += l.encode(encoding)
     197          elif section == STR:
     198              msgstr += l.encode(encoding)
     199          else:
     200              print('Syntax error on %s:%d' % (infile, lno), \
     201                    'before:', file=sys.stderr)
     202              print(l, file=sys.stderr)
     203              sys.exit(1)
     204      # Add last entry
     205      if section == STR:
     206          add(msgctxt, msgid, msgstr, fuzzy)
     207  
     208      # Compute output
     209      output = generate()
     210  
     211      try:
     212          with open(outfile,"wb") as f:
     213              f.write(output)
     214      except IOError as msg:
     215          print(msg, file=sys.stderr)
     216  
     217  
     218  def main():
     219      try:
     220          opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
     221                                     ['help', 'version', 'output-file='])
     222      except getopt.error as msg:
     223          usage(1, msg)
     224  
     225      outfile = None
     226      # parse options
     227      for opt, arg in opts:
     228          if opt in ('-h', '--help'):
     229              usage(0)
     230          elif opt in ('-V', '--version'):
     231              print("msgfmt.py", __version__)
     232              sys.exit(0)
     233          elif opt in ('-o', '--output-file'):
     234              outfile = arg
     235      # do it
     236      if not args:
     237          print('No input file given', file=sys.stderr)
     238          print("Try `msgfmt --help' for more information.", file=sys.stderr)
     239          return
     240  
     241      for filename in args:
     242          make(filename, outfile)
     243  
     244  
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()