1 #!/usr/bin/env python3
2 """ Utility for parsing HTML entity definitions available from:
3
4 http://www.w3.org/ as e.g.
5 http://www.w3.org/TR/REC-html40/HTMLlat1.ent
6
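Input is read from the file named by the first command-line argument,
or from stdin if no argument is given. Output is written to the file
named by the second argument, or to stdout if it is omitted, in the
form of a Python snippet defining a dictionary "entitydefs" that maps
each literal entity name to its character or numeric entity.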
10
11 Marc-Andre Lemburg, mal@lemburg.com, 1999.
12 Use as you like. NO WARRANTIES.
13
14 """
15 import re,sys
16
# Matches one SGML declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# capturing the name, the quoted value and the (possibly multi-line) comment.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text, pos=0, endpos=None):
    """Collect the entity declarations found in *text*.

    Only the slice of *text* between *pos* and *endpos* is scanned.

    Args:
        text: String containing ``<!ENTITY ... CDATA "..." -- ... -->``
            declarations.
        pos: Index at which scanning starts (default: start of *text*).
        endpos: Index at which scanning stops; ``None`` means the end of
            *text*.

    Returns:
        A dict mapping each entity name to a ``(charcode, comment)`` tuple,
        both taken verbatim from the declaration.  If a name is declared
        more than once, the last declaration wins.
    """
    # NOTE: *pos* is honoured as passed in; it must not be reset here,
    # otherwise a caller-supplied start offset would be silently ignored.
    if endpos is None:
        endpos = len(text)
    return {
        match.group(1): (match.group(2), match.group(3))
        for match in entityRE.finditer(text, pos, endpos)
    }
33
def writefile(f, defs):
    """Write *defs* to *f* as a Python ``entitydefs`` dictionary literal.

    Args:
        f: Object with a ``write(str)`` method that receives the output.
        defs: Dict mapping entity name to a ``(charcode, comment)`` tuple,
            as returned by ``parse``.

    Entries are written sorted by name, one per line, each followed by its
    comment with all runs of whitespace collapsed to single spaces.  A
    numeric character reference (``&#NNN;`` decimal or ``&#xHHH;``
    hexadecimal) whose code is below 256 is written as an octal-escaped
    one-character string literal; every other value is written as the
    ``repr`` of the original charcode string.

    Raises:
        ValueError: If a charcode starts with ``&#`` but the text between
            that prefix and the final character is not a valid number.
    """
    f.write("entitydefs = {\n")
    for name, (charcode, comment) in sorted(defs.items()):
        literal = repr(charcode)
        if charcode[:2] == '&#':
            # Strip the leading '&#' and the trailing ';'.
            digits = charcode[2:-1]
            if digits[:1] in ('x', 'X'):
                # Hexadecimal numeric character reference, e.g. '&#xA0;'.
                code = int(digits[1:], 16)
            else:
                code = int(digits)
            if code < 256:
                # Small codes fit in a single octal-escaped character.
                literal = r"'\%o'" % code
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, literal, comment))
    f.write('\n}\n')
50
if __name__ == '__main__':
    # Command line: [input-file [output-file]]; stdin/stdout are the defaults.
    args = sys.argv[1:]

    # Read the entity declarations.
    if args:
        with open(args[0]) as source:
            text = source.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    # Emit the generated dictionary snippet.
    if len(args) > 1:
        with open(args[1], 'w') as target:
            writefile(target, defs)
    else:
        writefile(sys.stdout, defs)