1 """ Standard "encodings" Package
2
3 Standard Python encoding modules are stored in this package
4 directory.
5
6 Codec modules must have names corresponding to normalized encoding
7 names as defined in the normalize_encoding() function below, e.g.
8 'utf-8' must be implemented by the module 'utf_8.py'.
9
10 Each codec module must export the following interface:
11
12 * getregentry() -> codecs.CodecInfo object
13 The getregentry() API must return a CodecInfo object with encoder, decoder,
14 incrementalencoder, incrementaldecoder, streamwriter and streamreader
15 attributes which adhere to the Python Codec Interface Standard.
16
17 In addition, a module may optionally also define the following
18 APIs which are then used by the package's codec search function:
19
20 * getaliases() -> sequence of encoding name strings to use as aliases
21
22 Alias names returned by getaliases() must be normalized encoding
23 names as defined by normalize_encoding().
24
25 Written by Marc-Andre Lemburg (mal@lemburg.com).
26
27 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
28
29 """#"
30
31 import codecs
32 import sys
33 from . import aliases
34
# Maps each encoding name passed to search_function() to the registry
# entry found for it, or to None when no codec module could be located
# (misses are cached as well).
_cache = {}
# Sentinel used as the default of _cache.get(); it lets search_function()
# tell "never looked up" apart from a cached None.
_unknown = '--unknown--'
# Passed as the fromlist argument of __import__() in search_function().
# A non-empty fromlist makes __import__() return the submodule itself
# rather than the top-level package.
_import_tail = ['*']
# Alias table from the sibling aliases module.  search_function() reads
# it and adds the aliases announced by codec modules via getaliases().
_aliases = aliases.aliases
39
class CodecRegistryError(LookupError, SystemError):
    """Raised when a codec module returns an unusable registry entry.

    search_function() raises this when the value returned by a module's
    getregentry() is a legacy tuple with the wrong number of items or
    with non-callable codec functions.  Deriving from both LookupError
    and SystemError lets callers catch it under either base class.
    """
42
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only.  Non-ASCII
        alphanumeric characters are dropped from the result; they
        neither produce nor cancel a pending underscore.

        encoding may be a str or an ASCII-only bytes object; a str is
        always returned.

    """
    if isinstance(encoding, bytes):
        encoding = str(encoding, "ascii")

    chars = []
    # True while a run of separator characters is waiting to be turned
    # into a single '_' in front of the next kept character.
    punct = False
    for c in encoding:
        if c.isalnum() or c == '.':
            if not c.isascii():
                # Dropped character: skip it *before* emitting the
                # separator, otherwise 'utf-\xe9' would end in '_' and
                # 'a-\xe9-b' would contain '__'.
                continue
            if punct and chars:
                chars.append('_')
            chars.append(c)
            punct = False
        else:
            punct = True
    return ''.join(chars)
70
def search_function(encoding):
    """ Codec search function for the modules of the 'encodings' package.

        The module-level cache is consulted first.  On a miss the name
        is normalized with normalize_encoding(), resolved through the
        alias table, and the matching 'encodings.<name>' module is
        imported and asked for its registry entry via getregentry().

        Returns a codecs.CodecInfo object, or None if no codec module
        was found.  Both outcomes are cached under the name as given.

        Raises CodecRegistryError if getregentry() returns a legacy
        tuple that has the wrong length or holds non-callable codec
        functions.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then fall
    # back to the normalized name itself.  Both candidates are only
    # ever looked up inside the 'encodings' package.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Empty names and dotted names cannot be direct submodules of
        # the package, so they are never imported.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            # ImportError may occur because 'encodings.(modname)' does not exist,
            # or because it imports a name that does not exist (see mbcs and oem)
            pass
        else:
            break
    else:
        mod = None

    try:
        # Also covers mod being None: None has no getregentry attribute.
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy protocol: a sequence of 4 to 7 items in CodecInfo
        # positional order.
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError('module "%s" (%s) failed to register'
                                     % (mod.__name__, mod.__file__))
        if not callable(entry[0]) or not callable(entry[1]) or \
           (entry[2] is not None and not callable(entry[2])) or \
           (entry[3] is not None and not callable(entry[3])) or \
           (len(entry) > 4 and entry[4] is not None and not callable(entry[4])) or \
           (len(entry) > 5 and entry[5] is not None and not callable(entry[5])):
            raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
                                     % (mod.__name__, mod.__file__))
        if len(entry) < 7 or entry[6] is None:
            # Fill in the codec name from the module name.  Only the
            # first six fields are kept before padding so that a 7-item
            # entry whose name is None gets the name *replaced*;
            # appending would yield 8 items and make CodecInfo() fail.
            entry = (tuple(entry[:6])
                     + (None,) * (6 - len(entry))
                     + (mod.__name__.split(".", 1)[1],))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry
154
# Register the search_function in the Python codec registry.  This runs
# as a side effect of importing the package.
codecs.register(search_function)
157
if sys.platform == 'win32':
    # bpo-671666, bpo-46668: Python does not support custom code pages.
    # When no codec exists for the current Windows ANSI code page, fall
    # back to the "mbcs" codec, i.e. the WideCharToMultiByte() and
    # MultiByteToWideChar() functions with CP_ACP.
    def _alias_mbcs(encoding):
        """Return the mbcs registry entry if *encoding* names the ANSI code page.

        Returns None for any other name, and also when one of the
        imports fails.
        """
        try:
            import _winapi
            if encoding != "cp%s" % _winapi.GetACP():
                return None
            import encodings.mbcs
            return encodings.mbcs.getregentry()
        except ImportError:
            # Imports may fail while we are shutting down
            return None

    codecs.register(_alias_mbcs)