1 """Guess the MIME type of a file.
2
3 This module defines two useful functions:
4
5 guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
6
7 guess_extension(type, strict=True) -- guess the extension for a given MIME type.
8
9 It also contains the following, for tuning the behavior:
10
11 Data:
12
13 knownfiles -- list of files to parse
14 inited -- flag set when init() has been called
15 suffix_map -- dictionary mapping suffixes to suffixes
16 encodings_map -- dictionary mapping suffixes to encodings
17 types_map -- dictionary mapping suffixes to types
18
19 Functions:
20
21 init([files]) -- parse a list of files, default knownfiles (on Windows, the
22 default values are taken from the registry)
23 read_mime_types(file) -- parse one file, return a dictionary or None
24 """
25
26 import os
27 import sys
28 import posixpath
29 import urllib.parse
30
31 try:
32 from _winapi import _mimetypes_read_windows_registry
33 except ImportError:
34 _mimetypes_read_windows_registry = None
35
36 try:
37 import winreg as _winreg
38 except ImportError:
39 _winreg = None
40
# Names exported by ``from mimetypes import *``.
__all__ = [
    "knownfiles",
    "inited",
    "MimeTypes",
    "guess_type",
    "guess_all_extensions",
    "guess_extension",
    "add_type",
    "init",
    "read_mime_types",
    "suffix_map",
    "encodings_map",
    "types_map",
    "common_types",
]
47
# Candidate mime.types files.  init() reads, in this order, every entry
# that exists as a regular file.
knownfiles = [
    "/etc/mime.types",
    # Mac OS X
    "/etc/httpd/mime.types",
    # Apache
    "/etc/httpd/conf/mime.types",
    # Apache 1
    "/etc/apache/mime.types",
    # Apache 2
    "/etc/apache2/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    # Apache 1.2 (same path as two entries above)
    "/usr/local/etc/httpd/conf/mime.types",
    # Apache 1.3
    "/usr/local/etc/mime.types",
]

# Flag set when init() has been called.
inited = False
# Module-wide MimeTypes instance; stays None until init() has completed.
_db = None
62
63
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to suffixes
    types_map -- pair of dictionaries mapping suffixes to types, indexed
                 by the strict flag (False: non-standard, True: standard)
    types_map_inv -- pair of dictionaries mapping types to lists of
                     suffixes, indexed the same way
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the module-level default tables.

        Every file named in `filenames' is then loaded with
        read(name, strict).  The module-level init() is called first if
        it has not been called yet.
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in _types_map_default.items():
            self.add_type(type, ext, True)
        for (ext, type) in _common_types_default.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are lowercased before they are
        looked up.

        Suffixes found in the dictionary suffix_map (e.g. .tgz, .taz
        and .tz, which are mapped to '.tar.gz') are expanded first;
        that lookup uses the lowercased suffix as well.

        For a 'data:' URL the media type is taken from the URL itself
        ('text/plain' when it is missing or malformed) and the encoding
        is always None; a data URL without a comma yields (None, None).

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None  # never compressed, so encoding is None

        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes until none applies any more.
        while (ext_lower := ext.lower()) in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext_lower])
        # encodings_map is case sensitive
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            # The type is determined by the suffix beneath the encoding.
            base, ext = posixpath.splitext(base)
        else:
            encoding = None

        ext = ext.lower()
        # Standard types always take precedence over non-standard ones.
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        if strict:
            return None, encoding
        return self.types_map[False].get(ext), encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty when no extension is known; `type' is lowercased
        before the lookup.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy so that callers cannot mutate the stored list.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        # The first registered extension is the preferred one.
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is opened as UTF-8 text and handed to readfp().

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each line holds a type
        followed by whitespace-separated suffixes (without the leading
        dot); any word starting with '#' begins a comment that runs to
        the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while line := fp.readline():
            words = line.split()
            # Drop the comment, if any; split() never yields empty words.
            for i, word in enumerate(words):
                if word.startswith('#'):
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither the accelerated reader nor the winreg
        module is available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # Both registry readers invoke the callback with (type, ext) only,
        # so `strict' has to be bound here; passing self.add_type directly
        # would silently fall back to its strict=True default.
        def add_type(type, ext):
            self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Call add_type(mimetype, ext) for each extension key under
        HKEY_CLASSES_ROOT that has a string 'Content Type' value.
        """
        def enum_types(mimedb):
            # Yield subkey names until EnumKey() runs past the last one,
            # skipping names that contain a NUL character.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; skip the rest without
                # opening the key at all.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        add_type(mimetype, subkeyname)
                except OSError:
                    continue
286
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    lowercased before they are looked up.

    Suffixes found in the dictionary suffix_map (e.g. .tgz, .taz and
    .tz, which are mapped to ".tar.gz") are expanded first; that lookup
    uses the lowercased suffix as well.

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.

    The lookup is delegated to the module-wide MimeTypes database, which
    is created by init() on first use.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)
308
309
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', the
    list is empty.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.

    The lookup is delegated to the module-wide MimeTypes database, which
    is created by init() on first use.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
326
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.

    The lookup is delegated to the module-wide MimeTypes database, which
    is created by init() on first use.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)
342
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.

    The mapping is added to the module-wide MimeTypes database, which
    is created by init() on first use.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)
358
359
def init(files=None):
    """Initialize, or extend, the module-wide MIME types database.

    When `files' is None, or no database exists yet, a fresh MimeTypes
    instance is created, the Windows registry is consulted, and the
    files listed in knownfiles (followed by `files', if given) are read.
    Otherwise the existing database is kept and only `files' are read
    into it.  Entries that are not existing regular files are skipped.

    Afterwards the module globals encodings_map, suffix_map, types_map
    and common_types refer to the tables of the resulting database.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    # Raise the flag first so that MimeTypes.__init__() doesn't call us again.
    inited = True

    if files is not None and _db is not None:
        # Extend the database that is already in place.
        db = _db
    else:
        db = MimeTypes()
        # Quick return if not supported
        db.read_windows_registry()
        files = knownfiles if files is None else knownfiles + list(files)

    for path in files:
        if os.path.isfile(path):
            db.read(path)

    encodings_map, suffix_map = db.encodings_map, db.suffix_map
    types_map, common_types = db.types_map[True], db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db
386
387
def read_mime_types(file):
    """Parse one mime.types-format file and return a dictionary.

    The file is opened as UTF-8 text and read into a new MimeTypes
    instance as standard (strict) types; the return value is that
    instance's dictionary of standard types, keyed by suffix.  None is
    returned when the file cannot be opened.
    """
    try:
        fp = open(file, encoding='utf-8')
    except OSError:
        return None
    with fp:
        mime_db = MimeTypes()
        mime_db.readfp(fp, True)
        return mime_db.types_map[True]
397
398
def _default_mime_types():
    """Define the built-in default tables and bind them to module globals.

    Each table is bound to two names that refer to the same dictionary:
    the public one (suffix_map, encodings_map, types_map, common_types)
    and a private `_..._default' one, which MimeTypes.__init__() uses to
    seed new instances.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    # Shorthand suffixes and the suffix chains they stand for.
    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    # Suffixes that denote an encoding rather than a type; these keys are
    # matched case-sensitively by MimeTypes.guess_type().
    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    # (MimeTypes.guess_extension() returns the first extension registered
    # for a type, so the insertion order below is significant.)
    types_map = _types_map_default = {
        '.js'     : 'application/javascript',
        '.mjs'    : 'application/javascript',
        '.json'   : 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.wiz'    : 'application/msword',
        '.nq'     : 'application/n-quads',
        '.nt'     : 'application/n-triples',
        '.bin'    : 'application/octet-stream',
        '.a'      : 'application/octet-stream',
        '.dll'    : 'application/octet-stream',
        '.exe'    : 'application/octet-stream',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.so'     : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.pdf'    : 'application/pdf',
        '.p7c'    : 'application/pkcs7-mime',
        '.ps'     : 'application/postscript',
        '.ai'     : 'application/postscript',
        '.eps'    : 'application/postscript',
        '.trig'   : 'application/trig',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.xls'    : 'application/vnd.ms-excel',
        '.xlb'    : 'application/vnd.ms-excel',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.wasm'   : 'application/wasm',
        '.bcpio'  : 'application/x-bcpio',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.dvi'    : 'application/x-dvi',
        '.gtar'   : 'application/x-gtar',
        '.hdf'    : 'application/x-hdf',
        '.h5'     : 'application/x-hdf5',
        '.latex'  : 'application/x-latex',
        '.mif'    : 'application/x-mif',
        '.cdf'    : 'application/x-netcdf',
        '.nc'     : 'application/x-netcdf',
        '.p12'    : 'application/x-pkcs12',
        '.pfx'    : 'application/x-pkcs12',
        '.ram'    : 'application/x-pn-realaudio',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.swf'    : 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff'   : 'application/x-troff',
        '.t'      : 'application/x-troff',
        '.tr'     : 'application/x-troff',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.ms'     : 'application/x-troff-ms',
        '.ustar'  : 'application/x-ustar',
        '.src'    : 'application/x-wais-source',
        '.xsl'    : 'application/xml',
        '.rdf'    : 'application/xml',
        '.wsdl'   : 'application/xml',
        '.xpdl'   : 'application/xml',
        '.zip'    : 'application/zip',
        '.3gp'    : 'audio/3gpp',
        '.3gpp'   : 'audio/3gpp',
        '.3g2'    : 'audio/3gpp2',
        '.3gpp2'  : 'audio/3gpp2',
        '.aac'    : 'audio/aac',
        '.adts'   : 'audio/aac',
        '.loas'   : 'audio/aac',
        '.ass'    : 'audio/aac',
        '.au'     : 'audio/basic',
        '.snd'    : 'audio/basic',
        '.mp3'    : 'audio/mpeg',
        '.mp2'    : 'audio/mpeg',
        '.opus'   : 'audio/opus',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.ra'     : 'audio/x-pn-realaudio',
        '.wav'    : 'audio/x-wav',
        '.avif'   : 'image/avif',
        '.bmp'    : 'image/bmp',
        '.gif'    : 'image/gif',
        '.ief'    : 'image/ief',
        '.jpg'    : 'image/jpeg',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.heic'   : 'image/heic',
        '.heif'   : 'image/heif',
        '.png'    : 'image/png',
        '.svg'    : 'image/svg+xml',
        '.tiff'   : 'image/tiff',
        '.tif'    : 'image/tiff',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ras'    : 'image/x-cmu-raster',
        '.pnm'    : 'image/x-portable-anymap',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pgm'    : 'image/x-portable-graymap',
        '.ppm'    : 'image/x-portable-pixmap',
        '.rgb'    : 'image/x-rgb',
        '.xbm'    : 'image/x-xbitmap',
        '.xpm'    : 'image/x-xpixmap',
        '.xwd'    : 'image/x-xwindowdump',
        '.eml'    : 'message/rfc822',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.nws'    : 'message/rfc822',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.html'   : 'text/html',
        '.htm'    : 'text/html',
        '.n3'     : 'text/n3',
        '.txt'    : 'text/plain',
        '.bat'    : 'text/plain',
        '.c'      : 'text/plain',
        '.h'      : 'text/plain',
        '.ksh'    : 'text/plain',
        '.pl'     : 'text/plain',
        '.srt'    : 'text/plain',
        '.rtx'    : 'text/richtext',
        '.tsv'    : 'text/tab-separated-values',
        '.vtt'    : 'text/vtt',
        '.py'     : 'text/x-python',
        '.etx'    : 'text/x-setext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.vcf'    : 'text/x-vcard',
        '.xml'    : 'text/xml',
        '.mp4'    : 'video/mp4',
        '.mpeg'   : 'video/mpeg',
        '.m1v'    : 'video/mpeg',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.mov'    : 'video/quicktime',
        '.qt'     : 'video/quicktime',
        '.webm'   : 'video/webm',
        '.avi'    : 'video/x-msvideo',
        '.movie'  : 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match when a false `strict' argument is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf' : 'application/rtf',
        '.midi': 'audio/midi',
        '.mid' : 'audio/midi',
        '.jpg' : 'image/jpg',
        '.pict': 'image/pict',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.webp': 'image/webp',
        '.xul' : 'text/xul',
        }
596
597
# Build the default tables at import time; MimeTypes.__init__() copies them.
_default_mime_types()
599
600
601 def _main():
602 import getopt
603
604 USAGE = """\
605 Usage: mimetypes.py [options] type
606
607 Options:
608 --help / -h -- print this message and exit
609 --lenient / -l -- additionally search of some common, but non-standard
610 types.
611 --extension / -e -- guess extension instead of type
612
613 More than one type argument may be given.
614 """
615
616 def usage(code, msg=''):
617 print(USAGE)
618 if msg: print(msg)
619 sys.exit(code)
620
621 try:
622 opts, args = getopt.getopt(sys.argv[1:], 'hle',
623 ['help', 'lenient', 'extension'])
624 except getopt.error as msg:
625 usage(1, msg)
626
627 strict = 1
628 extension = 0
629 for opt, arg in opts:
630 if opt in ('-h', '--help'):
631 usage(0)
632 elif opt in ('-l', '--lenient'):
633 strict = 0
634 elif opt in ('-e', '--extension'):
635 extension = 1
636 for gtype in args:
637 if extension:
638 guess = guess_extension(gtype, strict)
639 if not guess: print("I don't know anything about type", gtype)
640 else: print(guess)
641 else:
642 guess, encoding = guess_type(gtype, strict)
643 if not guess: print("I don't know anything about type", gtype)
644 else: print('type:', guess, 'encoding:', encoding)
645
646
# Run the command-line interface when this file is executed as a script.
if __name__ == '__main__':
    _main()