1 """Guess the MIME type of a file.
2
3 This module defines two useful functions:
4
5 guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
6
7 guess_extension(type, strict=True) -- guess the extension for a given MIME type.
8
9 It also contains the following, for tuning the behavior:
10
11 Data:
12
13 knownfiles -- list of files to parse
14 inited -- flag set when init() has been called
15 suffix_map -- dictionary mapping suffixes to suffixes
16 encodings_map -- dictionary mapping suffixes to encodings
17 types_map -- dictionary mapping suffixes to types
18
19 Functions:
20
21 init([files]) -- parse a list of files, default knownfiles (on Windows, the
22 default values are taken from the registry)
23 read_mime_types(file) -- parse one file, return a dictionary or None
24 """
25
26 import os
27 import sys
28 import posixpath
29 import urllib.parse
30
31 try:
32 from _winapi import _mimetypes_read_windows_registry
33 except ImportError:
34 _mimetypes_read_windows_registry = None
35
36 try:
37 import winreg as _winreg
38 except ImportError:
39 _winreg = None
40
# Names exported by "from mimetypes import *".
__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

# Candidate mime.types files.  init() reads, in this order, every entry
# that exists as a regular file.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    # NOTE(review): same path as two entries above, so init() reads that
    # file twice -- confirm whether the duplicate is intentional.
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Set to True by init(); MimeTypes.__init__() checks it so that it does
# not call init() again.
inited = False
# Shared MimeTypes instance behind the module-level functions; assigned by
# init() once the database is fully populated.
_db = None
62
63
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to replacement suffixes
    types_map -- pair of dictionaries (non-strict, strict) mapping
                 suffixes to types
    types_map_inv -- pair of dictionaries (non-strict, strict) mapping
                     types to lists of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Seed the datastore with the module defaults, then read files.

        Every name in `filenames' is passed to read() together with
        `strict', in the order given.
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in _types_map_default.items():
            self.add_type(type, ext, True)
        for (ext, type) in _common_types_default.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.

        An empty (or None) type is ignored: it carries no information
        and would otherwise make guess_type() report '' as a type.
        """
        if not type:
            return
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; suffix aliases and type suffixes are matched case
        insensitively (the suffix is lowercased before the lookup).

        Suffix aliases such as .tgz, .taz and .tz are mapped to
        '.tar.gz'.  (This is table-driven too, using the dictionary
        suffix_map.)

        For a 'data:' URL the declared media type is returned, or
        'text/plain' when none of the form type/subtype is given; the
        encoding is always None.  A data URL without a comma yields
        (None, None).

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None  # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand aliases (e.g. '.tgz' -> '.tar.gz') until none applies.
        while (ext_lower := ext.lower()) in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext_lower])
        # encodings_map is case sensitive
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        ext = ext.lower()
        # Standard types always take precedence over the non-standard ones.
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif strict:
            return None, encoding
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy the stored list so that callers cannot mutate the datastore.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is opened as UTF-8 text; errors from open() propagate.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method returning str.  Each line
        is "type suffix suffix ..."; everything from the first word
        starting with '#' to the end of the line is a comment.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while line := fp.readline():
            words = line.split()
            # Drop the trailing comment, if any.
            for index, word in enumerate(words):
                if word[0] == '#':
                    del words[index:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither registry reader could be imported.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # The registry readers call back with (type, ext) only, so the
        # requested strictness has to be bound here.  Passing
        # self.add_type directly would silently fall back to its
        # strict=True default.
        def add_type(type, ext):
            self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Walk HKEY_CLASSES_ROOT with winreg and report extension types.

        Calls add_type(mimetype, ext) for every subkey whose name starts
        with '.' and which has a REG_SZ 'Content Type' value.  Subkeys
        that cannot be opened or lack that value are skipped.
        """
        def enum_types(mimedb):
            # Yield subkey names until EnumKey() runs off the end (OSError),
            # skipping names that contain a NUL character.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; no need to open other keys.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        add_type(mimetype, subkeyname)
                except OSError:
                    continue
283
def guess_type(url, strict=True):
    """Guess the MIME type and encoding of a file from its URL or path.

    The lookup is delegated to the shared module database, which is
    created with init() on first use.

    Return value is a tuple (type, encoding).  `type' is a string of
    the form type/subtype, usable for a MIME Content-type header, or
    None when nothing can be guessed (no or unknown suffix).
    `encoding' is the name of the program used to encode the file
    (e.g. compress or gzip), or None for no encoding.  All mappings
    are table driven.  Encoding suffixes are case sensitive; suffix
    aliases and type suffixes are matched case insensitively.

    Suffix aliases such as .tgz, .taz and .tz are expanded to
    ".tar.gz" through the dictionary suffix_map.

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    database = _db
    if database is None:
        init()
        database = _db
    return database.guess_type(url, strict)
305
306
def guess_all_extensions(type, strict=True):
    """Guess every extension known for a MIME type.

    The lookup is delegated to the shared module database, which is
    created with init() on first use.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extensions are
    not guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  The list is empty if no extension can be guessed
    for `type'.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    database = _db
    if database is None:
        init()
        database = _db
    return database.guess_all_extensions(type, strict)
323
def guess_extension(type, strict=True):
    """Guess the preferred extension for a MIME type.

    The lookup is delegated to the shared module database, which is
    created with init() on first use.

    Return value is a string giving a filename extension, including the
    leading dot ('.'), or None if no extension can be guessed for
    `type'.  The extension is not guaranteed to have been associated
    with any particular data stream, but would be mapped to the MIME
    type `type' by guess_type().

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    database = _db
    if database is None:
        init()
        database = _db
    return database.guess_extension(type, strict)
339
def add_type(type, ext, strict=True):
    """Register a type/extension pair in the shared module database.

    The database is created with init() on first use.

    An extension that is already known gets its type replaced by the
    new one.  A type that is already known gets the extension added
    to its list of known extensions.

    If strict is true, the pair goes into the list of standard types,
    else into the list of non-standard types.
    """
    database = _db
    if database is None:
        init()
        database = _db
    return database.add_type(type, ext, strict)
355
356
def init(files=None):
    """Build or extend the shared database and publish its tables.

    With no `files', or when no shared database exists yet, a fresh
    MimeTypes instance is created, the Windows registry is consulted,
    and knownfiles (followed by `files', if given) are read.  Otherwise
    the existing shared database is extended with `files' only.  Paths
    that are not regular files are skipped.

    Afterwards the module globals encodings_map, suffix_map, types_map
    and common_types refer to the tables of the shared database.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    if files is not None and _db is not None:
        # Extend the database that is already in place.
        db = _db
    else:
        db = MimeTypes()
        # Returns immediately when registry access is not supported.
        db.read_windows_registry()
        files = knownfiles if files is None else knownfiles + list(files)

    for path in files:
        if not os.path.isfile(path):
            continue
        db.read(path)

    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db
383
384
def read_mime_types(file):
    """Parse one mime.types-format file into a suffix-to-type dictionary.

    Returns None when `file' cannot be opened.  Otherwise the entries
    are loaded as standard types into a fresh MimeTypes instance (which
    starts out with the module defaults) and its dictionary of standard
    types is returned.
    """
    try:
        stream = open(file, encoding='utf-8')
    except OSError:
        return None
    with stream:
        parsed = MimeTypes()
        parsed.readfp(stream, True)
        result = parsed.types_map[True]
    return result
394
395
def _default_mime_types():
    """Define the built-in default tables.

    Each table is bound twice: to its public module-level name
    (suffix_map, encodings_map, types_map, common_types) and to a private
    `_..._default' name.  MimeTypes.__init__() builds every new instance
    from the private names.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    # Aliases: a suffix that stands for a longer suffix chain.
    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    # Suffixes that name an encoding wrapped around the real content.
    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    types_map = _types_map_default = {
        '.js' : 'text/javascript',
        '.mjs' : 'text/javascript',
        '.json' : 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc' : 'application/msword',
        '.dot' : 'application/msword',
        '.wiz' : 'application/msword',
        '.nq' : 'application/n-quads',
        '.nt' : 'application/n-triples',
        '.bin' : 'application/octet-stream',
        '.a' : 'application/octet-stream',
        '.dll' : 'application/octet-stream',
        '.exe' : 'application/octet-stream',
        '.o' : 'application/octet-stream',
        '.obj' : 'application/octet-stream',
        '.so' : 'application/octet-stream',
        '.oda' : 'application/oda',
        '.pdf' : 'application/pdf',
        '.p7c' : 'application/pkcs7-mime',
        '.ps' : 'application/postscript',
        '.ai' : 'application/postscript',
        '.eps' : 'application/postscript',
        '.trig' : 'application/trig',
        '.m3u' : 'application/vnd.apple.mpegurl',
        '.m3u8' : 'application/vnd.apple.mpegurl',
        '.xls' : 'application/vnd.ms-excel',
        '.xlb' : 'application/vnd.ms-excel',
        '.ppt' : 'application/vnd.ms-powerpoint',
        '.pot' : 'application/vnd.ms-powerpoint',
        '.ppa' : 'application/vnd.ms-powerpoint',
        '.pps' : 'application/vnd.ms-powerpoint',
        '.pwz' : 'application/vnd.ms-powerpoint',
        '.wasm' : 'application/wasm',
        '.bcpio' : 'application/x-bcpio',
        '.cpio' : 'application/x-cpio',
        '.csh' : 'application/x-csh',
        '.dvi' : 'application/x-dvi',
        '.gtar' : 'application/x-gtar',
        '.hdf' : 'application/x-hdf',
        '.h5' : 'application/x-hdf5',
        '.latex' : 'application/x-latex',
        '.mif' : 'application/x-mif',
        '.cdf' : 'application/x-netcdf',
        '.nc' : 'application/x-netcdf',
        '.p12' : 'application/x-pkcs12',
        '.pfx' : 'application/x-pkcs12',
        '.ram' : 'application/x-pn-realaudio',
        '.pyc' : 'application/x-python-code',
        '.pyo' : 'application/x-python-code',
        '.sh' : 'application/x-sh',
        '.shar' : 'application/x-shar',
        '.swf' : 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.tar' : 'application/x-tar',
        '.tcl' : 'application/x-tcl',
        '.tex' : 'application/x-tex',
        '.texi' : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff' : 'application/x-troff',
        '.t' : 'application/x-troff',
        '.tr' : 'application/x-troff',
        '.man' : 'application/x-troff-man',
        '.me' : 'application/x-troff-me',
        '.ms' : 'application/x-troff-ms',
        '.ustar' : 'application/x-ustar',
        '.src' : 'application/x-wais-source',
        '.xsl' : 'application/xml',
        '.rdf' : 'application/xml',
        '.wsdl' : 'application/xml',
        '.xpdl' : 'application/xml',
        '.zip' : 'application/zip',
        '.3gp' : 'audio/3gpp',
        '.3gpp' : 'audio/3gpp',
        '.3g2' : 'audio/3gpp2',
        '.3gpp2' : 'audio/3gpp2',
        '.aac' : 'audio/aac',
        '.adts' : 'audio/aac',
        '.loas' : 'audio/aac',
        '.ass' : 'audio/aac',
        '.au' : 'audio/basic',
        '.snd' : 'audio/basic',
        '.mp3' : 'audio/mpeg',
        '.mp2' : 'audio/mpeg',
        '.opus' : 'audio/opus',
        '.aif' : 'audio/x-aiff',
        '.aifc' : 'audio/x-aiff',
        '.aiff' : 'audio/x-aiff',
        '.ra' : 'audio/x-pn-realaudio',
        '.wav' : 'audio/x-wav',
        '.avif' : 'image/avif',
        '.bmp' : 'image/bmp',
        '.gif' : 'image/gif',
        '.ief' : 'image/ief',
        '.jpg' : 'image/jpeg',
        '.jpe' : 'image/jpeg',
        '.jpeg' : 'image/jpeg',
        '.heic' : 'image/heic',
        '.heif' : 'image/heif',
        '.png' : 'image/png',
        '.svg' : 'image/svg+xml',
        '.tiff' : 'image/tiff',
        '.tif' : 'image/tiff',
        '.ico' : 'image/vnd.microsoft.icon',
        '.ras' : 'image/x-cmu-raster',
        '.pnm' : 'image/x-portable-anymap',
        '.pbm' : 'image/x-portable-bitmap',
        '.pgm' : 'image/x-portable-graymap',
        '.ppm' : 'image/x-portable-pixmap',
        '.rgb' : 'image/x-rgb',
        '.xbm' : 'image/x-xbitmap',
        '.xpm' : 'image/x-xpixmap',
        '.xwd' : 'image/x-xwindowdump',
        '.eml' : 'message/rfc822',
        '.mht' : 'message/rfc822',
        '.mhtml' : 'message/rfc822',
        '.nws' : 'message/rfc822',
        '.css' : 'text/css',
        '.csv' : 'text/csv',
        '.html' : 'text/html',
        '.htm' : 'text/html',
        '.n3' : 'text/n3',
        '.txt' : 'text/plain',
        '.bat' : 'text/plain',
        '.c' : 'text/plain',
        '.h' : 'text/plain',
        '.ksh' : 'text/plain',
        '.pl' : 'text/plain',
        '.srt' : 'text/plain',
        '.rtx' : 'text/richtext',
        '.tsv' : 'text/tab-separated-values',
        '.vtt' : 'text/vtt',
        '.py' : 'text/x-python',
        '.etx' : 'text/x-setext',
        '.sgm' : 'text/x-sgml',
        '.sgml' : 'text/x-sgml',
        '.vcf' : 'text/x-vcard',
        '.xml' : 'text/xml',
        '.mp4' : 'video/mp4',
        '.mpeg' : 'video/mpeg',
        '.m1v' : 'video/mpeg',
        '.mpa' : 'video/mpeg',
        '.mpe' : 'video/mpeg',
        '.mpg' : 'video/mpeg',
        '.mov' : 'video/quicktime',
        '.qt' : 'video/quicktime',
        '.webm' : 'video/webm',
        '.avi' : 'video/x-msvideo',
        '.movie' : 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf' : 'application/rtf',
        '.midi': 'audio/midi',
        '.mid' : 'audio/midi',
        '.jpg' : 'image/jpg',
        '.pict': 'image/pict',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.webp': 'image/webp',
        '.xul' : 'text/xul',
        }
593
594
# Populate the default tables at import time, so that the module-level
# dictionaries and the `_..._default' names exist before anything uses them.
_default_mime_types()
596
597
def _main():
    """Command-line driver: guess a type or an extension per argument.

    Options are taken from sys.argv.  By default each argument is
    treated as a file name or URL and its type and encoding are
    printed; with --extension each argument is treated as a MIME type
    and its extension is printed.  --lenient also searches the
    non-standard types.  Invalid options print the usage text plus the
    error and exit with status 1; --help prints it and exits with 0.
    """
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def exit_with_usage(code, msg=''):
        # Show the help text, then the optional error, then leave.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        exit_with_usage(1, msg)

    lenient = False
    want_extension = False
    for flag, _value in opts:
        if flag in ('-h', '--help'):
            exit_with_usage(0)
        elif flag in ('-l', '--lenient'):
            lenient = True
        elif flag in ('-e', '--extension'):
            want_extension = True
    strict = 0 if lenient else 1

    for gtype in args:
        if want_extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type", gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type:', guess, 'encoding:', encoding)
            else:
                print("I don't know anything about type", gtype)
642
643
# Run the command-line driver only when executed as a script.
if __name__ == '__main__':
    _main()