# -*- coding: utf-8 -*-
"""
Mimetypes-related utilities

# TODO: reexport stdlib mimetypes?
"""
import collections
import functools
import io
import logging
import mimetypes
import re
import zipfile

__all__ = ['guess_mimetype']

_logger = logging.getLogger(__name__)

# We define our own guess_mimetype implementation and if magic is available we
# use it instead.

# discriminants for zip-based file formats: maps the top-level directory that
# identifies an OOXML document flavour to the corresponding mimetype
_ooxml_dirs = {
    'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    # PowerPoint packages keep their parts under "ppt/" (e.g.
    # "ppt/presentation.xml"); a "pt/" prefix never matches those entries.
    'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    # historical key, kept so anything that matched before still matches
    'pt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
}


def _check_ooxml(data):
    """Guess the mimetype of an Office Open XML document.

    :param bytes data: raw content of a zip archive; ``zipfile.ZipFile`` is
                       applied to it directly, so anything that is not a
                       readable zip archive makes it raise
    :returns: the matching mimetype from ``_ooxml_dirs``, or ``False`` if the
              archive does not look like an OOXML document
    """
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        filenames = z.namelist()
        # OOXML documents should have a [Content_Types].xml file for early
        # check that we're interested in this thing at all
        if '[Content_Types].xml' not in filenames:
            return False

        # then there is a directory whose name denotes the type of the file:
        # word, ppt (powerpoint) or xl (excel)
        for dirname, mime in _ooxml_dirs.items():
            if any(entry.startswith(dirname) for entry in filenames):
                return mime

        return False


# checks that a string looks kinda sorta like a mimetype
# (only anchored at the start: it is used through ``match``)
_mime_validator = re.compile(r"""
    [\w-]+ # type-name
    / # subtype separator
    [\w-]+ # registration facet or subtype
    (?:\.[\w-]+)* # optional faceted name
    (?:\+[\w-]+)? # optional structured syntax specifier
""", re.VERBOSE)


def _check_open_container_format(data):
    """Guess the mimetype of an OpenDocument-style package.

    :param bytes data: raw content of a zip archive; ``zipfile.ZipFile`` is
                       applied to it directly
    :returns: the content of the package's ``mimetype`` entry when it is
              short enough and shaped like a mimetype, ``False`` otherwise
    :raises UnicodeDecodeError: if the ``mimetype`` entry is not pure ASCII
    """
    # Open Document Format for Office Applications (OpenDocument) Version 1.2
    #
    # Part 3: Packages
    # 3 Packages
    # 3.3 MIME Media Type
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        # If a MIME media type for a document exists, then an OpenDocument
        # package should contain a file with name "mimetype".
        if 'mimetype' not in z.namelist():
            return False

        # The content of this file shall be the ASCII encoded MIME media type
        # associated with the document.
        marcel = z.read('mimetype').decode('ascii')
        # check that it's not too long (RFC6838 § 4.2 restricts type and
        # subtype to 127 characters each + separator, strongly recommends
        # limiting them to 64 but does not require it) and that it looks a lot
        # like a valid mime type
        if len(marcel) < 256 and _mime_validator.match(marcel):
            return marcel

        return False


# NOTE: these are *raw* bytes literals on purpose. With re.VERBOSE, literal
# whitespace in the pattern is discarded, so in a non-raw literal the escapes
# for TAB (0x09) and SPACE (0x20) turn into whitespace bytes and silently
# vanish from the pattern. Keeping the backslash escapes intact lets the regex
# engine interpret them itself.
_xls_pattern = re.compile(br"""
    \x09\x08\x10\x00\x00\x06\x05\x00
  | \xFD\xFF\xFF\xFF(\x10|\x1F|\x20|"|\#|\(|\))
""", re.VERBOSE)
_ppt_pattern = re.compile(br"""
    \x00\x6E\x1E\xF0
  | \x0F\x00\xE8\x03
  | \xA0\x46\x1D\xF0
  | \xFD\xFF\xFF\xFF(\x0E|\x1C|\x43)\x00\x00\x00
""", re.VERBOSE)


def _check_olecf(data):
    """ Pre-OOXML Office formats are OLE Compound Files which all use the same
    file signature ("magic bytes") and should have a subheader at offset 512
    (0x200).

    Subheaders taken from http://www.garykessler.net/library/file_sigs.html
    according to which Mac office files *may* have different subheaders. We'll
    ignore that.

    :param bytes data: raw file content
    :returns: the mimetype of the detected legacy Office format (Word, Excel
              or PowerPoint), or ``False`` if none is recognised
    """
    offset = 0x200
    if data.startswith(b'\xEC\xA5\xC1\x00', offset):
        return 'application/msword'
    # the _xls_pattern stuff doesn't seem to work correctly (the test file
    # only has a bunch of \xf* at offset 0x200), so instead look for the
    # application name anywhere in the file: that apparently works
    elif b'Microsoft Excel' in data:
        return 'application/vnd.ms-excel'
    elif _ppt_pattern.match(data, offset):
        return 'application/vnd.ms-powerpoint'
    return False


# NOTE(review): the text under review is cut off at this point, in the middle
# of ``_check_svg``. The visible fragment is kept verbatim below (as a comment,
# since it is not a complete statement) and has not been modified.
# def _check_svg(data): """This simply checks the existence of the opening and ending SVG tags""" if b'