Implement mimetype categorization for easier file-type / preview detection (#10750)

This implements more advanced file-type detection based on `python-magic`, so we no longer rely on file extensions but actually inspect each file's content (header) before serving it to the client and reviewer.

Fixes #10685 (and actually rewrites its meaning)
This commit is contained in:
Christopher Grebs 2019-02-22 14:01:44 +01:00 коммит произвёл GitHub
Родитель ae3ca6db65
Коммит f8d9198b50
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
21 изменённых файлов: 133 добавлений и 53 удалений

Просмотреть файл

@ -28,6 +28,7 @@ RUN apt-get update && apt-get install -y \
zlib1g-dev \
libffi-dev \
libssl-dev \
libmagic-dev \
python-dev \
python3-dev \
python-pip \

Просмотреть файл

@ -33,6 +33,7 @@ RUN apt-get update && apt-get install -y \
zlib1g-dev \
libffi-dev \
libssl-dev \
libmagic-dev \
python-dev \
python-pip \
nodejs \

Просмотреть файл

@ -30,6 +30,7 @@ RUN apt-get update && apt-get install -y \
zlib1g-dev \
libffi-dev \
libssl-dev \
libmagic-dev \
nodejs \
# Git, because we're using git-checkout dependencies
git \

Просмотреть файл

@ -34,6 +34,7 @@ RUN apt-get update && apt-get install -y \
locales \
zlib1g-dev \
libffi-dev \
libmagic-dev \
libssl-dev \
nodejs \
uuid-dev \

Просмотреть файл

@ -131,12 +131,11 @@ This endpoint allows you to browse through the contents of an Add-on version.
:>json string file.content: Raw content of the requested file.
:>json string file.selected_file: The selected file, either from the ``file`` parameter or the default (manifest.json, install.rdf or package.json for Add-ons as well as the XML file for search engines).
:>json array file.entries[]: The complete file-tree of the extracted XPI.
:>json boolean|string file.entries[].binary: ``True`` if the file is a binary file (e.g an .exe, dll, java, swf file), ``'image'`` if the file is an image or ``False`` otherwise. If ``False`` or ``'image'`` the file should be presentable to the user.
:>json int file.entries[].depth: Level of folder-tree depth, starting with 0.
:>json boolean file.entries[].is_directory: Whether the file is a directory.
:>json string file.entries[].filename: The filename of the file.
:>json string file.entries[].path: The absolute path (from the root of the XPI) of the file.
:>json string file.entries[].sha256: SHA256 hash.
:>json string file.entries[].mimetype: The determined mimetype of the file or ``application/octet-stream`` if none could be determined.
:>json string file.entries[].mime_category: The mime type category of this file. Can be ``image``, ``directory``, ``text`` or ``binary``.
:>json int file.entries[].size: The size in bytes.
:>json string file.entries[].modified: The exact time of the commit, should be equivalent with ``created``.

Просмотреть файл

@ -380,3 +380,6 @@ cachetools==3.1.0 \
sqlparse==0.2.4 \
--hash=sha256:d9cf190f51cbb26da0412247dfe4fb5f4098edb73db84e02f9fc21fdca31fed4 \
--hash=sha256:ce028444cfab83be538752a2ffdb56bc417b7784ff35bb9a3062413717807dec
python-magic==0.4.15 \
--hash=sha256:f2674dcfad52ae6c49d4803fa027809540b130db1dec928cfbb9240316831375 \
--hash=sha256:f3765c0f582d2dfc72c15f3b5a82aecfae9498bd29ca840d72f37d7bd38bfcd5

Двоичные данные
src/olympia/files/fixtures/files/file_viewer_filetypes/blank.pdf Normal file

Двоичный файл не отображается.

Просмотреть файл

Двоичный файл не отображается.

Двоичные данные
src/olympia/files/fixtures/files/file_viewer_filetypes/fff.gif Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 35 B

Просмотреть файл

@ -0,0 +1 @@
background-color: #000;

Просмотреть файл

@ -0,0 +1,6 @@
<!doctype html>
<html lang="en">
<body>
</body>
</html>

Просмотреть файл

@ -0,0 +1,3 @@
(function() {
console.log('I dont do anything');
})

Просмотреть файл

@ -0,0 +1,3 @@
{
"id": "some random id"
}

Просмотреть файл

@ -0,0 +1,3 @@
#!/usr/env/bin python
print('Hahaha, yeah, useless')

Двоичные данные
src/olympia/files/fixtures/files/file_viewer_filetypes/image.jpg Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 695 B

Двоичные данные
src/olympia/files/fixtures/files/file_viewer_filetypes/image.png Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 108 B

Просмотреть файл

@ -0,0 +1,7 @@
constconst list = [
{off: 0, str: "PK\x03\x04", mime: "application/zip", desc: "Zip archive data"},
{off: 0, str: "\xff\xd8", mime: "image/jpeg", desc: "JPEG image data"},
{off: 0, str: "\x89PNG\x0d\x0a\x1a\x0a", mime: "image/png", desc: "PNG image data"},
{off: 0, str: "GIF8", mime: "image/gif", desc: "GIF image data"},
{off: 0, str: "%PDF-", mime: "application/pdf", desc: "PDF document"}
];

Просмотреть файл

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
<ShortName>search tool</ShortName>
<Tags>SearchGeek Search Engine</Tags>
<Description>Search Engine for Firefox</Description>
<Contact>xxx@yyy.com</Contact>
<InputEncoding>UTF-8</InputEncoding>
<SyndicationRight>limited</SyndicationRight>
<Image width="16" height="16" type="image/x-icon">http://www.yyy.com/favicon.ico</Image>
<Url type="text/html" template="http://www.yyy.com?q={searchTerms}"/>
<Url type="application/x-suggestions+json" template="http://www.yyy.net/?query={searchTerms}"/>
</OpenSearchDescription>

Просмотреть файл

@ -5,6 +5,7 @@ from collections import OrderedDict
from datetime import datetime
import pygit2
import magic
from rest_framework import serializers
from rest_framework.exceptions import NotFound
@ -19,13 +20,25 @@ from olympia.addons.serializers import (
from olympia.addons.models import AddonReviewerFlags
from olympia.files.utils import get_sha256
from olympia.files.models import File
from olympia.files.file_viewer import denied_extensions, denied_magic_numbers
from olympia.versions.models import Version
from olympia.lib.git import AddonGitRepository
from olympia.lib import unicodehelper
from olympia.lib.cache import cache_get_or_set
# Sometimes mimetypes get changed in libmagic, so this is a (hopefully
# short) list of mappings from old -> new types so that we stay compatible
# with versions out there in the wild.
MIMETYPE_COMPAT_MAPPING = {
# https://github.com/file/file/commit/cee2b49c
'application/xml': 'text/xml',
# Special case: libmagic reports application/x-empty for empty
# plain text files, so let's normalize this.
'application/x-empty': 'text/plain',
}
class AddonReviewerFlagsSerializer(serializers.ModelSerializer):
class Meta:
model = AddonReviewerFlags
@ -86,27 +99,25 @@ class FileEntriesSerializer(FileSerializer):
path = force_text(entry_wrapper.path)
blob = entry_wrapper.blob
is_directory = entry.type == 'tree'
mime, encoding = mimetypes.guess_type(entry.name)
is_binary = (
self.is_binary(path, mime, blob)
if not is_directory else False)
sha_hash = (
get_sha256(io.BytesIO(memoryview(blob)))
if not is_directory else '')
if not entry.type == 'tree' else '')
commit_tzinfo = FixedOffset(commit.commit_time_offset)
commit_time = datetime.fromtimestamp(
float(commit.commit_time),
commit_tzinfo)
mimetype, entry_mime_category = self.get_entry_mime_type(
entry, blob)
result[path] = {
'binary': is_binary,
'depth': path.count(os.sep),
'directory': is_directory,
'filename': force_text(entry.name),
'sha256': sha_hash,
'mimetype': mime or 'application/octet-stream',
'mime_category': entry_mime_category,
'mimetype': mimetype,
'path': path,
'size': blob.size if blob is not None else None,
'modified': commit_time,
@ -123,27 +134,30 @@ class FileEntriesSerializer(FileSerializer):
return self._entries
def is_binary(self, filepath, mimetype, blob):
def get_entry_mime_type(self, entry, blob):
"""Returns the mimetype and type category.
The type category can be ``image``, ``directory``, ``text`` or
``binary``.
"""
Using filepath, mimetype and in-memory buffer to determine if a file
can be shown in HTML or not.
"""
# Re-use the denied data from amo-validator to spot binaries.
ext = os.path.splitext(filepath)[1][1:]
if ext in denied_extensions:
return True
if entry.type == 'tree':
return 'application/octet-stream', 'directory'
bytes_ = tuple(bytearray(memoryview(blob)[:4]))
# Hardcoding the maximum number of bytes to read here
# until https://github.com/ahupp/python-magic/commit/50e8c856
# lands in a release and we can read that value from libmagic.
# We only read the needed amount of content from the file so that we
# don't read the whole blob into memory again.
bytes_ = io.BytesIO(memoryview(blob)).read(1048576)
mime = magic.from_buffer(bytes_, mime=True)
if any(bytes_[:len(x)] == x for x in denied_magic_numbers):
return True
# Apply compatibility mappings
mime = MIMETYPE_COMPAT_MAPPING.get(mime, mime)
if mimetype:
major, minor = mimetype.split('/')
if major == 'image':
return 'image' # Mark that the file is binary, but an image.
mime_type = mime.split('/')[0]
known_types = ('image', 'text')
return False
return mime, 'binary' if mime_type not in known_types else mime_type
def get_selected_file(self, obj):
requested_file = self.context.get('file', None)

Просмотреть файл

@ -1,8 +1,14 @@
# -*- coding: utf-8 -*-
import mimetypes
import os
from datetime import datetime
import pytest
from mock import MagicMock
from django.core.cache import cache
from django.conf import settings
from django.utils.encoding import force_bytes
from olympia import amo
from olympia.reviewers.serializers import (
@ -58,22 +64,20 @@ class TestFileEntriesSerializer(TestCase):
'manifest.json'}
manifest_data = data['entries']['manifest.json']
assert manifest_data['binary'] is False
assert manifest_data['depth'] == 0
assert manifest_data['directory'] is False
assert manifest_data['filename'] == u'manifest.json'
assert manifest_data['sha256'] == (
'71d4122c0f2f78e089136602f88dbf590f2fa04bb5bc417454bf21446d6cb4f0')
assert manifest_data['mimetype'] == 'application/json'
assert manifest_data['mimetype'] == 'text/plain'
assert manifest_data['mime_category'] == 'text'
assert manifest_data['path'] == u'manifest.json'
assert manifest_data['size'] == 622
assert isinstance(manifest_data['modified'], datetime)
ja_locale_data = data['entries']['_locales/ja']
assert ja_locale_data['binary'] is False
assert ja_locale_data['depth'] == 1
assert ja_locale_data['directory'] is True
assert ja_locale_data['mime_category'] == 'directory'
assert ja_locale_data['filename'] == 'ja'
assert ja_locale_data['sha256'] == ''
assert ja_locale_data['mimetype'] == 'application/octet-stream'
@ -107,26 +111,6 @@ class TestFileEntriesSerializer(TestCase):
assert data['content'].startswith(
'The "link-48.png" icon is taken from the Geomicons')
def test_is_binary(self):
serializer = FileEntriesSerializer()
files = [
'foo.rdf', 'foo.xml', 'foo.js', 'foo.py' 'foo.html', 'foo.txt',
'foo.dtd', 'foo.xul', 'foo.sh', 'foo.properties', 'foo.json',
'foo.src', 'CHANGELOG']
for fname in files:
mime, encoding = mimetypes.guess_type(fname)
assert not serializer.is_binary(fname, mime, b'')
for fname in ['foo.png', 'foo.gif', 'foo.exe', 'foo.swf']:
mime, encoding = mimetypes.guess_type(fname)
assert serializer.is_binary(fname, mime, b'')
for contents in [b'#!/usr/bin/python', b'#python', b'\0x2']:
mime, encoding = mimetypes.guess_type(fname)
assert not serializer.is_binary('random_junk', mime, contents)
def test_get_entries_cached(self):
file = self.addon.current_version.current_file
serializer = FileEntriesSerializer(instance=file)
@ -141,6 +125,47 @@ class TestFileEntriesSerializer(TestCase):
assert cache.get(key) == data['entries']
@pytest.mark.parametrize(
'entry, filename, expected_category, expected_mimetype',
[
(MagicMock(type='blob'), 'blank.pdf', 'binary', 'application/pdf'),
(MagicMock(type='blob'), 'blank.txt', 'text', 'text/plain'),
(MagicMock(type='blob'), 'empty_bat.exe', 'binary',
'application/x-dosexec'),
(MagicMock(type='blob'), 'fff.gif', 'image', 'image/gif'),
(MagicMock(type='blob'), 'foo.css', 'text', 'text/plain'),
(MagicMock(type='blob'), 'foo.html', 'text', 'text/html'),
(MagicMock(type='blob'), 'foo.js', 'text', 'text/plain'),
(MagicMock(type='blob'), 'foo.py', 'text', 'text/plain'),
(MagicMock(type='blob'), 'image.jpg', 'image', 'image/jpeg'),
(MagicMock(type='blob'), 'image.png', 'image', 'image/png'),
(MagicMock(type='blob'), 'search.xml', 'text', 'text/xml'),
(MagicMock(type='blob'), 'js_containing_png_data.js', 'text',
'text/plain'),
(MagicMock(type='blob'), 'foo.json', 'text', 'text/plain'),
(MagicMock(type='tree'), 'foo', 'directory',
'application/octet-stream'),
]
)
def test_file_entries_serializer_category_type(
entry, filename, expected_category, expected_mimetype):
serializer = FileEntriesSerializer()
root = os.path.join(
settings.ROOT,
'src/olympia/files/fixtures/files/file_viewer_filetypes/')
if entry.type == 'tree':
mime, category = serializer.get_entry_mime_type(entry, None)
else:
with open(os.path.join(root, filename), 'rb') as fobj:
mime, category = serializer.get_entry_mime_type(
entry, force_bytes(fobj.read()))
assert mime == expected_mimetype
assert category == expected_category
class TestAddonBrowseVersionSerializer(TestCase):
def setUp(self):
super(TestAddonBrowseVersionSerializer, self).setUp()