Implement mimetype categorization for easier file-type / preview detection (#10750)
This implements more advanced file-type detection based on `python-magic`, so we no longer rely on file extensions but actually verify each file's content (header) before serving it to the client and reviewer. Fixes #10685 (and actually rewrites its meaning)
This commit is contained in:
Родитель
ae3ca6db65
Коммит
f8d9198b50
|
@ -28,6 +28,7 @@ RUN apt-get update && apt-get install -y \
|
|||
zlib1g-dev \
|
||||
libffi-dev \
|
||||
libssl-dev \
|
||||
libmagic-dev \
|
||||
python-dev \
|
||||
python3-dev \
|
||||
python-pip \
|
||||
|
|
|
@ -33,6 +33,7 @@ RUN apt-get update && apt-get install -y \
|
|||
zlib1g-dev \
|
||||
libffi-dev \
|
||||
libssl-dev \
|
||||
libmagic-dev \
|
||||
python-dev \
|
||||
python-pip \
|
||||
nodejs \
|
||||
|
|
|
@ -30,6 +30,7 @@ RUN apt-get update && apt-get install -y \
|
|||
zlib1g-dev \
|
||||
libffi-dev \
|
||||
libssl-dev \
|
||||
libmagic-dev \
|
||||
nodejs \
|
||||
# Git, because we're using git-checkout dependencies
|
||||
git \
|
||||
|
|
|
@ -34,6 +34,7 @@ RUN apt-get update && apt-get install -y \
|
|||
locales \
|
||||
zlib1g-dev \
|
||||
libffi-dev \
|
||||
libmagic-dev \
|
||||
libssl-dev \
|
||||
nodejs \
|
||||
uuid-dev \
|
||||
|
|
|
@ -131,12 +131,11 @@ This endpoint allows you to browse through the contents of an Add-on version.
|
|||
:>json string file.content: Raw content of the requested file.
|
||||
:>json string file.selected_file: The selected file, either from the ``file`` parameter or the default (manifest.json, install.rdf or package.json for Add-ons as well as the XML file for search engines).
|
||||
:>json array file.entries[]: The complete file-tree of the extracted XPI.
|
||||
:>json boolean|string file.entries[].binary: ``True`` if the file is a binary file (e.g. an .exe, dll, java, swf file), ``'image'`` if the file is an image or ``False`` otherwise. If ``False`` or ``'image'`` the file should be presentable to the user.
|
||||
:>json int file.entries[].depth: Level of folder-tree depth, starting with 0.
|
||||
:>json boolean file.entries[].is_directory: Whether the file is a directory.
|
||||
:>json string file.entries[].filename: The filename of the file.
|
||||
:>json string file.entries[].path: The absolute path (from the root of the XPI) of the file.
|
||||
:>json string file.entries[].sha256: SHA256 hash.
|
||||
:>json string file.entries[].mimetype: The determined mimetype of the file or ``application/octet-stream`` if none could be determined.
|
||||
:>json string file.entries[].mime_category: The mime type category of this file. Can be ``image``, ``directory``, ``text`` or ``binary``.
|
||||
:>json int file.entries[].size: The size in bytes.
|
||||
:>json string file.entries[].modified: The exact time of the commit, should be equivalent with ``created``.
|
||||
|
|
|
@ -380,3 +380,6 @@ cachetools==3.1.0 \
|
|||
sqlparse==0.2.4 \
|
||||
--hash=sha256:d9cf190f51cbb26da0412247dfe4fb5f4098edb73db84e02f9fc21fdca31fed4 \
|
||||
--hash=sha256:ce028444cfab83be538752a2ffdb56bc417b7784ff35bb9a3062413717807dec
|
||||
python-magic==0.4.15 \
|
||||
--hash=sha256:f2674dcfad52ae6c49d4803fa027809540b130db1dec928cfbb9240316831375 \
|
||||
--hash=sha256:f3765c0f582d2dfc72c15f3b5a82aecfae9498bd29ca840d72f37d7bd38bfcd5
|
||||
|
|
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 35 B |
|
@ -0,0 +1 @@
|
|||
background-color: #000;
|
|
@ -0,0 +1,6 @@
|
|||
<!doctype html>
|
||||
|
||||
<html lang="en">
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,3 @@
|
|||
(function() {
|
||||
console.log('I dont do anything');
|
||||
})
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"id": "some random id"
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
#!/usr/env/bin python
|
||||
|
||||
print('Hahaha, yeah, useless')
|
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 695 B |
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 108 B |
|
@ -0,0 +1,7 @@
|
|||
constconst list = [
|
||||
{off: 0, str: "PK\x03\x04", mime: "application/zip", desc: "Zip archive data"},
|
||||
{off: 0, str: "\xff\xd8", mime: "image/jpeg", desc: "JPEG image data"},
|
||||
{off: 0, str: "\x89PNG\x0d\x0a\x1a\x0a", mime: "image/png", desc: "PNG image data"},
|
||||
{off: 0, str: "GIF8", mime: "image/gif", desc: "GIF image data"},
|
||||
{off: 0, str: "%PDF-", mime: "application/pdf", desc: "PDF document"}
|
||||
];
|
|
@ -0,0 +1,12 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
|
||||
<ShortName>search tool</ShortName>
|
||||
<Tags>SearchGeek Search Engine</Tags>
|
||||
<Description>Search Engine for Firefox</Description>
|
||||
<Contact>xxx@yyy.com</Contact>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<SyndicationRight>limited</SyndicationRight>
|
||||
<Image width="16" height="16" type="image/x-icon">http://www.yyy.com/favicon.ico</Image>
|
||||
<Url type="text/html" template="http://www.yyy.com?q={searchTerms}"/>
|
||||
<Url type="application/x-suggestions+json" template="http://www.yyy.net/?query={searchTerms}"/>
|
||||
</OpenSearchDescription>
|
|
@ -5,6 +5,7 @@ from collections import OrderedDict
|
|||
from datetime import datetime
|
||||
|
||||
import pygit2
|
||||
import magic
|
||||
|
||||
from rest_framework import serializers
|
||||
from rest_framework.exceptions import NotFound
|
||||
|
@ -19,13 +20,25 @@ from olympia.addons.serializers import (
|
|||
from olympia.addons.models import AddonReviewerFlags
|
||||
from olympia.files.utils import get_sha256
|
||||
from olympia.files.models import File
|
||||
from olympia.files.file_viewer import denied_extensions, denied_magic_numbers
|
||||
from olympia.versions.models import Version
|
||||
from olympia.lib.git import AddonGitRepository
|
||||
from olympia.lib import unicodehelper
|
||||
from olympia.lib.cache import cache_get_or_set
|
||||
|
||||
|
||||
# Sometimes mimetypes get changed in libmagic so this is a (hopefully short)
|
||||
# list of mappings from old -> new types so that we stay compatible
|
||||
# with versions out there in the wild.
|
||||
MIMETYPE_COMPAT_MAPPING = {
|
||||
# https://github.com/file/file/commit/cee2b49c
|
||||
'application/xml': 'text/xml',
|
||||
# Special case, for empty text files libmagic reports
|
||||
# application/x-empty for empty plain text files
|
||||
# So, let's normalize this.
|
||||
'application/x-empty': 'text/plain',
|
||||
}
|
||||
|
||||
|
||||
class AddonReviewerFlagsSerializer(serializers.ModelSerializer):
|
||||
class Meta:
|
||||
model = AddonReviewerFlags
|
||||
|
@ -86,27 +99,25 @@ class FileEntriesSerializer(FileSerializer):
|
|||
path = force_text(entry_wrapper.path)
|
||||
blob = entry_wrapper.blob
|
||||
|
||||
is_directory = entry.type == 'tree'
|
||||
mime, encoding = mimetypes.guess_type(entry.name)
|
||||
is_binary = (
|
||||
self.is_binary(path, mime, blob)
|
||||
if not is_directory else False)
|
||||
sha_hash = (
|
||||
get_sha256(io.BytesIO(memoryview(blob)))
|
||||
if not is_directory else '')
|
||||
if not entry.type == 'tree' else '')
|
||||
|
||||
commit_tzinfo = FixedOffset(commit.commit_time_offset)
|
||||
commit_time = datetime.fromtimestamp(
|
||||
float(commit.commit_time),
|
||||
commit_tzinfo)
|
||||
|
||||
mimetype, entry_mime_category = self.get_entry_mime_type(
|
||||
entry, blob)
|
||||
|
||||
result[path] = {
|
||||
'binary': is_binary,
|
||||
'depth': path.count(os.sep),
|
||||
'directory': is_directory,
|
||||
'filename': force_text(entry.name),
|
||||
'sha256': sha_hash,
|
||||
'mimetype': mime or 'application/octet-stream',
|
||||
'mime_category': entry_mime_category,
|
||||
'mimetype': mimetype,
|
||||
'path': path,
|
||||
'size': blob.size if blob is not None else None,
|
||||
'modified': commit_time,
|
||||
|
@ -123,27 +134,30 @@ class FileEntriesSerializer(FileSerializer):
|
|||
|
||||
return self._entries
|
||||
|
||||
def is_binary(self, filepath, mimetype, blob):
|
||||
def get_entry_mime_type(self, entry, blob):
|
||||
"""Returns the mimetype and type category.
|
||||
|
||||
The type category can be ``image``, ``directory``, ``text`` or
|
||||
``binary``.
|
||||
"""
|
||||
Using filepath, mimetype and in-memory buffer to determine if a file
|
||||
can be shown in HTML or not.
|
||||
"""
|
||||
# Re-use the denied data from amo-validator to spot binaries.
|
||||
ext = os.path.splitext(filepath)[1][1:]
|
||||
if ext in denied_extensions:
|
||||
return True
|
||||
if entry.type == 'tree':
|
||||
return 'application/octet-stream', 'directory'
|
||||
|
||||
bytes_ = tuple(bytearray(memoryview(blob)[:4]))
|
||||
# Hardcoding the maximum amount of bytes to read here
|
||||
# until https://github.com/ahupp/python-magic/commit/50e8c856
|
||||
# lands in a release and we can read that value from libmagic
|
||||
# We're only reading the needed amount of content from the file to
|
||||
# not exhaust/read the whole blob into memory again.
|
||||
bytes_ = io.BytesIO(memoryview(blob)).read(1048576)
|
||||
mime = magic.from_buffer(bytes_, mime=True)
|
||||
|
||||
if any(bytes_[:len(x)] == x for x in denied_magic_numbers):
|
||||
return True
|
||||
# Apply compatibility mappings
|
||||
mime = MIMETYPE_COMPAT_MAPPING.get(mime, mime)
|
||||
|
||||
if mimetype:
|
||||
major, minor = mimetype.split('/')
|
||||
if major == 'image':
|
||||
return 'image' # Mark that the file is binary, but an image.
|
||||
mime_type = mime.split('/')[0]
|
||||
known_types = ('image', 'text')
|
||||
|
||||
return False
|
||||
return mime, 'binary' if mime_type not in known_types else mime_type
|
||||
|
||||
def get_selected_file(self, obj):
|
||||
requested_file = self.context.get('file', None)
|
||||
|
|
|
@ -1,8 +1,14 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import mimetypes
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
import pytest
|
||||
|
||||
from mock import MagicMock
|
||||
|
||||
from django.core.cache import cache
|
||||
from django.conf import settings
|
||||
from django.utils.encoding import force_bytes
|
||||
|
||||
from olympia import amo
|
||||
from olympia.reviewers.serializers import (
|
||||
|
@ -58,22 +64,20 @@ class TestFileEntriesSerializer(TestCase):
|
|||
'manifest.json'}
|
||||
|
||||
manifest_data = data['entries']['manifest.json']
|
||||
assert manifest_data['binary'] is False
|
||||
assert manifest_data['depth'] == 0
|
||||
assert manifest_data['directory'] is False
|
||||
assert manifest_data['filename'] == u'manifest.json'
|
||||
assert manifest_data['sha256'] == (
|
||||
'71d4122c0f2f78e089136602f88dbf590f2fa04bb5bc417454bf21446d6cb4f0')
|
||||
assert manifest_data['mimetype'] == 'application/json'
|
||||
assert manifest_data['mimetype'] == 'text/plain'
|
||||
assert manifest_data['mime_category'] == 'text'
|
||||
assert manifest_data['path'] == u'manifest.json'
|
||||
assert manifest_data['size'] == 622
|
||||
assert isinstance(manifest_data['modified'], datetime)
|
||||
|
||||
ja_locale_data = data['entries']['_locales/ja']
|
||||
|
||||
assert ja_locale_data['binary'] is False
|
||||
assert ja_locale_data['depth'] == 1
|
||||
assert ja_locale_data['directory'] is True
|
||||
assert ja_locale_data['mime_category'] == 'directory'
|
||||
assert ja_locale_data['filename'] == 'ja'
|
||||
assert ja_locale_data['sha256'] == ''
|
||||
assert ja_locale_data['mimetype'] == 'application/octet-stream'
|
||||
|
@ -107,26 +111,6 @@ class TestFileEntriesSerializer(TestCase):
|
|||
assert data['content'].startswith(
|
||||
'The "link-48.png" icon is taken from the Geomicons')
|
||||
|
||||
def test_is_binary(self):
|
||||
serializer = FileEntriesSerializer()
|
||||
|
||||
files = [
|
||||
'foo.rdf', 'foo.xml', 'foo.js', 'foo.py' 'foo.html', 'foo.txt',
|
||||
'foo.dtd', 'foo.xul', 'foo.sh', 'foo.properties', 'foo.json',
|
||||
'foo.src', 'CHANGELOG']
|
||||
|
||||
for fname in files:
|
||||
mime, encoding = mimetypes.guess_type(fname)
|
||||
assert not serializer.is_binary(fname, mime, b'')
|
||||
|
||||
for fname in ['foo.png', 'foo.gif', 'foo.exe', 'foo.swf']:
|
||||
mime, encoding = mimetypes.guess_type(fname)
|
||||
assert serializer.is_binary(fname, mime, b'')
|
||||
|
||||
for contents in [b'#!/usr/bin/python', b'#python', b'\0x2']:
|
||||
mime, encoding = mimetypes.guess_type(fname)
|
||||
assert not serializer.is_binary('random_junk', mime, contents)
|
||||
|
||||
def test_get_entries_cached(self):
|
||||
file = self.addon.current_version.current_file
|
||||
serializer = FileEntriesSerializer(instance=file)
|
||||
|
@ -141,6 +125,47 @@ class TestFileEntriesSerializer(TestCase):
|
|||
assert cache.get(key) == data['entries']
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'entry, filename, expected_category, expected_mimetype',
|
||||
[
|
||||
(MagicMock(type='blob'), 'blank.pdf', 'binary', 'application/pdf'),
|
||||
(MagicMock(type='blob'), 'blank.txt', 'text', 'text/plain'),
|
||||
(MagicMock(type='blob'), 'empty_bat.exe', 'binary',
|
||||
'application/x-dosexec'),
|
||||
(MagicMock(type='blob'), 'fff.gif', 'image', 'image/gif'),
|
||||
(MagicMock(type='blob'), 'foo.css', 'text', 'text/plain'),
|
||||
(MagicMock(type='blob'), 'foo.html', 'text', 'text/html'),
|
||||
(MagicMock(type='blob'), 'foo.js', 'text', 'text/plain'),
|
||||
(MagicMock(type='blob'), 'foo.py', 'text', 'text/plain'),
|
||||
(MagicMock(type='blob'), 'image.jpg', 'image', 'image/jpeg'),
|
||||
(MagicMock(type='blob'), 'image.png', 'image', 'image/png'),
|
||||
(MagicMock(type='blob'), 'search.xml', 'text', 'text/xml'),
|
||||
(MagicMock(type='blob'), 'js_containing_png_data.js', 'text',
|
||||
'text/plain'),
|
||||
(MagicMock(type='blob'), 'foo.json', 'text', 'text/plain'),
|
||||
(MagicMock(type='tree'), 'foo', 'directory',
|
||||
'application/octet-stream'),
|
||||
]
|
||||
)
|
||||
def test_file_entries_serializer_category_type(
|
||||
entry, filename, expected_category, expected_mimetype):
|
||||
serializer = FileEntriesSerializer()
|
||||
|
||||
root = os.path.join(
|
||||
settings.ROOT,
|
||||
'src/olympia/files/fixtures/files/file_viewer_filetypes/')
|
||||
|
||||
if entry.type == 'tree':
|
||||
mime, category = serializer.get_entry_mime_type(entry, None)
|
||||
else:
|
||||
with open(os.path.join(root, filename), 'rb') as fobj:
|
||||
mime, category = serializer.get_entry_mime_type(
|
||||
entry, force_bytes(fobj.read()))
|
||||
|
||||
assert mime == expected_mimetype
|
||||
assert category == expected_category
|
||||
|
||||
|
||||
class TestAddonBrowseVersionSerializer(TestCase):
|
||||
def setUp(self):
|
||||
super(TestAddonBrowseVersionSerializer, self).setUp()
|
||||
|
|
Загрузка…
Ссылка в новой задаче