extract_archive: support deflate64-compressed zip files (#282)

This commit is contained in:
Adam J. Stewart 2022-01-14 23:14:51 -06:00
Родитель b7d35aab64
Коммит c9520aa3f1
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: C66C0675661156FC
11 изменённых файлов: 165 добавлений и 9 удалений

Просмотреть файл

@ -9,7 +9,7 @@ experiment:
learning_rate: 1e-3
learning_rate_schedule_patience: 2
in_channels: 4
num_classes: 13
num_classes: 14
num_filters: 1
ignore_zeros: False
datamodule:

Просмотреть файл

@ -46,3 +46,4 @@ dependencies:
- sphinx>=4
- timm>=0.2.1
- torchmetrics
- zipfile-deflate64>=0.2

Просмотреть файл

@ -88,6 +88,9 @@ datasets =
rarfile>=3
# scipy 0.9+ required for scipy.io.wavfile.read
scipy>=0.9
# zipfile-deflate64 0.2+ required for extraction bugfix:
# https://github.com/brianhelba/zipfile-deflate64/issues/19
zipfile-deflate64>=0.2
# Optional developer requirements
style =
# black 21+ required for Python 3.9 support

Двоичный файл не отображается.

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1,95 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Generate fake test data for the Baywide_13Class_20132014 archive.

Writes a single-band ``SIZE`` x ``SIZE`` uint8 GeoTIFF filled with random class
indices in ``[0, NUM_CLASSES)``, attaches a colormap, packs it into a zip file
compressed with DEFLATE64 using the ``7z`` command-line tool, and prints the
MD5 checksum of the resulting archive.
"""

import hashlib
import os
import subprocess

import numpy as np
import rasterio
from rasterio.crs import CRS
from rasterio.transform import Affine

SIZE = 128  # image width/height
NUM_CLASSES = 14

# Fixed seed so that the generated raster is reproducible
np.random.seed(0)

filename = "Baywide_13Class_20132014"

wkt = """
PROJCS["USA_Contiguous_Albers_Equal_Area_Conic_USGS_version",
GEOGCS["NAD83",
DATUM["North_American_Datum_1983",
SPHEROID["GRS 1980",6378137,298.257222101004,
AUTHORITY["EPSG","7019"]],
AUTHORITY["EPSG","6269"]],
PRIMEM["Greenwich",0],
UNIT["degree",0.0174532925199433,
AUTHORITY["EPSG","9122"]],
AUTHORITY["EPSG","4269"]],
PROJECTION["Albers_Conic_Equal_Area"],
PARAMETER["latitude_of_center",23],
PARAMETER["longitude_of_center",-96],
PARAMETER["standard_parallel_1",29.5],
PARAMETER["standard_parallel_2",45.5],
PARAMETER["false_easting",0],
PARAMETER["false_northing",0],
UNIT["metre",1,
AUTHORITY["EPSG","9001"]],
AXIS["Easting",EAST],
AXIS["Northing",NORTH]]
"""

# RGBA color for every class index written to the raster
cmap = {
    0: (0, 0, 0, 255),
    1: (0, 197, 255, 255),
    2: (0, 168, 132, 255),
    3: (38, 115, 0, 255),
    4: (76, 230, 0, 255),
    5: (163, 255, 115, 255),
    6: (255, 170, 0, 255),
    7: (255, 0, 0, 255),
    8: (156, 156, 156, 255),
    9: (0, 0, 0, 255),
    10: (115, 115, 0, 255),
    11: (230, 230, 0, 255),
    12: (255, 255, 115, 255),
    13: (197, 0, 255, 255),
}

meta = {
    "driver": "GTiff",
    "dtype": "uint8",
    "nodata": None,
    "width": SIZE,
    "height": SIZE,
    "count": 1,
    "crs": CRS.from_wkt(wkt),
    "transform": Affine(1.0, 0.0, 1303555.0000000005, 0.0, -1.0, 2535064.999999998),
}

# Remove old data. The archive has to go as well: "7z a" updates an existing
# archive in place instead of creating a fresh one.
for stale in (f"{filename}.tif", f"{filename}.zip"):
    if os.path.exists(stale):
        os.remove(stale)

# Create raster file
with rasterio.open(f"{filename}.tif", "w", **meta) as f:
    data = np.random.randint(NUM_CLASSES, size=(SIZE, SIZE), dtype=np.uint8)
    f.write(data, 1)
    f.write_colormap(1, cmap)

# Create zip file
# 7z required to create a zip file using the proprietary DEFLATE64 compression algorithm
# https://github.com/brianhelba/zipfile-deflate64/issues/19#issuecomment-1006077294
subprocess.run(
    ["7z", "a", f"{filename}.zip", "-mm=DEFLATE64", f"{filename}.tif"],
    capture_output=True,
    check=True,
)

# Compute checksums
with open(f"{filename}.zip", "rb") as f:
    md5 = hashlib.md5(f.read()).hexdigest()
    print(repr(md5))

Просмотреть файл

@ -33,10 +33,11 @@ class TestChesapeake13:
def dataset(
self, monkeypatch: Generator[MonkeyPatch, None, None], tmp_path: Path
) -> Chesapeake13:
pytest.importorskip("zipfile_deflate64")
monkeypatch.setattr( # type: ignore[attr-defined]
torchgeo.datasets.chesapeake, "download_url", download_url
)
md5 = "9557b609e614a1f79dec6eb1bb3f3a06"
md5 = "fe35a615b8e749b21270472aa98bb42c"
monkeypatch.setattr(Chesapeake13, "md5", md5) # type: ignore[attr-defined]
url = os.path.join(
"tests", "data", "chesapeake", "BAYWIDE", "Baywide_13Class_20132014.zip"

Просмотреть файл

@ -41,7 +41,7 @@ def mock_missing_module(monkeypatch: Generator[MonkeyPatch, None, None]) -> None
import_orig = builtins.__import__
def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
if name in ["rarfile", "radiant_mlhub"]:
if name in ["radiant_mlhub", "rarfile", "zipfile_deflate64"]:
raise ImportError()
return import_orig(name, *args, **kwargs)
@ -93,11 +93,15 @@ def test_mock_missing_module(mock_missing_module: None) -> None:
os.path.join("cowc_detection", "COWC_test_list_detection.txt.bz2"),
os.path.join("vhr10", "NWPU VHR-10 dataset.rar"),
os.path.join("landcoverai", "landcover.ai.v1.zip"),
os.path.join("chesapeake", "BAYWIDE", "Baywide_13Class_20132014.zip"),
os.path.join("sen12ms", "ROIs1158_spring_lc.tar.gz"),
],
)
def test_extract_archive(src: str, tmp_path: Path) -> None:
    """Extract the test archive ``src`` into the temporary directory.

    The test is skipped when the optional library needed to read the archive
    is not installed.
    """
    archive = os.path.join("tests", "data", src)

    # RAR archives can only be read with rarfile 3 or newer
    if src.endswith(".rar"):
        pytest.importorskip("rarfile", minversion="3")

    # Archives in the chesapeake test data need zipfile_deflate64
    if src.startswith("chesapeake"):
        pytest.importorskip("zipfile_deflate64")

    extract_archive(archive, str(tmp_path))
@ -111,6 +115,11 @@ def test_missing_rarfile(mock_missing_module: None) -> None:
)
def test_missing_zipfile_deflate64(mock_missing_module: None) -> None:
    """Extract a regular zip file while ``zipfile_deflate64`` cannot be imported."""
    # Should fallback on Python builtin zipfile
    archive = os.path.join("tests", "data", "landcoverai", "landcover.ai.v1.zip")
    extract_archive(archive)
def test_unsupported_scheme() -> None:
with pytest.raises(
RuntimeError, match="src file has unknown archival/compression scheme"

Просмотреть файл

@ -50,6 +50,9 @@ class TestSemanticSegmentationTask:
name: str,
classname: Type[LightningDataModule],
) -> None:
if name == "naipchesapeake":
pytest.importorskip("zipfile_deflate64")
conf = OmegaConf.load(os.path.join("conf", "task_defaults", name + ".yaml"))
conf_dict = OmegaConf.to_object(conf.experiment)
conf_dict = cast(Dict[Any, Dict[Any, Any]], conf_dict)

Просмотреть файл

@ -233,7 +233,15 @@ class ChesapeakeDE(Chesapeake):
class ChesapeakeMD(Chesapeake):
"""This subset of the dataset contains data only for Maryland."""
"""This subset of the dataset contains data only for Maryland.
.. note::
This dataset requires the following additional library to be installed:
* `zipfile-deflate64 <https://pypi.org/project/zipfile-deflate64/>`_ to extract
the proprietary deflate64 compressed zip file.
"""
base_folder = "MD"
filename = "MD_STATEWIDE.tif"
@ -242,7 +250,15 @@ class ChesapeakeMD(Chesapeake):
class ChesapeakeNY(Chesapeake):
"""This subset of the dataset contains data only for New York."""
"""This subset of the dataset contains data only for New York.
.. note::
This dataset requires the following additional library to be installed:
* `zipfile-deflate64 <https://pypi.org/project/zipfile-deflate64/>`_ to extract
the proprietary deflate64 compressed zip file.
"""
base_folder = "NY"
filename = "NY_STATEWIDE.tif"
@ -260,7 +276,15 @@ class ChesapeakePA(Chesapeake):
class ChesapeakeVA(Chesapeake):
"""This subset of the dataset contains data only for Virginia."""
"""This subset of the dataset contains data only for Virginia.
.. note::
This dataset requires the following additional library to be installed:
* `zipfile-deflate64 <https://pypi.org/project/zipfile-deflate64/>`_ to extract
the proprietary deflate64 compressed zip file.
"""
base_folder = "VA"
filename = "CIC2014_VA_STATEWIDE.tif"

Просмотреть файл

@ -11,7 +11,6 @@ import lzma
import os
import sys
import tarfile
import zipfile
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import (
@ -77,6 +76,27 @@ class _rarfile:
pass
class _zipfile:
    """Namespace providing a ``ZipFile`` whose backend is chosen lazily."""

    class ZipFile:
        """Context manager that opens a zip archive when entered.

        The arguments are stored at construction time and forwarded to the real
        ``ZipFile`` class on ``__enter__``. ``zipfile_deflate64`` is used when it
        can be imported; otherwise the builtin ``zipfile`` module is used.
        """

        def __init__(self, *args: Any, **kwargs: Any) -> None:
            """Store the arguments to forward to the real ``ZipFile``.

            Args:
                *args: positional arguments for ``ZipFile``
                **kwargs: keyword arguments for ``ZipFile``
            """
            self.args = args
            self.kwargs = kwargs
            # Archive opened by __enter__; None while no archive is open
            self._archive: Any = None

        def __enter__(self) -> Any:
            """Open the archive with the best available backend.

            Returns:
                the opened ``ZipFile`` object
            """
            try:
                # Supports normal zip files, proprietary deflate64 compression algorithm
                import zipfile_deflate64 as zipfile
            except ImportError:
                # Only supports normal zip files
                # https://github.com/python/mypy/issues/1153
                import zipfile  # type: ignore[no-redef]

            self._archive = zipfile.ZipFile(*self.args, **self.kwargs)
            return self._archive

        def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
            """Close the archive opened by ``__enter__``.

            Exceptions raised inside the ``with`` block are not suppressed.
            """
            archive, self._archive = self._archive, None
            if archive is not None:
                archive.close()
def extract_archive(src: str, dst: Optional[str] = None) -> None:
"""Extract an archive.
@ -96,7 +116,7 @@ def extract_archive(src: str, dst: Optional[str] = None) -> None:
(".tar", ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".tbz2", ".tbz", ".txz"),
tarfile.open,
),
(".zip", zipfile.ZipFile),
(".zip", _zipfile.ZipFile),
]
for suffix, extractor in suffix_and_extractor: