Add ZueriCrop dataset (#147)

2021-09-19 18:25:09 -05:00 · 2021-09-19 18:25:09 -05:00 · 459524fedc
--- a/docs/api/datasets.rst
+++ b/docs/api/datasets.rst
@ -147,6 +147,11 @@ NWPU VHR-10
 .. autoclass:: VHR10
 ZueriCrop
 ^^^^^^^^^
 .. autoclass:: ZueriCrop
 .. _Base Classes:
 Base Classes
--- a/tests/data/README.md
+++ b/tests/data/README.md
@ -75,3 +75,18 @@ from scipy.io import wavfile
 audio = np.random.randn(1).astype(np.float32)
 wavfile.write("01.wav", rate=22050, data=audio)
 ```
 ### HDF5 datasets
 ```python
 import h5py
 import numpy as np
 f = h5py.File("data.hdf5", "w")
 num_classes = 10
 images = np.random.randint(low=0, high=255, size=(1, 1, 3)).astype(np.uint8)
 masks = np.random.randint(low=0, high=num_classes, size=(1, 1)).astype(np.uint8)
 f.create_dataset("images", data=images)
 f.create_dataset("masks", data=masks)
 f.close()
 ```
--- a/tests/data/zuericrop/ZueriCrop.hdf5
+++ b/tests/data/zuericrop/ZueriCrop.hdf5
--- a/tests/data/zuericrop/labels.csv
+++ b/tests/data/zuericrop/labels.csv
--- a/tests/datasets/test_zuericrop.py
+++ b/tests/datasets/test_zuericrop.py
@ -0,0 +1,104 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 import builtins
 import os
 import shutil
 from pathlib import Path
 from typing import Any, Generator
 import pytest
 import torch
 from _pytest.monkeypatch import MonkeyPatch
 import torchgeo.datasets.utils
 from torchgeo.datasets import ZueriCrop
 from torchgeo.transforms import Identity
 # Skip every test in this module when the optional h5py dependency is missing.
 pytest.importorskip("h5py")
 def download_url(url: str, root: str, *args: str, **kwargs: str) -> None:
    """Stand-in for the real downloader that copies a local file instead.

    Args:
        url: path of the local file playing the role of the remote resource
        root: destination passed straight to :func:`shutil.copy`
        *args: accepted for signature compatibility and ignored
        **kwargs: accepted for signature compatibility and ignored
    """
    source, destination = url, root
    shutil.copy(source, destination)
 class TestZueriCrop:
    """Tests for the ZueriCrop dataset using the files in tests/data/zuericrop."""

    @pytest.fixture
    def dataset(
        self,
        monkeypatch: Generator[MonkeyPatch, None, None],
        tmp_path: Path,
    ) -> ZueriCrop:
        """Return a dataset whose download step copies the local test files.

        The class-level ``urls`` and ``md5s`` are patched so that the regular
        download + checksum code path runs against ``tests/data/zuericrop``.
        """
        monkeypatch.setattr(  # type: ignore[attr-defined]
            torchgeo.datasets.zuericrop, "download_url", download_url
        )
        data_dir = os.path.join("tests", "data", "zuericrop")
        urls = [
            os.path.join(data_dir, "ZueriCrop.hdf5"),
            os.path.join(data_dir, "labels.csv"),
        ]
        md5s = ["8c0ca5ad53903aeba8a1d06bba50a5ec", "d41d8cd98f00b204e9800998ecf8427e"]
        monkeypatch.setattr(ZueriCrop, "urls", urls)  # type: ignore[attr-defined]
        monkeypatch.setattr(ZueriCrop, "md5s", md5s)  # type: ignore[attr-defined]
        root = str(tmp_path)
        transforms = Identity()
        return ZueriCrop(root, transforms, download=True, checksum=True)

    @pytest.fixture
    def mock_missing_module(
        self, monkeypatch: Generator[MonkeyPatch, None, None]
    ) -> None:
        """Make ``import h5py`` fail while leaving every other import intact."""
        import_orig = builtins.__import__

        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
            if name == "h5py":
                raise ImportError()
            return import_orig(name, *args, **kwargs)

        monkeypatch.setattr(  # type: ignore[attr-defined]
            builtins, "__import__", mocked_import
        )

    def test_getitem(self, dataset: ZueriCrop) -> None:
        """A sample is a dict of tensors with the expected ranks."""
        x = dataset[0]
        assert isinstance(x, dict)
        assert isinstance(x["image"], torch.Tensor)
        assert isinstance(x["mask"], torch.Tensor)
        assert isinstance(x["boxes"], torch.Tensor)
        assert isinstance(x["label"], torch.Tensor)

        # Image tests
        assert x["image"].ndim == 4

        # Instance masks tests
        assert x["mask"].ndim == 3
        assert x["mask"].shape[-2:] == x["image"].shape[-2:]

        # Bboxes tests
        assert x["boxes"].ndim == 2
        assert x["boxes"].shape[1] == 4

        # Labels tests
        assert x["label"].ndim == 1

    def test_len(self, dataset: ZueriCrop) -> None:
        """The test HDF5 file holds two samples."""
        assert len(dataset) == 2

    def test_already_downloaded(self, dataset: ZueriCrop) -> None:
        """Re-creating the dataset on a populated root must not fail."""
        ZueriCrop(root=dataset.root, download=True)

    def test_not_downloaded(self, tmp_path: Path) -> None:
        """An empty root with ``download=False`` raises the documented error."""
        # The parentheses are required: without them only the first fragment is
        # assigned and the remaining lines are no-op expression statements.
        err = (
            "Dataset not found in `root` directory and `download=False`, "
            "either specify a different `root` directory or use `download=True` "
            "to automaticaly download the dataset."
        )
        with pytest.raises(RuntimeError, match=err):
            ZueriCrop(str(tmp_path))

    def test_mock_missing_module(
        self, dataset: ZueriCrop, tmp_path: Path, mock_missing_module: None
    ) -> None:
        """Constructing the dataset without h5py raises a helpful ImportError."""
        with pytest.raises(
            ImportError,
            match="h5py is not installed and is required to use this dataset",
        ):
            ZueriCrop(dataset.root, download=True, checksum=True)
--- a/torchgeo/datasets/__init__.py
+++ b/torchgeo/datasets/__init__.py
@ -50,6 +50,7 @@ from .sentinel import Sentinel, Sentinel2
 from .so2sat import So2Sat
 from .spacenet import SpaceNet1
 from .utils import BoundingBox, collate_dict
 from .zuericrop import ZueriCrop
 __all__ = (
    # GeoDataset
@ -98,6 +99,7 @@ __all__ = (
    "SpaceNet1",
    "TropicalCycloneWindEstimation",
    "VHR10",
    "ZueriCrop",
    # Base classes
    "GeoDataset",
    "RasterDataset",
--- a/torchgeo/datasets/zuericrop.py
+++ b/torchgeo/datasets/zuericrop.py
@ -0,0 +1,232 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 """ZueriCrop dataset."""
 import os
 from typing import Callable, Dict, Optional, Tuple
 import torch
 from torch import Tensor
 from .geo import VisionDataset
 from .utils import download_url
 class ZueriCrop(VisionDataset):
    """ZueriCrop dataset.

    The `ZueriCrop <https://github.com/0zgur0/ms-convSTAR>`_
    dataset is a dataset for time-series instance segmentation of crops.

    Dataset features:

    * Sentinel-2 multispectral imagery
    * instance masks of 48 crop categories
    * nine multispectral bands
    * 116k images with 10 m per pixel resolution (24x24 px)
    * ~28k time-series containing 142 images each

    Dataset format:

    * single hdf5 dataset containing images, semantic masks, and instance masks
    * data is parsed into images and instance masks, boxes, and labels
    * one mask per time-series

    Dataset classes:

    * 48 fine-grained hierarchical crop
      `categories <https://github.com/0zgur0/ms-convSTAR/blob/master/labels.csv>`_

    If you use this dataset in your research, please cite the following paper:

    * https://doi.org/10.1016/j.rse.2021.112603

    .. note::

       This dataset requires the following additional library to be installed:

       * `h5py <https://pypi.org/project/h5py/>`_ to load the dataset
    """

    # ``urls``, ``md5s`` and ``filenames`` are parallel lists: entry *i* of each
    # describes the same file (see ``_download``).
    urls = [
        "https://polybox.ethz.ch/index.php/s/uXfdr2AcXE3QNB6/download",
        "https://raw.githubusercontent.com/0zgur0/ms-convSTAR/master/labels.csv",
    ]
    md5s = ["1635231df67f3d25f4f1e62c98e221a4", "5118398c7a5bbc246f5f6bb35d8d529b"]
    filenames = ["ZueriCrop.hdf5", "labels.csv"]

    def __init__(
        self,
        root: str = "data",
        transforms: Optional[Callable[[Dict[str, Tensor]], Dict[str, Tensor]]] = None,
        download: bool = False,
        checksum: bool = False,
    ) -> None:
        """Initialize a new ZueriCrop dataset instance.

        Args:
            root: root directory where dataset can be found
            transforms: a function/transform that takes input sample and its target as
                entry and returns a transformed version
            download: if True, download dataset and store it in the root directory
            checksum: if True, check the MD5 of the downloaded files (may be slow)

        Raises:
            ImportError: if h5py is not installed
            RuntimeError: if ``download=False`` and data is not found, or checksums
                don't match
        """
        self.root = root
        self.transforms = transforms
        self.download = download
        self.checksum = checksum
        self.filepath = os.path.join(root, "ZueriCrop.hdf5")

        self._verify()

        # h5py is an optional dependency; fail early with an actionable message
        # rather than on the first sample access.
        try:
            import h5py  # noqa: F401
        except ImportError:
            raise ImportError(
                "h5py is not installed and is required to use this dataset"
            )

    def __getitem__(self, index: int) -> Dict[str, Tensor]:
        """Return an index within the dataset.

        Args:
            index: index to return

        Returns:
            sample containing image, mask, bounding boxes, and target label
        """
        image = self._load_image(index)
        mask, boxes, label = self._load_target(index)

        sample = {"image": image, "mask": mask, "boxes": boxes, "label": label}

        if self.transforms is not None:
            sample = self.transforms(sample)

        return sample

    def __len__(self) -> int:
        """Return the number of data points in the dataset.

        Returns:
            length of the dataset
        """
        import h5py

        # The first axis of the "data" array indexes the samples
        with h5py.File(self.filepath, "r") as f:
            length: int = f["data"].shape[0]
        return length

    def _load_image(self, index: int) -> Tensor:
        """Load a single image.

        Args:
            index: index to return

        Returns:
            the image
        """
        import h5py

        with h5py.File(self.filepath, "r") as f:
            array = f["data"][index, ...]

        tensor: Tensor = torch.from_numpy(array)  # type: ignore[attr-defined]
        # Convert from TxHxWxC to TxCxHxW
        tensor = tensor.permute((0, 3, 1, 2))
        return tensor

    def _load_target(self, index: int) -> Tuple[Tensor, Tensor, Tensor]:
        """Load the instance masks, bounding boxes, and labels for a single image.

        Args:
            index: index to return

        Returns:
            a tuple of (binary instance masks, ``[xmin, ymin, xmax, ymax]`` boxes,
            class label of each instance); the boxes always have four columns,
            even when the sample contains no instances
        """
        import h5py

        with h5py.File(self.filepath, "r") as f:
            mask_array = f["gt"][index, ...]
            instance_array = f["gt_instance"][index, ...]

        mask_tensor = torch.from_numpy(mask_array)  # type: ignore[attr-defined]
        instance_tensor = torch.from_numpy(instance_array)  # type: ignore[attr-defined]

        # Convert from HxWxC to CxHxW
        mask_tensor = mask_tensor.permute((2, 0, 1))
        instance_tensor = instance_tensor.permute((2, 0, 1))

        # Convert instance mask of N instances to N binary instance masks
        instance_ids = torch.unique(instance_tensor)  # type: ignore[attr-defined]
        # Exclude a mask for unknown/background
        instance_ids = instance_ids[instance_ids != 0]
        # Broadcasting the ids against the instance map yields one mask per id
        instance_ids = instance_ids[:, None, None]
        masks: Tensor = instance_tensor == instance_ids

        # Parse labels for each instance
        labels_list = []
        for mask in masks:
            label = mask_tensor[mask[None, :, :]]
            # Keep the first unique class id found under the instance mask
            label = torch.unique(label)[0]  # type: ignore[attr-defined]
            labels_list.append(label)

        # Get bounding boxes for each instance
        boxes_list = []
        for mask in masks:
            # torch.where returns (row indices, column indices) of the set pixels
            pos = torch.where(mask)  # type: ignore[attr-defined]
            xmin = torch.min(pos[1])  # type: ignore[attr-defined]
            xmax = torch.max(pos[1])  # type: ignore[attr-defined]
            ymin = torch.min(pos[0])  # type: ignore[attr-defined]
            ymax = torch.max(pos[0])  # type: ignore[attr-defined]
            boxes_list.append([xmin, ymin, xmax, ymax])

        masks = masks.to(torch.uint8)  # type: ignore[attr-defined]
        boxes = torch.tensor(boxes_list).to(torch.float)  # type: ignore[attr-defined]
        # An empty list becomes a 1-D tensor; keep the (N, 4) layout when N == 0
        boxes = boxes.reshape(-1, 4)
        labels = torch.tensor(labels_list).to(torch.long)  # type: ignore[attr-defined]

        return masks, boxes, labels

    def _verify(self) -> None:
        """Verify the integrity of the dataset.

        Files that are already present in ``root`` are accepted as-is; checksums
        are only checked for files fetched by :meth:`_download`.

        Raises:
            RuntimeError: if ``download=False`` but dataset is missing or checksum fails
        """
        # Check if the files already exist
        exists = []
        for filename in self.filenames:
            filepath = os.path.join(self.root, filename)
            exists.append(os.path.exists(filepath))

        if all(exists):
            return

        # Check if the user requested to download the dataset
        if not self.download:
            raise RuntimeError(
                "Dataset not found in `root` directory and `download=False`, "
                "either specify a different `root` directory or use `download=True` "
                "to automaticaly download the dataset."
            )

        # Download the dataset
        self._download()

    def _download(self) -> None:
        """Download every dataset file that is not already present in ``root``."""
        for url, filename, md5 in zip(self.urls, self.filenames, self.md5s):
            filepath = os.path.join(self.root, filename)
            if not os.path.exists(filepath):
                download_url(
                    url,
                    self.root,
                    filename=filename,
                    # Only ask the downloader to check the MD5 when requested
                    md5=md5 if self.checksum else None,
                )