Mirror of https://github.com/microsoft/torchgeo.git
Add CaBuAr dataset (#2235)
* 🆕 Added CaBuAr dataset
* 🆕 Added CaBuAr datamodule
* 🔨 Added CaBuAr datamodule test
* 🔨 Corrected CaBuAr typing and datamodule test
* 🔨 Updated tests, corrected docs, minor fixes to dataset and datamodule
* 🔨 CaBuAr test fixes
This commit is contained in:
Parent
042b75ea9c
Commit
ccc314cd88
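For orientation, here is a minimal usage sketch of the dataset this commit adds; the root path is illustrative, and the sample keys and shapes follow the dataset code further down in the diff.

from torchgeo.datasets import CaBuAr

# Download the two HDF5 files if missing, then load the training split with all 12 bands
ds = CaBuAr(root='data/cabuar', split='train', download=True)
sample = ds[0]  # dict with 'image' (24 x 512 x 512) and 'mask' (512 x 512)
ds.plot(sample, suptitle='CaBuAr sample')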
@@ -57,6 +57,11 @@ BigEarthNet

 .. autoclass:: BigEarthNetDataModule

+CaBuAr
+^^^^^^
+
+.. autoclass:: CaBuArDataModule
+
 ChaBuD
 ^^^^^^
@@ -217,6 +217,11 @@ BioMassters

 .. autoclass:: BioMassters

+CaBuAr
+^^^^^^
+
+.. autoclass:: CaBuAr
+
 ChaBuD
 ^^^^^^
@@ -3,6 +3,7 @@ Dataset,Task,Source,License,# Samples,# Classes,Size (px),Resolution (m),Bands
 `Benin Cashew Plantations`_,S,Airbus Pléiades,"CC-BY-4.0",70,6,"1,122x1,186",10,MSI
 `BigEarthNet`_,C,Sentinel-1/2,"CDLA-Permissive-1.0","590,326",19--43,120x120,10,"SAR, MSI"
 `BioMassters`_,R,Sentinel-1/2 and Lidar,"CC-BY-4.0",,,256x256, 10, "SAR, MSI"
+`CaBuAr`_,CD,Sentinel-2,"OpenRAIL",424,2,512x512,20,MSI
 `ChaBuD`_,CD,Sentinel-2,"OpenRAIL",356,2,512x512,10,MSI
 `Cloud Cover Detection`_,S,Sentinel-2,"CC-BY-4.0","22,728",2,512x512,10,MSI
 `COWC`_,"C, R","CSUAV AFRL, ISPRS, LINZ, AGRC","AGPL-3.0-only","388,435",2,256x256,0.15,RGB
@@ -0,0 +1,16 @@
model:
  class_path: SemanticSegmentationTask
  init_args:
    loss: "ce"
    model: "unet"
    backbone: "resnet18"
    in_channels: 24
    num_classes: 2
    num_filters: 1
    ignore_index: null
data:
  class_path: CaBuArDataModule
  init_args:
    batch_size: 2
  dict_kwargs:
    root: "tests/data/cabuar"
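For reference, a rough Python equivalent of what this test config wires together (a sketch only: the keyword arguments mirror the YAML above, and in_channels is 24 because the datamodule stacks the pre- and post-fire acquisitions of 12 bands each).

from torchgeo.datamodules import CaBuArDataModule
from torchgeo.trainers import SemanticSegmentationTask

task = SemanticSegmentationTask(
    loss='ce',
    model='unet',
    backbone='resnet18',
    in_channels=24,  # 2 acquisitions x 12 Sentinel-2 bands
    num_classes=2,
    num_filters=1,
    ignore_index=None,
)
datamodule = CaBuArDataModule(batch_size=2, root='tests/data/cabuar')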
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,69 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import hashlib
import os
import random

import h5py
import numpy as np

# Sentinel-2 is 12-bit with range 0-4095
SENTINEL2_MAX = 4096

NUM_CHANNELS = 12
NUM_CLASSES = 2
SIZE = 32

np.random.seed(0)
random.seed(0)

filenames = ['512x512.hdf5', 'chabud_test.h5']
fold_mapping = {'train': [1, 2, 3, 4], 'val': [0], 'test': ['chabud']}

uris = [
    'feb08801-64b1-4d11-a3fc-0efaad1f4274_0',
    'e4d4dbcb-dd92-40cf-a7fe-fda8dd35f367_1',
    '9fc8c1f4-1858-47c3-953e-1dc8b179a',
    '3a1358a2-6155-445a-a269-13bebd9741a8_0',
    '2f8e659c-f457-4527-a57f-bffc3bbe0baa_0',
    '299ee670-19b1-4a76-bef3-34fd55580711_1',
    '05cfef86-3e27-42be-a0cb-a61fe2f89e40_0',
    '0328d12a-4ad8-4504-8ac5-70089db10b4e_1',
    '04800581-b540-4f9b-9df8-7ee433e83f46_0',
    '108ae2a9-d7d6-42f7-b89a-90bb75c23ccb_0',
    '29413474-04b8-4bb1-8b89-fd640023d4a6_0',
    '43f2e60a-73b4-4f33-b99e-319d892fcab4_0',
]
folds = random.choices(fold_mapping['train'], k=4) + [0] * 4 + ['chabud'] * 4
files = ['512x512.hdf5'] * 8 + ['chabud_test.h5'] * 4

# Remove old data
for filename in filenames:
    if os.path.exists(filename):
        os.remove(filename)

# Create dataset file
data = np.random.randint(
    SENTINEL2_MAX, size=(SIZE, SIZE, NUM_CHANNELS), dtype=np.uint16
)
gt = np.random.randint(NUM_CLASSES, size=(SIZE, SIZE, 1), dtype=np.uint16)

for filename, uri, fold in zip(files, uris, folds):
    with h5py.File(filename, 'a') as f:
        sample = f.create_group(uri)
        sample.attrs.create(
            name='fold', data=np.int64(fold) if fold != 'chabud' else fold
        )
        sample.create_dataset('pre_fire', data=data)
        sample.create_dataset('post_fire', data=data)
        sample.create_dataset('mask', data=gt)

# Compute checksums
for filename in filenames:
    with open(filename, 'rb') as f:
        md5 = hashlib.md5(f.read()).hexdigest()
        print(f'{filename} md5: {md5}')
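A quick way to sanity-check the generated fixtures, assuming the layout written by the script above (one HDF5 group per patch with a 'fold' attribute and 'pre_fire'/'post_fire'/'mask' datasets); this snippet is illustrative and not part of the commit.

import h5py

for filename in ('512x512.hdf5', 'chabud_test.h5'):
    with h5py.File(filename, 'r') as f:
        for uuid, group in f.items():
            print(filename, uuid, group.attrs['fold'], group['pre_fire'].shape)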
@@ -0,0 +1,92 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import pytest
import torch
import torch.nn as nn
from _pytest.fixtures import SubRequest
from pytest import MonkeyPatch

from torchgeo.datasets import CaBuAr, DatasetNotFoundError

pytest.importorskip('h5py', minversion='3.6')


class TestCaBuAr:
    @pytest.fixture(
        params=product([CaBuAr.all_bands, CaBuAr.rgb_bands], ['train', 'val', 'test'])
    )
    def dataset(
        self, monkeypatch: MonkeyPatch, tmp_path: Path, request: SubRequest
    ) -> CaBuAr:
        data_dir = os.path.join('tests', 'data', 'cabuar')
        urls = (
            os.path.join(data_dir, '512x512.hdf5'),
            os.path.join(data_dir, 'chabud_test.h5'),
        )
        monkeypatch.setattr(CaBuAr, 'urls', urls)
        bands, split = request.param
        root = tmp_path
        transforms = nn.Identity()
        return CaBuAr(
            root=root,
            split=split,
            bands=bands,
            transforms=transforms,
            download=True,
            checksum=True,
        )

    def test_getitem(self, dataset: CaBuAr) -> None:
        x = dataset[0]
        assert isinstance(x, dict)
        assert isinstance(x['image'], torch.Tensor)
        assert isinstance(x['mask'], torch.Tensor)

        # Image tests
        assert x['image'].ndim == 3

        if dataset.bands == CaBuAr.rgb_bands:
            assert x['image'].shape[0] == 2 * 3
        elif dataset.bands == CaBuAr.all_bands:
            assert x['image'].shape[0] == 2 * 12

        # Mask tests:
        assert x['mask'].ndim == 2

    def test_len(self, dataset: CaBuAr) -> None:
        assert len(dataset) == 4

    def test_already_downloaded(self, dataset: CaBuAr) -> None:
        CaBuAr(root=dataset.root, download=True)

    def test_not_downloaded(self, tmp_path: Path) -> None:
        with pytest.raises(DatasetNotFoundError, match='Dataset not found'):
            CaBuAr(tmp_path)

    def test_invalid_bands(self) -> None:
        with pytest.raises(AssertionError):
            CaBuAr(bands=('OK', 'BK'))

    def test_plot(self, dataset: CaBuAr) -> None:
        dataset.plot(dataset[0], suptitle='Test')
        plt.close()

        sample = dataset[0]
        sample['prediction'] = sample['mask'].clone()
        dataset.plot(sample, suptitle='prediction')
        plt.close()

    def test_plot_rgb(self, dataset: CaBuAr) -> None:
        dataset = CaBuAr(root=dataset.root, bands=('B02',))
        with pytest.raises(ValueError, match="doesn't contain some of the RGB bands"):
            dataset.plot(dataset[0], suptitle='Single Band')

    def test_invalid_split(self, dataset: CaBuAr) -> None:
        with pytest.raises(AssertionError):
            CaBuAr(dataset.root, split='foo')
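To exercise just these tests, a small driver sketch (equivalent to running pytest tests/datasets/test_cabuar.py -q from the repository root):

import pytest

raise SystemExit(pytest.main(['tests/datasets/test_cabuar.py', '-q']))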
@@ -50,6 +50,7 @@ class TestSemanticSegmentationTask:
         'name',
         [
             'agrifieldnet',
+            'cabuar',
             'chabud',
             'chesapeake_cvpr_5',
             'chesapeake_cvpr_7',
@@ -83,7 +84,7 @@ class TestSemanticSegmentationTask:
         self, monkeypatch: MonkeyPatch, name: str, fast_dev_run: bool
     ) -> None:
         match name:
-            case 'chabud':
+            case 'chabud' | 'cabuar':
                 pytest.importorskip('h5py', minversion='3.6')
             case 'landcoverai':
                 sha256 = (
@@ -5,6 +5,7 @@

 from .agrifieldnet import AgriFieldNetDataModule
 from .bigearthnet import BigEarthNetDataModule
+from .cabuar import CaBuArDataModule
 from .chabud import ChaBuDDataModule
 from .chesapeake import ChesapeakeCVPRDataModule
 from .cowc import COWCCountingDataModule
@@ -65,6 +66,7 @@ __all__ = (
     'SouthAfricaCropTypeDataModule',
     # NonGeoDataset
     'BigEarthNetDataModule',
+    'CaBuArDataModule',
     'ChaBuDDataModule',
     'COWCCountingDataModule',
     'DeepGlobeLandCoverDataModule',
@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""CaBuAr datamodule."""

from typing import Any

import torch
from einops import repeat

from ..datasets import CaBuAr
from .geo import NonGeoDataModule


class CaBuArDataModule(NonGeoDataModule):
    """LightningDataModule implementation for the CaBuAr dataset.

    Uses the train/val/test splits from the dataset.

    .. versionadded:: 0.6
    """

    # min/max values computed on train set using 2/98 percentiles
    min = torch.tensor(
        [0.0, 1.0, 73.0, 39.0, 46.0, 25.0, 26.0, 21.0, 17.0, 1.0, 20.0, 21.0]
    )
    max = torch.tensor(
        [
            1926.0,
            2174.0,
            2527.0,
            2950.0,
            3237.0,
            3717.0,
            4087.0,
            4271.0,
            4290.0,
            4219.0,
            4568.0,
            3753.0,
        ]
    )

    def __init__(
        self, batch_size: int = 64, num_workers: int = 0, **kwargs: Any
    ) -> None:
        """Initialize a new CaBuArDataModule instance.

        Args:
            batch_size: Size of each mini-batch.
            num_workers: Number of workers for parallel data loading.
            **kwargs: Additional keyword arguments passed to
                :class:`~torchgeo.datasets.CaBuAr`.
        """
        bands = kwargs.get('bands', CaBuAr.all_bands)
        band_indices = [CaBuAr.all_bands.index(b) for b in bands]
        mins = self.min[band_indices]
        maxs = self.max[band_indices]

        # Change detection, 2 images from different times
        mins = repeat(mins, 'c -> (t c)', t=2)
        maxs = repeat(maxs, 'c -> (t c)', t=2)

        self.mean = mins
        self.std = maxs - mins

        super().__init__(CaBuAr, batch_size, num_workers, **kwargs)
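The mean/std assignment above leans on the base datamodule's per-channel normalization, so with mean = min and std = max - min the transform amounts to min-max scaling; a small sketch of that arithmetic (the band values are the first three entries of the tensors above, and the base-class behaviour is assumed rather than shown here).

import torch
from einops import repeat

mins = repeat(torch.tensor([0.0, 1.0, 73.0]), 'c -> (t c)', t=2)  # pre + post copies
maxs = repeat(torch.tensor([1926.0, 2174.0, 2527.0]), 'c -> (t c)', t=2)
x = torch.rand(6, 4, 4) * 2000  # fake 6-channel patch
x_scaled = (x - mins[:, None, None]) / (maxs - mins)[:, None, None]  # == (x - mean) / std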
@@ -11,6 +11,7 @@ from .astergdem import AsterGDEM
 from .benin_cashews import BeninSmallHolderCashews
 from .bigearthnet import BigEarthNet
 from .biomassters import BioMassters
+from .cabuar import CaBuAr
 from .cbf import CanadianBuildingFootprints
 from .cdl import CDL
 from .chabud import ChaBuD
@@ -199,6 +200,7 @@ __all__ = (
     'BeninSmallHolderCashews',
     'BigEarthNet',
     'BioMassters',
+    'CaBuAr',
     'ChaBuD',
     'CloudCoverDetection',
     'COWC',
@@ -0,0 +1,303 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""CaBuAr dataset."""

import os
from collections.abc import Callable
from typing import ClassVar

import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.figure import Figure
from torch import Tensor

from .errors import DatasetNotFoundError
from .geo import NonGeoDataset
from .utils import Path, download_url, lazy_import, percentile_normalization


class CaBuAr(NonGeoDataset):
    """CaBuAr dataset.

    `CaBuAr <https://huggingface.co/datasets/DarthReca/california_burned_areas>`__
    is a dataset for Change detection for Burned area Delineation; part of its
    splits were used for the ChaBuD ECML-PKDD 2023 Discovery Challenge.

    Dataset features:

    * Sentinel-2 multispectral imagery
    * binary masks of burned areas
    * 12 multispectral bands
    * 424 pairs of pre and post images with 20 m per pixel resolution (512x512 px)

    Dataset format:

    * single hdf5 dataset containing images and masks

    Dataset classes:

    0. no change
    1. burned area

    If you use this dataset in your research, please cite the following paper:

    * https://doi.org/10.1109/MGRS.2023.3292467

    .. note::

       This dataset requires the following additional library to be installed:

       * `h5py <https://pypi.org/project/h5py/>`_ to load the dataset

    .. versionadded:: 0.6
    """

    all_bands = (
        'B01',
        'B02',
        'B03',
        'B04',
        'B05',
        'B06',
        'B07',
        'B08',
        'B8A',
        'B09',
        'B11',
        'B12',
    )
    rgb_bands = ('B04', 'B03', 'B02')
    folds: ClassVar[dict[str, list[object]]] = {
        'train': [1, 2, 3, 4],
        'val': [0],
        'test': ['chabud'],
    }
    urls = (
        'https://huggingface.co/datasets/DarthReca/california_burned_areas/resolve/main/raw/patched/512x512.hdf5',
        'https://huggingface.co/datasets/DarthReca/california_burned_areas/resolve/main/raw/patched/chabud_test.h5',
    )
    filenames = ('512x512.hdf5', 'chabud_test.h5')
    md5s = ('15d78fb825f9a81dad600db828d22c08', 'a70bb7e4a2788657c2354c4c3d9296fe')

    def __init__(
        self,
        root: Path = 'data',
        split: str = 'train',
        bands: tuple[str, ...] = all_bands,
        transforms: Callable[[dict[str, Tensor]], dict[str, Tensor]] | None = None,
        download: bool = False,
        checksum: bool = False,
    ) -> None:
        """Initialize a new CaBuAr dataset instance.

        Args:
            root: root directory where dataset can be found
            split: one of "train", "val", "test"
            bands: the subset of bands to load
            transforms: a function/transform that takes input sample and its target as
                entry and returns a transformed version
            download: if True, download dataset and store it in the root directory
            checksum: if True, check the MD5 of the downloaded files (may be slow)

        Raises:
            AssertionError: If ``split`` or ``bands`` arguments are invalid.
            DatasetNotFoundError: If dataset is not found and *download* is False.
            DependencyNotFoundError: If h5py is not installed.
        """
        lazy_import('h5py')

        assert split in self.folds
        assert set(bands) <= set(self.all_bands)

        # Set the file index based on the split
        file_index = 1 if split == 'test' else 0

        self.root = root
        self.split = split
        self.bands = bands
        self.transforms = transforms
        self.download = download
        self.checksum = checksum
        self.filepath = os.path.join(root, self.filenames[file_index])
        self.band_indices = [self.all_bands.index(b) for b in bands]

        self._verify()

        self.uuids = self._load_uuids()

    def __getitem__(self, index: int) -> dict[str, Tensor]:
        """Return an index within the dataset.

        Args:
            index: index to return

        Returns:
            sample containing image and mask
        """
        image = self._load_image(index)
        mask = self._load_target(index)

        sample = {'image': image, 'mask': mask}

        if self.transforms is not None:
            sample = self.transforms(sample)

        return sample

    def __len__(self) -> int:
        """Return the number of data points in the dataset.

        Returns:
            length of the dataset
        """
        return len(self.uuids)

    def _load_uuids(self) -> list[str]:
        """Return the image uuids for the given split.

        Returns:
            the image uuids
        """
        h5py = lazy_import('h5py')
        uuids = []
        with h5py.File(self.filepath, 'r') as f:
            for k, v in f.items():
                if v.attrs['fold'] in self.folds[self.split] and 'pre_fire' in v.keys():
                    uuids.append(k)
        return sorted(uuids)

    def _load_image(self, index: int) -> Tensor:
        """Load a single image.

        Args:
            index: index to return

        Returns:
            the image
        """
        h5py = lazy_import('h5py')
        uuid = self.uuids[index]
        with h5py.File(self.filepath, 'r') as f:
            pre_array = f[uuid]['pre_fire'][:]
            post_array = f[uuid]['post_fire'][:]

        # index specified bands and concatenate
        pre_array = pre_array[..., self.band_indices]
        post_array = post_array[..., self.band_indices]
        array = np.concatenate([pre_array, post_array], axis=-1).astype(np.float32)

        tensor = torch.from_numpy(array)
        # Convert from HxWxC to CxHxW
        tensor = tensor.permute((2, 0, 1))
        return tensor

    def _load_target(self, index: int) -> Tensor:
        """Load the target mask for a single image.

        Args:
            index: index to return

        Returns:
            the target mask
        """
        h5py = lazy_import('h5py')
        uuid = self.uuids[index]
        with h5py.File(self.filepath, 'r') as f:
            array = f[uuid]['mask'][:].astype(np.int32).squeeze(axis=-1)

        tensor = torch.from_numpy(array)
        tensor = tensor.to(torch.long)
        return tensor

    def _verify(self) -> None:
        """Verify the integrity of the dataset."""
        # Check if the files already exist
        exists = []
        for filename in self.filenames:
            filepath = os.path.join(self.root, filename)
            exists.append(os.path.exists(filepath))

        if all(exists):
            return

        # Check if the user requested to download the dataset
        if not self.download:
            raise DatasetNotFoundError(self)

        # Download the dataset
        self._download()

    def _download(self) -> None:
        """Download the dataset."""
        for url, filename, md5 in zip(self.urls, self.filenames, self.md5s):
            filepath = os.path.join(self.root, filename)
            if not os.path.exists(filepath):
                download_url(
                    url,
                    self.root,
                    filename=filename,
                    md5=md5 if self.checksum else None,
                )

    def plot(
        self,
        sample: dict[str, Tensor],
        show_titles: bool = True,
        suptitle: str | None = None,
    ) -> Figure:
        """Plot a sample from the dataset.

        Args:
            sample: a sample returned by :meth:`__getitem__`
            show_titles: flag indicating whether to show titles above each panel
            suptitle: optional suptitle to use for figure

        Returns:
            a matplotlib Figure with the rendered sample
        """
        rgb_indices = []
        for band in self.rgb_bands:
            if band in self.bands:
                rgb_indices.append(self.bands.index(band))
            else:
                raise ValueError("Dataset doesn't contain some of the RGB bands")

        mask = sample['mask'].numpy()
        image_pre = sample['image'][: len(self.bands)][rgb_indices].numpy()
        image_post = sample['image'][len(self.bands) :][rgb_indices].numpy()
        image_pre = percentile_normalization(image_pre)
        image_post = percentile_normalization(image_post)

        ncols = 3

        showing_predictions = 'prediction' in sample
        if showing_predictions:
            prediction = sample['prediction']
            ncols += 1

        fig, axs = plt.subplots(nrows=1, ncols=ncols, figsize=(10, ncols * 5))

        axs[0].imshow(np.transpose(image_pre, (1, 2, 0)))
        axs[0].axis('off')
        axs[1].imshow(np.transpose(image_post, (1, 2, 0)))
        axs[1].axis('off')
        axs[2].imshow(mask)
        axs[2].axis('off')

        if showing_predictions:
            axs[3].imshow(prediction)
            axs[3].axis('off')

        if show_titles:
            axs[0].set_title('Image Pre')
            axs[1].set_title('Image Post')
            axs[2].set_title('Mask')
            if showing_predictions:
                axs[3].set_title('Prediction')

        if suptitle is not None:
            plt.suptitle(suptitle)

        return fig
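Because _load_image concatenates the pre- and post-fire bands along the channel axis, a sample's image tensor can be split back into the two acquisitions; a brief, illustrative sketch (the root path is an assumption and the data must already be on disk).

from torchgeo.datasets import CaBuAr

ds = CaBuAr(root='data/cabuar', split='val')
sample = ds[0]
pre, post = sample['image'].chunk(2, dim=0)  # each 12 x 512 x 512 with the default bands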