Datasets: support os.PathLike (#2273)

2024-09-10 17:27:04 +02:00 · 2024-09-10 17:27:04 +02:00 · 891f192637
--- a/tests/datasets/test_geo.py
+++ b/tests/datasets/test_geo.py
@ -38,7 +38,7 @@ class CustomGeoDataset(GeoDataset):
        bounds: BoundingBox = BoundingBox(0, 1, 2, 3, 4, 5),
        crs: CRS = CRS.from_epsg(4087),
        res: float = 1,
-        paths: str | Path | Iterable[str | Path] | None = None,
+        paths: str | os.PathLike[str] | Iterable[str | os.PathLike[str]] | None = None,
    ) -> None:
        super().__init__()
        self.index.insert(0, tuple(bounds))
--- a/torchgeo/datasets/agb_live_woody_density.py
+++ b/torchgeo/datasets/agb_live_woody_density.py
@ -5,7 +5,6 @@

 import json
 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any

@ -106,7 +105,7 @@ class AbovegroundLiveWoodyBiomassDensity(RasterDataset):

    def _download(self) -> None:
        """Download the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        download_url(self.url, self.paths, self.base_filename)

        with open(os.path.join(self.paths, self.base_filename)) as f:
--- a/torchgeo/datasets/agrifieldnet.py
+++ b/torchgeo/datasets/agrifieldnet.py
@ -4,7 +4,6 @@
 """AgriFieldNet India Challenge dataset."""

 import os
-import pathlib
 import re
 from collections.abc import Callable, Iterable, Sequence
 from typing import Any, ClassVar, cast
@ -181,10 +180,10 @@ class AgriFieldNet(RasterDataset):
        Returns:
            data, label, and field ids at that index
        """
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)

        hits = self.index.intersection(tuple(query), objects=True)
-        filepaths = cast(list[Path], [hit.object for hit in hits])
+        filepaths = cast(list[str], [hit.object for hit in hits])

        if not filepaths:
            raise IndexError(
@ -246,7 +245,7 @@ class AgriFieldNet(RasterDataset):

    def _download(self) -> None:
        """Download the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        os.makedirs(self.paths, exist_ok=True)
        azcopy = which('azcopy')
        azcopy('sync', f'{self.url}', self.paths, '--recursive=true')
--- a/torchgeo/datasets/cbf.py
+++ b/torchgeo/datasets/cbf.py
@ -4,7 +4,6 @@
 """Canadian Building Footprints dataset."""

 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any

@ -105,7 +104,7 @@ class CanadianBuildingFootprints(VectorDataset):
        Returns:
            True if dataset files are found and/or MD5s match, else False
        """
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
            filepath = os.path.join(self.paths, prov_terr + '.zip')
            if not check_integrity(filepath, md5 if self.checksum else None):
@ -117,7 +116,7 @@ class CanadianBuildingFootprints(VectorDataset):
        if self._check_integrity():
            print('Files already downloaded and verified')
            return
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
            download_and_extract_archive(
                self.url + prov_terr + '.zip',
--- a/torchgeo/datasets/cdl.py
+++ b/torchgeo/datasets/cdl.py
@ -4,7 +4,6 @@
 """CDL dataset."""

 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any, ClassVar

@ -295,7 +294,7 @@ class CDL(RasterDataset):

        # Check if the zip files have already been downloaded
        exists = []
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        for year in self.years:
            pathname = os.path.join(
                self.paths, self.zipfile_glob.replace('*', str(year))
@ -328,7 +327,7 @@ class CDL(RasterDataset):

    def _extract(self) -> None:
        """Extract the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        for year in self.years:
            zipfile_name = self.zipfile_glob.replace('*', str(year))
            pathname = os.path.join(self.paths, zipfile_name)
--- a/torchgeo/datasets/chesapeake.py
+++ b/torchgeo/datasets/chesapeake.py
@ -5,7 +5,6 @@

 import glob
 import os
-import pathlib
 import sys
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Iterable, Sequence
@ -173,7 +172,7 @@ class Chesapeake(RasterDataset, ABC):
            return

        # Check if the zip file has already been downloaded
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        if glob.glob(os.path.join(self.paths, '**', '*.zip'), recursive=True):
            self._extract()
            return
@ -195,7 +194,7 @@ class Chesapeake(RasterDataset, ABC):

    def _extract(self) -> None:
        """Extract the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        for file in glob.iglob(os.path.join(self.paths, '**', '*.zip'), recursive=True):
            extract_archive(file)

--- a/torchgeo/datasets/cms_mangrove_canopy.py
+++ b/torchgeo/datasets/cms_mangrove_canopy.py
@ -4,7 +4,6 @@
 """CMS Global Mangrove Canopy dataset."""

 import os
-import pathlib
 from collections.abc import Callable
 from typing import Any

@ -229,7 +228,7 @@ class CMSGlobalMangroveCanopy(RasterDataset):
            return

        # Check if the zip file has already been downloaded
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, self.zipfile)
        if os.path.exists(pathname):
            if self.checksum and not check_integrity(pathname, self.md5):
@ -241,7 +240,7 @@ class CMSGlobalMangroveCanopy(RasterDataset):

    def _extract(self) -> None:
        """Extract the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, self.zipfile)
        extract_archive(pathname)

--- a/torchgeo/datasets/esri2020.py
+++ b/torchgeo/datasets/esri2020.py
@ -5,7 +5,6 @@

 import glob
 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any

@ -113,7 +112,7 @@ class Esri2020(RasterDataset):
            return

        # Check if the zip files have already been downloaded
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, self.zipfile)
        if glob.glob(pathname):
            self._extract()
@ -133,7 +132,7 @@ class Esri2020(RasterDataset):

    def _extract(self) -> None:
        """Extract the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        extract_archive(os.path.join(self.paths, self.zipfile))

    def plot(
--- a/torchgeo/datasets/eudem.py
+++ b/torchgeo/datasets/eudem.py
@ -5,7 +5,6 @@

 import glob
 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any, ClassVar

@ -117,7 +116,7 @@ class EUDEM(RasterDataset):
            return

        # Check if the zip files have already been downloaded
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, self.zipfile_glob)
        if glob.glob(pathname):
            for zipfile in glob.iglob(pathname):
--- a/torchgeo/datasets/eurocrops.py
+++ b/torchgeo/datasets/eurocrops.py
@ -5,7 +5,6 @@

 import csv
 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any

@ -140,7 +139,7 @@ class EuroCrops(VectorDataset):
        if self.files and not self.checksum:
            return True

-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)

        filepath = os.path.join(self.paths, self.hcat_fname)
        if not check_integrity(filepath, self.hcat_md5 if self.checksum else None):
@ -157,7 +156,7 @@ class EuroCrops(VectorDataset):
        if self._check_integrity():
            print('Files already downloaded and verified')
            return
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        download_url(
            self.base_url + self.hcat_fname,
            self.paths,
@ -179,7 +178,7 @@ class EuroCrops(VectorDataset):
                (defaults to all classes)
        """
        if not classes:
-            assert isinstance(self.paths, str | pathlib.Path)
+            assert isinstance(self.paths, str | os.PathLike)
            classes = []
            filepath = os.path.join(self.paths, self.hcat_fname)
            with open(filepath) as f:
--- a/torchgeo/datasets/geo.py
+++ b/torchgeo/datasets/geo.py
@ -8,7 +8,6 @@ import fnmatch
 import functools
 import glob
 import os
-import pathlib
 import re
 import sys
 import warnings
@ -300,7 +299,7 @@ class GeoDataset(Dataset[dict[str, Any]], abc.ABC):
        .. versionadded:: 0.5
        """
        # Make iterable
-        if isinstance(self.paths, str | pathlib.Path):
+        if isinstance(self.paths, str | os.PathLike):
            paths: Iterable[Path] = [self.paths]
        else:
            paths = self.paths
@ -521,7 +520,7 @@ class RasterDataset(GeoDataset):
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
-        filepaths = cast(list[Path], [hit.object for hit in hits])
+        filepaths = cast(list[str], [hit.object for hit in hits])

        if not filepaths:
            raise IndexError(
@ -564,7 +563,7 @@ class RasterDataset(GeoDataset):

    def _merge_files(
        self,
-        filepaths: Sequence[Path],
+        filepaths: Sequence[str],
        query: BoundingBox,
        band_indexes: Sequence[int] | None = None,
    ) -> Tensor:
--- a/torchgeo/datasets/globbiomass.py
+++ b/torchgeo/datasets/globbiomass.py
@ -5,7 +5,6 @@

 import glob
 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any, ClassVar, cast

@ -193,7 +192,7 @@ class GlobBiomass(RasterDataset):
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
-        filepaths = cast(list[Path], [hit.object for hit in hits])
+        filepaths = cast(list[str], [hit.object for hit in hits])

        if not filepaths:
            raise IndexError(
@ -221,7 +220,7 @@ class GlobBiomass(RasterDataset):
            return

        # Check if the zip files have already been downloaded
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, f'*_{self.measurement}.zip')
        if glob.glob(pathname):
            for zipfile in glob.iglob(pathname):
--- a/torchgeo/datasets/l7irish.py
+++ b/torchgeo/datasets/l7irish.py
@ -5,7 +5,6 @@

 import glob
 import os
-import pathlib
 import re
 from collections.abc import Callable, Iterable, Sequence
 from typing import Any, ClassVar, cast
@ -94,7 +93,7 @@ class L7IrishMask(RasterDataset):
        filename_regex = re.compile(L7IrishImage.filename_regex, re.VERBOSE)
        index = Index(interleaved=False, properties=Property(dimension=3))
        for hit in self.index.intersection(self.index.bounds, objects=True):
-            dirname = os.path.dirname(cast(Path, hit.object))
+            dirname = os.path.dirname(cast(str, hit.object))
            image = glob.glob(os.path.join(dirname, L7IrishImage.filename_glob))[0]
            minx, maxx, miny, maxy, mint, maxt = hit.bounds
            if match := re.match(filename_regex, os.path.basename(image)):
@ -229,7 +228,7 @@ class L7Irish(IntersectionDataset):
    def _verify(self) -> None:
        """Verify the integrity of the dataset."""
        # Check if the extracted files already exist
-        if not isinstance(self.paths, str | pathlib.Path):
+        if not isinstance(self.paths, str | os.PathLike):
            return

        for classname in [L7IrishImage, L7IrishMask]:
@ -262,7 +261,7 @@ class L7Irish(IntersectionDataset):

    def _extract(self) -> None:
        """Extract the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, '*.tar.gz')
        for tarfile in glob.iglob(pathname):
            extract_archive(tarfile)
--- a/torchgeo/datasets/l8biome.py
+++ b/torchgeo/datasets/l8biome.py
@ -5,7 +5,6 @@

 import glob
 import os
-import pathlib
 from collections.abc import Callable, Iterable, Sequence
 from typing import Any, ClassVar

@ -174,7 +173,7 @@ class L8Biome(IntersectionDataset):
    def _verify(self) -> None:
        """Verify the integrity of the dataset."""
        # Check if the extracted files already exist
-        if not isinstance(self.paths, str | pathlib.Path):
+        if not isinstance(self.paths, str | os.PathLike):
            return

        for classname in [L8BiomeImage, L8BiomeMask]:
@ -207,7 +206,7 @@ class L8Biome(IntersectionDataset):

    def _extract(self) -> None:
        """Extract the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, '*.tar.gz')
        for tarfile in glob.iglob(pathname):
            extract_archive(tarfile)
--- a/torchgeo/datasets/landcoverai.py
+++ b/torchgeo/datasets/landcoverai.py
@ -254,7 +254,7 @@ class LandCoverAIGeo(LandCoverAIBase, RasterDataset):
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
-        img_filepaths = cast(list[Path], [hit.object for hit in hits])
+        img_filepaths = cast(list[str], [hit.object for hit in hits])
        mask_filepaths = [
            str(path).replace('images', 'masks') for path in img_filepaths
        ]
--- a/torchgeo/datasets/nlcd.py
+++ b/torchgeo/datasets/nlcd.py
@ -5,7 +5,6 @@

 import glob
 import os
-import pathlib
 from collections.abc import Callable, Iterable
 from typing import Any, ClassVar

@ -192,7 +191,7 @@ class NLCD(RasterDataset):
        exists = []
        for year in self.years:
            zipfile_year = self.zipfile_glob.replace('*', str(year), 1)
-            assert isinstance(self.paths, str | pathlib.Path)
+            assert isinstance(self.paths, str | os.PathLike)
            pathname = os.path.join(self.paths, '**', zipfile_year)
            if glob.glob(pathname, recursive=True):
                exists.append(True)
@ -224,7 +223,7 @@ class NLCD(RasterDataset):
        """Extract the dataset."""
        for year in self.years:
            zipfile_name = self.zipfile_glob.replace('*', str(year), 1)
-            assert isinstance(self.paths, str | pathlib.Path)
+            assert isinstance(self.paths, str | os.PathLike)
            pathname = os.path.join(self.paths, '**', zipfile_name)
            extract_archive(glob.glob(pathname, recursive=True)[0], self.paths)

--- a/torchgeo/datasets/openbuildings.py
+++ b/torchgeo/datasets/openbuildings.py
@ -6,7 +6,6 @@
 import glob
 import json
 import os
-import pathlib
 import sys
 from collections.abc import Callable, Iterable
 from typing import Any, ClassVar, cast
@ -242,7 +241,7 @@ class OpenBuildings(VectorDataset):
        # Create an R-tree to index the dataset using the polygon centroid as bounds
        self.index = Index(interleaved=False, properties=Property(dimension=3))

-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        with open(os.path.join(self.paths, 'tiles.geojson')) as f:
            data = json.load(f)

@ -305,7 +304,7 @@ class OpenBuildings(VectorDataset):
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
-        filepaths = cast(list[Path], [hit.object for hit in hits])
+        filepaths = cast(list[str], [hit.object for hit in hits])

        if not filepaths:
            raise IndexError(
@ -336,7 +335,7 @@ class OpenBuildings(VectorDataset):
        return sample

    def _filter_geometries(
-        self, query: BoundingBox, filepaths: list[Path]
+        self, query: BoundingBox, filepaths: list[str]
    ) -> list[dict[str, Any]]:
        """Filters a df read from the polygon csv file based on query and conf thresh.

@ -398,7 +397,7 @@ class OpenBuildings(VectorDataset):
    def _verify(self) -> None:
        """Verify the integrity of the dataset."""
        # Check if the zip files have already been downloaded and checksum
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, self.zipfile_glob)
        i = 0
        for zipfile in glob.iglob(pathname):
--- a/torchgeo/datasets/south_africa_crop_type.py
+++ b/torchgeo/datasets/south_africa_crop_type.py
@ -4,7 +4,6 @@
 """South Africa Crop Type Competition Dataset."""

 import os
-import pathlib
 import re
 from collections.abc import Callable, Iterable, Sequence
 from typing import Any, ClassVar, cast
@ -161,11 +160,11 @@ class SouthAfricaCropType(RasterDataset):
        Returns:
            data and labels at that index
        """
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)

        # Get all files matching the given query
        hits = self.index.intersection(tuple(query), objects=True)
-        filepaths = cast(list[Path], [hit.object for hit in hits])
+        filepaths = cast(list[str], [hit.object for hit in hits])

        if not filepaths:
            raise IndexError(
@ -253,7 +252,7 @@ class SouthAfricaCropType(RasterDataset):

    def _download(self) -> None:
        """Download the dataset."""
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)
        os.makedirs(self.paths, exist_ok=True)
        azcopy = which('azcopy')
        azcopy('sync', f'{self.url}', self.paths, '--recursive=true')
--- a/torchgeo/datasets/south_america_soybean.py
+++ b/torchgeo/datasets/south_america_soybean.py
@ -3,7 +3,7 @@

 """South America Soybean Dataset."""

-import pathlib
+import os
 from collections.abc import Callable, Iterable
 from typing import Any, ClassVar

@ -113,7 +113,7 @@ class SouthAmericaSoybean(RasterDataset):
        # Check if the extracted files already exist
        if self.files:
            return
-        assert isinstance(self.paths, str | pathlib.Path)
+        assert isinstance(self.paths, str | os.PathLike)

        # Check if the user requested to download the dataset
        if not self.download:
--- a/torchgeo/datasets/utils.py
+++ b/torchgeo/datasets/utils.py
@ -10,7 +10,6 @@ import collections
 import contextlib
 import importlib
 import os
-import pathlib
 import shutil
 import subprocess
 import sys
@ -42,7 +41,7 @@ __all__ = (
 )


-Path: TypeAlias = str | pathlib.Path
+Path: TypeAlias = str | os.PathLike[str]


@dataclass(frozen=True)