Feature/refactor vector root to paths (#1597)

* Make RasterDataset accept list of files

* Fix check if str

* Use isdir and isfile

* Rename root to paths and update type hint

* Update children of RasterDataset methods using root

* Fix check to cast str to list

* Update conf files for RasterDatasets

* Add initial suggested test

* Add workaround for lists in LandCoverAIBase

* Add method handle_nonlocal_path for users to override

* Raise RuntimeError to support existing tests

* Remove redundant cast to set

* Remove required os.exists for paths

* Revert "Remove required os.exists for paths"

This reverts commit 84bf62b944326c33d5ba8efdcab615c65b124792.

* Use arg as positional argument, not kwarg

* Improve comments and logs about arg paths

* Remove misleading comment

* Change type hint of 'paths' to Iterable

* Change type hint of 'paths' to Iterable

* Remove premature handling of non-local paths

* Replace root with paths in docstrings

* Add versionadded to list_files docstring

* Add versionchanged to docstrings

* Update type of paths in children of RasterDataset

* Replace docstring for paths in all raster datasets

* Swap root with paths for conf files for raster

* Add newline before versionchanged

* Revert name to root in conf for ChesapeakeCVPR

* Simplify EUDEM tests

* paths must be a string if you want autodownload support

* Convert list_files to a property

* Fix type hints

* Test with a real empty directory

* Move property `files` up to GeoDataset

* Rename root to paths for VectorDataset

* Fix mypy

* Fix tests

* Delete duplicate code

* Delete duplicate code

* Fix test coverage

* Document name change
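In short, the user-facing change: *root* became *paths*, accepting either a single string or an iterable of directories and files. A minimal sketch of the new call pattern (assuming torchgeo >= 0.5; the dataset choice and file names are illustrative only, and auto-download still requires a plain string root):

    from torchgeo.datasets import Sentinel2

    # A single root directory keeps working, now passed positionally:
    ds = Sentinel2("data/sentinel2")

    # New in 0.5: mix directories (searched recursively) and explicit files:
    ds = Sentinel2(paths=["data/sentinel2", "scenes/T32TNS_B02_10m.tif"])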

---------

Co-authored-by: Adrian Tofting <adriantofting@mobmob14994.hq.k.grp>
Co-authored-by: Adrian Tofting <adrian@vake.ai>
Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com>
This commit is contained in:
Adrian Tofting 2023-09-29 19:39:04 +02:00 committed by GitHub
Parent 6ae0d78448
Commit 3532f78383
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 85 additions and 79 deletions

View file

@@ -61,7 +61,7 @@ class TestCanadianBuildingFootprints:
         assert isinstance(ds, UnionDataset)
 
     def test_already_downloaded(self, dataset: CanadianBuildingFootprints) -> None:
-        CanadianBuildingFootprints(root=dataset.root, download=True)
+        CanadianBuildingFootprints(dataset.paths, download=True)
 
     def test_plot(self, dataset: CanadianBuildingFootprints) -> None:
         query = dataset.bounds

View file

@@ -141,7 +141,7 @@ class TestChesapeakeCVPR:
         )
         monkeypatch.setattr(
             ChesapeakeCVPR,
-            "files",
+            "_files",
             ["de_1m_2013_extended-debuffered-test_tiles", "spatial_index.geojson"],
         )
         root = str(tmp_path)

View file

@@ -47,7 +47,7 @@ class TestEnviroAtlas:
         )
         monkeypatch.setattr(
             EnviroAtlas,
-            "files",
+            "_files",
             ["pittsburgh_pa-2010_1m-train_tiles-debuffered", "spatial_index.geojson"],
         )
         root = str(tmp_path)

View file

@@ -37,7 +37,7 @@ class TestOpenBuildings:
         monkeypatch.setattr(OpenBuildings, "md5s", md5s)
         transforms = nn.Identity()
-        return OpenBuildings(root=root, transforms=transforms)
+        return OpenBuildings(root, transforms=transforms)
 
     def test_no_shapes_to_rasterize(
         self, dataset: OpenBuildings, tmp_path: Path
@@ -61,19 +61,19 @@ class TestOpenBuildings:
         with pytest.raises(
             RuntimeError, match="have manually downloaded the dataset as suggested "
         ):
-            OpenBuildings(root=false_root)
+            OpenBuildings(false_root)
 
     def test_corrupted(self, dataset: OpenBuildings, tmp_path: Path) -> None:
         with open(os.path.join(tmp_path, "000_buildings.csv.gz"), "w") as f:
             f.write("bad")
         with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
-            OpenBuildings(dataset.root, checksum=True)
+            OpenBuildings(dataset.paths, checksum=True)
 
     def test_no_meta_data_found(self, tmp_path: Path) -> None:
         false_root = os.path.join(tmp_path, "empty")
         os.makedirs(false_root)
         with pytest.raises(FileNotFoundError, match="Meta data file"):
-            OpenBuildings(root=false_root)
+            OpenBuildings(false_root)
 
     def test_nothing_in_index(self, dataset: OpenBuildings, tmp_path: Path) -> None:
         # change meta data to another 'title_url' so that there is no match found
@@ -85,7 +85,7 @@ class TestOpenBuildings:
             json.dump(content, f)
 
         with pytest.raises(FileNotFoundError, match="data was found in"):
-            OpenBuildings(dataset.root)
+            OpenBuildings(dataset.paths)
 
     def test_getitem(self, dataset: OpenBuildings) -> None:
         x = dataset[dataset.bounds]

View file

@@ -4,7 +4,8 @@
 """Canadian Building Footprints dataset."""
 
 import os
-from typing import Any, Callable, Optional
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union
 
 import matplotlib.pyplot as plt
 from matplotlib.figure import Figure
@@ -60,7 +61,7 @@ class CanadianBuildingFootprints(VectorDataset):
 
     def __init__(
         self,
-        root: str = "data",
+        paths: Union[str, Iterable[str]] = "data",
         crs: Optional[CRS] = None,
         res: float = 0.00001,
         transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None,
@@ -70,7 +71,7 @@ class CanadianBuildingFootprints(VectorDataset):
         """Initialize a new Dataset instance.
 
         Args:
-            root: root directory where dataset can be found
+            paths: one or more root directories to search or files to load
             crs: :term:`coordinate reference system (CRS)` to warp to
                 (defaults to the CRS of the first file found)
             res: resolution of the dataset in units of CRS
@@ -83,8 +84,11 @@ class CanadianBuildingFootprints(VectorDataset):
             FileNotFoundError: if no files are found in ``root``
             RuntimeError: if ``download=False`` and data is not found, or
                 ``checksum=True`` and checksums don't match
+
+        .. versionchanged:: 0.5
+           *root* was renamed to *paths*.
         """
-        self.root = root
+        self.paths = paths
         self.checksum = checksum
 
         if download:
@@ -96,7 +100,7 @@ class CanadianBuildingFootprints(VectorDataset):
                 + "You can use download=True to download it"
             )
 
-        super().__init__(root, crs, res, transforms)
+        super().__init__(paths, crs, res, transforms)
 
     def _check_integrity(self) -> bool:
         """Check integrity of dataset.
@@ -104,8 +108,9 @@ class CanadianBuildingFootprints(VectorDataset):
         Returns:
             True if dataset files are found and/or MD5s match, else False
         """
+        assert isinstance(self.paths, str)
        for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
-            filepath = os.path.join(self.root, prov_terr + ".zip")
+            filepath = os.path.join(self.paths, prov_terr + ".zip")
             if not check_integrity(filepath, md5 if self.checksum else None):
                 return False
         return True
@@ -115,11 +120,11 @@ class CanadianBuildingFootprints(VectorDataset):
         if self._check_integrity():
             print("Files already downloaded and verified")
             return
-
+        assert isinstance(self.paths, str)
         for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
             download_and_extract_archive(
                 self.url + prov_terr + ".zip",
-                self.root,
+                self.paths,
                 md5=md5 if self.checksum else None,
             )
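The two assert isinstance(self.paths, str) additions encode the rule noted in the commit list above: checksum verification and auto-download still assume a single root directory. A hedged illustration of the resulting behavior (inferred from this diff, not from running the code):

    from torchgeo.datasets import CanadianBuildingFootprints

    CanadianBuildingFootprints("data", download=True)      # fine: str root
    CanadianBuildingFootprints(["a", "b"], download=True)  # fails the assert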

View file

@@ -495,7 +495,7 @@ class ChesapeakeCVPR(GeoDataset):
     )
 
     # these are used to check the integrity of the dataset
-    files = [
+    _files = [
         "de_1m_2013_extended-debuffered-test_tiles",
         "de_1m_2013_extended-debuffered-train_tiles",
         "de_1m_2013_extended-debuffered-val_tiles",
@@ -704,7 +704,7 @@ class ChesapeakeCVPR(GeoDataset):
             return os.path.exists(os.path.join(self.root, filename))
 
         # Check if the extracted files already exist
-        if all(map(exists, self.files)):
+        if all(map(exists, self._files)):
             return
 
         # Check if the zip files have already been downloaded
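Note: renaming the class attribute from files to _files keeps this integrity checklist from shadowing the new GeoDataset.files property; the same reasoning applies to EnviroAtlas below and to the monkeypatched attribute in the tests above.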

View file

@@ -80,7 +80,7 @@ class EnviroAtlas(GeoDataset):
     )
 
     # these are used to check the integrity of the dataset
-    files = [
+    _files = [
         "austin_tx-2012_1m-test_tiles-debuffered",
         "austin_tx-2012_1m-val5_tiles-debuffered",
         "durham_nc-2012_1m-test_tiles-debuffered",
@@ -422,7 +422,7 @@ class EnviroAtlas(GeoDataset):
             return os.path.exists(os.path.join(self.root, "enviroatlas_lotp", filename))
 
         # Check if the extracted files already exist
-        if all(map(exists, self.files)):
+        if all(map(exists, self._files)):
             return
 
         # Check if the zip files have already been downloaded

View file

@@ -72,9 +72,17 @@ class GeoDataset(Dataset[dict[str, Any]], abc.ABC):
         dataset = landsat7 | landsat8
     """
 
+    paths: Union[str, Iterable[str]]
     _crs = CRS.from_epsg(4326)
     _res = 0.0
 
+    #: Glob expression used to search for files.
+    #:
+    #: This expression should be specific enough that it will not pick up files from
+    #: other datasets. It should not include a file extension, as the dataset may be in
+    #: a different file format than what it was originally downloaded as.
+    filename_glob = "*"
+
     # NOTE: according to the Python docs:
     #
     # * https://docs.python.org/3/library/exceptions.html#NotImplementedError
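Because filename_glob now lives on GeoDataset, raster and vector subclasses alike can narrow the file search by overriding it. A hypothetical override (the class name and pattern are invented for illustration):

    from torchgeo.datasets import RasterDataset

    class MyScenes(RasterDataset):
        # Match only this sensor's band files; no extension, per the note above.
        filename_glob = "MYSAT_*_B0*"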
@@ -269,17 +277,36 @@ class GeoDataset(Dataset[dict[str, Any]], abc.ABC):
         print(f"Converting {self.__class__.__name__} res from {self.res} to {new_res}")
         self._res = new_res
 
+    @property
+    def files(self) -> set[str]:
+        """A list of all files in the dataset.
+
+        Returns:
+            All files in the dataset.
+
+        .. versionadded:: 0.5
+        """
+        # Make iterable
+        if isinstance(self.paths, str):
+            paths: Iterable[str] = [self.paths]
+        else:
+            paths = self.paths
+
+        # Using set to remove any duplicates if directories are overlapping
+        files: set[str] = set()
+        for path in paths:
+            if os.path.isdir(path):
+                pathname = os.path.join(path, "**", self.filename_glob)
+                files |= set(glob.iglob(pathname, recursive=True))
+            else:
+                files.add(path)
+
+        return files
+
 
 class RasterDataset(GeoDataset):
     """Abstract base class for :class:`GeoDataset` stored as raster files."""
 
-    #: Glob expression used to search for files.
-    #:
-    #: This expression should be specific enough that it will not pick up files from
-    #: other datasets. It should not include a file extension, as the dataset may be in
-    #: a different file format than what it was originally downloaded as.
-    filename_glob = "*"
-
     #: Regular expression used to extract date from filename.
     #:
     #: The expression should use named groups. The expression may contain any number of
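With the property hoisted to GeoDataset, any subclass can be pointed at several, possibly overlapping, roots without double-indexing, because duplicates collapse in the returned set. Continuing the hypothetical MyScenes sketch from above (paths invented):

    ds = MyScenes(paths=["/data", "/data", "/data/extra/MYSAT_1_B01.tif"])
    print(len(ds.files))  # each match counted once; the explicit file kept as-is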
@@ -423,32 +450,6 @@ class RasterDataset(GeoDataset):
         self._crs = cast(CRS, crs)
         self._res = cast(float, res)
 
-    @property
-    def files(self) -> set[str]:
-        """A list of all files in the dataset.
-
-        Returns:
-            All files in the dataset.
-
-        .. versionadded:: 0.5
-        """
-        # Make iterable
-        if isinstance(self.paths, str):
-            paths: Iterable[str] = [self.paths]
-        else:
-            paths = self.paths
-
-        # Using set to remove any duplicates if directories are overlapping
-        files: set[str] = set()
-        for path in paths:
-            if os.path.isdir(path):
-                pathname = os.path.join(path, "**", self.filename_glob)
-                files |= set(glob.iglob(pathname, recursive=True))
-            else:
-                files.add(path)
-
-        return files
-
     def __getitem__(self, query: BoundingBox) -> dict[str, Any]:
         """Retrieve image/mask and metadata indexed by query.
@@ -571,16 +572,9 @@ class RasterDataset(GeoDataset):
 
 class VectorDataset(GeoDataset):
     """Abstract base class for :class:`GeoDataset` stored as vector files."""
 
-    #: Glob expression used to search for files.
-    #:
-    #: This expression should be specific enough that it will not pick up files from
-    #: other datasets. It should not include a file extension, as the dataset may be in
-    #: a different file format than what it was originally downloaded as.
-    filename_glob = "*"
-
     def __init__(
         self,
-        root: str = "data",
+        paths: Union[str, Iterable[str]] = "data",
         crs: Optional[CRS] = None,
         res: float = 0.0001,
         transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None,
@@ -589,7 +583,7 @@ class VectorDataset(GeoDataset):
         """Initialize a new Dataset instance.
 
         Args:
-            root: root directory where dataset can be found
+            paths: one or more root directories to search or files to load
             crs: :term:`coordinate reference system (CRS)` to warp to
                 (defaults to the CRS of the first file found)
             res: resolution of the dataset in units of CRS
@@ -603,16 +597,18 @@ class VectorDataset(GeoDataset):
 
         .. versionadded:: 0.4
            The *label_name* parameter.
+
+        .. versionchanged:: 0.5
+           *root* was renamed to *paths*.
         """
         super().__init__(transforms)
 
-        self.root = root
+        self.paths = paths
         self.label_name = label_name
 
         # Populate the dataset index
         i = 0
-        pathname = os.path.join(root, "**", self.filename_glob)
-        for filepath in glob.iglob(pathname, recursive=True):
+        for filepath in self.files:
             try:
                 with fiona.open(filepath) as src:
                     if crs is None:
@@ -633,7 +629,7 @@ class VectorDataset(GeoDataset):
             i += 1
 
         if i == 0:
-            msg = f"No {self.__class__.__name__} data was found in `root='{root}'`"
+            msg = f"No {self.__class__.__name__} data was found in `root='{paths}'`"
             raise FileNotFoundError(msg)
 
         self._crs = crs

View file

@@ -7,7 +7,8 @@ import glob
 import json
 import os
 import sys
-from typing import Any, Callable, Optional, cast
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union, cast
 
 import fiona
 import fiona.transform
@@ -205,7 +206,7 @@ class OpenBuildings(VectorDataset):
 
     def __init__(
         self,
-        root: str = "data",
+        paths: Union[str, Iterable[str]] = "data",
         crs: Optional[CRS] = None,
         res: float = 0.0001,
         transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None,
@@ -214,7 +215,7 @@ class OpenBuildings(VectorDataset):
         """Initialize a new Dataset instance.
 
         Args:
-            root: root directory where dataset can be found
+            paths: one or more root directories to search or files to load
             crs: :term:`coordinate reference system (CRS)` to warp to
                 (defaults to the CRS of the first file found)
             res: resolution of the dataset in units of CRS
@@ -224,11 +225,13 @@ class OpenBuildings(VectorDataset):
 
         Raises:
             FileNotFoundError: if no files are found in ``root``
+
+        .. versionchanged:: 0.5
+           *root* was renamed to *paths*.
         """
-        self.root = root
+        self.paths = paths
         self.res = res
         self.checksum = checksum
-        self.root = root
         self.res = res
         self.transforms = transforms
 
@@ -237,7 +240,8 @@ class OpenBuildings(VectorDataset):
         # Create an R-tree to index the dataset using the polygon centroid as bounds
         self.index = Index(interleaved=False, properties=Property(dimension=3))
 
-        with open(os.path.join(root, "tiles.geojson")) as f:
+        assert isinstance(self.paths, str)
+        with open(os.path.join(self.paths, "tiles.geojson")) as f:
             data = json.load(f)
 
         features = data["features"]
@@ -245,7 +249,7 @@ class OpenBuildings(VectorDataset):
             feature["properties"]["tile_url"].split("/")[-1] for feature in features
         ]  # get csv filename
 
-        polygon_files = glob.glob(os.path.join(self.root, self.zipfile_glob))
+        polygon_files = glob.glob(os.path.join(self.paths, self.zipfile_glob))
         polygon_filenames = [f.split(os.sep)[-1] for f in polygon_files]
 
         matched_features = [
@@ -274,14 +278,14 @@ class OpenBuildings(VectorDataset):
             coords = (minx, maxx, miny, maxy, mint, maxt)
 
             filepath = os.path.join(
-                self.root, feature["properties"]["tile_url"].split("/")[-1]
+                self.paths, feature["properties"]["tile_url"].split("/")[-1]
             )
             self.index.insert(i, coords, filepath)
             i += 1
 
         if i == 0:
             raise FileNotFoundError(
-                f"No {self.__class__.__name__} data was found in '{self.root}'"
+                f"No {self.__class__.__name__} data was found in '{self.paths}'"
             )
 
         self._crs = crs
@@ -398,7 +402,8 @@ class OpenBuildings(VectorDataset):
             FileNotFoundError: if metadata file is not found in root
         """
         # Check if the zip files have already been downloaded and checksum
-        pathname = os.path.join(self.root, self.zipfile_glob)
+        assert isinstance(self.paths, str)
+        pathname = os.path.join(self.paths, self.zipfile_glob)
         i = 0
         for zipfile in glob.iglob(pathname):
             filename = os.path.basename(zipfile)
@@ -410,14 +415,14 @@ class OpenBuildings(VectorDataset):
             return
 
         # check if the metadata file has been downloaded
-        if not os.path.exists(os.path.join(self.root, self.meta_data_filename)):
+        if not os.path.exists(os.path.join(self.paths, self.meta_data_filename)):
             raise FileNotFoundError(
                 f"Meta data file {self.meta_data_filename} "
-                f"not found in in `root={self.root}`."
+                f"not found in in `root={self.paths}`."
             )
 
         raise RuntimeError(
-            f"Dataset not found in `root={self.root}` "
+            f"Dataset not found in `root={self.paths}` "
             "either specify a different `root` directory or make sure you "
             "have manually downloaded the dataset as suggested in the documentation."
         )