Feature/refactor vector root to paths (#1597)

* Make RasterDataset accept list of files

* Fix check if str

* Use isdir and isfile

* Rename root to paths and update type hint (usage sketch below)

* Update RasterDataset children's methods that use root

* Fix check to cast str to list

* Update conf files for RasterDatasets

* Add initial suggested test

* Add workaround for lists in LandCoverAIBase

* Add method handle_nonlocal_path for users to override

* Raise RuntimeError to support existing tests

* Remove redundant cast to set

* Remove required os.exists for paths

* Revert "Remove required os.exists for paths"

This reverts commit 84bf62b944326c33d5ba8efdcab615c65b124792.

* Use arg as positional argument, not kwarg

* Improve comments and logs about arg paths

* Remove misleading comment

* Change type hint of 'paths' to Iterable

* Change type hint of 'paths' to Iterable

* Remove premature handling of non-local paths

* Replace root with paths in docstrings

* Add versionadded to list_files docstring

* Add versionchanged to docstrings

* Update type of paths in children of RasterDataset

* Replace docstring for paths in all raster datasets

* Swap root with paths for conf files for raster

* Add newline before versionchanged

* Revert name to root in conf for ChesapeakeCVPR

* Simplify EUDEM tests

* paths must be a string if you want autodownload support

* Convert list_files to a property

* Fix type hints

* Test with a real empty directory

* Move property `files` up to GeoDataset

* Rename root to paths for VectorDataset

* Fix mypy

* Fix tests

* Delete duplicate code

* Delete duplicate code

* Fix test coverage

* Document name change
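
A hedged usage sketch of the renamed argument, assuming torchgeo 0.5 and
hypothetical local paths (any VectorDataset subclass behaves the same):

```python
from torchgeo.datasets import CanadianBuildingFootprints

# Before this PR: a single root directory
# ds = CanadianBuildingFootprints(root="data")

# After: one or more directories to search, or individual files to load
ds = CanadianBuildingFootprints("data")
ds = CanadianBuildingFootprints(["data/part1", "data/part2"])
```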

---------

Co-authored-by: Adrian Tofting <adriantofting@mobmob14994.hq.k.grp>
Co-authored-by: Adrian Tofting <adrian@vake.ai>
Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com>
This commit is contained in:
Adrian Tofting 2023-09-29 19:39:04 +02:00 committed by GitHub
Parent 6ae0d78448
Commit 3532f78383
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 85 additions and 79 deletions

View file

@@ -61,7 +61,7 @@ class TestCanadianBuildingFootprints:
         assert isinstance(ds, UnionDataset)
 
     def test_already_downloaded(self, dataset: CanadianBuildingFootprints) -> None:
-        CanadianBuildingFootprints(root=dataset.root, download=True)
+        CanadianBuildingFootprints(dataset.paths, download=True)
 
     def test_plot(self, dataset: CanadianBuildingFootprints) -> None:
         query = dataset.bounds

View file

@@ -141,7 +141,7 @@ class TestChesapeakeCVPR:
         )
         monkeypatch.setattr(
             ChesapeakeCVPR,
-            "files",
+            "_files",
             ["de_1m_2013_extended-debuffered-test_tiles", "spatial_index.geojson"],
         )
         root = str(tmp_path)

View file

@@ -47,7 +47,7 @@ class TestEnviroAtlas:
         )
         monkeypatch.setattr(
             EnviroAtlas,
-            "files",
+            "_files",
             ["pittsburgh_pa-2010_1m-train_tiles-debuffered", "spatial_index.geojson"],
         )
         root = str(tmp_path)

View file

@@ -37,7 +37,7 @@ class TestOpenBuildings:
         monkeypatch.setattr(OpenBuildings, "md5s", md5s)
         transforms = nn.Identity()
-        return OpenBuildings(root=root, transforms=transforms)
+        return OpenBuildings(root, transforms=transforms)
 
     def test_no_shapes_to_rasterize(
         self, dataset: OpenBuildings, tmp_path: Path
@@ -61,19 +61,19 @@ class TestOpenBuildings:
         with pytest.raises(
             RuntimeError, match="have manually downloaded the dataset as suggested "
         ):
-            OpenBuildings(root=false_root)
+            OpenBuildings(false_root)
 
     def test_corrupted(self, dataset: OpenBuildings, tmp_path: Path) -> None:
         with open(os.path.join(tmp_path, "000_buildings.csv.gz"), "w") as f:
             f.write("bad")
         with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
-            OpenBuildings(dataset.root, checksum=True)
+            OpenBuildings(dataset.paths, checksum=True)
 
     def test_no_meta_data_found(self, tmp_path: Path) -> None:
         false_root = os.path.join(tmp_path, "empty")
         os.makedirs(false_root)
         with pytest.raises(FileNotFoundError, match="Meta data file"):
-            OpenBuildings(root=false_root)
+            OpenBuildings(false_root)
 
     def test_nothing_in_index(self, dataset: OpenBuildings, tmp_path: Path) -> None:
         # change meta data to another 'title_url' so that there is no match found
@@ -85,7 +85,7 @@ class TestOpenBuildings:
             json.dump(content, f)
 
         with pytest.raises(FileNotFoundError, match="data was found in"):
-            OpenBuildings(dataset.root)
+            OpenBuildings(dataset.paths)
 
     def test_getitem(self, dataset: OpenBuildings) -> None:
         x = dataset[dataset.bounds]

View file

@@ -4,7 +4,8 @@
 """Canadian Building Footprints dataset."""
 
 import os
-from typing import Any, Callable, Optional
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union
 
 import matplotlib.pyplot as plt
 from matplotlib.figure import Figure
@@ -60,7 +61,7 @@ class CanadianBuildingFootprints(VectorDataset):
 
     def __init__(
         self,
-        root: str = "data",
+        paths: Union[str, Iterable[str]] = "data",
         crs: Optional[CRS] = None,
         res: float = 0.00001,
         transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None,
@@ -70,7 +71,7 @@ class CanadianBuildingFootprints(VectorDataset):
         """Initialize a new Dataset instance.
 
         Args:
-            root: root directory where dataset can be found
+            paths: one or more root directories to search or files to load
             crs: :term:`coordinate reference system (CRS)` to warp to
                 (defaults to the CRS of the first file found)
             res: resolution of the dataset in units of CRS
@@ -83,8 +84,11 @@ class CanadianBuildingFootprints(VectorDataset):
             FileNotFoundError: if no files are found in ``root``
             RuntimeError: if ``download=False`` and data is not found, or
                 ``checksum=True`` and checksums don't match
+
+        .. versionchanged:: 0.5
+           *root* was renamed to *paths*.
         """
-        self.root = root
+        self.paths = paths
         self.checksum = checksum
 
         if download:
@@ -96,7 +100,7 @@
                 + "You can use download=True to download it"
             )
 
-        super().__init__(root, crs, res, transforms)
+        super().__init__(paths, crs, res, transforms)
 
     def _check_integrity(self) -> bool:
         """Check integrity of dataset.
@@ -104,8 +108,9 @@
         Returns:
             True if dataset files are found and/or MD5s match, else False
         """
+        assert isinstance(self.paths, str)
         for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
-            filepath = os.path.join(self.root, prov_terr + ".zip")
+            filepath = os.path.join(self.paths, prov_terr + ".zip")
             if not check_integrity(filepath, md5 if self.checksum else None):
                 return False
         return True
@@ -115,11 +120,11 @@
         if self._check_integrity():
             print("Files already downloaded and verified")
             return
+        assert isinstance(self.paths, str)
         for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
             download_and_extract_archive(
                 self.url + prov_terr + ".zip",
-                self.root,
+                self.paths,
                 md5=md5 if self.checksum else None,
             )
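
For context (not part of the diff): the `assert isinstance(self.paths, str)` guards above mean automatic download still expects a single string directory, matching the "paths must be a string if you want autodownload support" commit. A hedged sketch with hypothetical directories:

```python
from torchgeo.datasets import CanadianBuildingFootprints

# Auto-download: paths must be a single string directory
ds = CanadianBuildingFootprints("data/cbf", download=True, checksum=True)

# Once the files are on disk, a list of directories or files also works
ds = CanadianBuildingFootprints(["data/cbf", "data/cbf_backup"])
```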

View file

@@ -495,7 +495,7 @@ class ChesapeakeCVPR(GeoDataset):
     )
 
     # these are used to check the integrity of the dataset
-    files = [
+    _files = [
         "de_1m_2013_extended-debuffered-test_tiles",
         "de_1m_2013_extended-debuffered-train_tiles",
         "de_1m_2013_extended-debuffered-val_tiles",
@@ -704,7 +704,7 @@
             return os.path.exists(os.path.join(self.root, filename))
 
         # Check if the extracted files already exist
-        if all(map(exists, self.files)):
+        if all(map(exists, self._files)):
             return
 
         # Check if the zip files have already been downloaded

View file

@@ -80,7 +80,7 @@ class EnviroAtlas(GeoDataset):
     )
 
     # these are used to check the integrity of the dataset
-    files = [
+    _files = [
         "austin_tx-2012_1m-test_tiles-debuffered",
         "austin_tx-2012_1m-val5_tiles-debuffered",
         "durham_nc-2012_1m-test_tiles-debuffered",
@@ -422,7 +422,7 @@
             return os.path.exists(os.path.join(self.root, "enviroatlas_lotp", filename))
 
         # Check if the extracted files already exist
-        if all(map(exists, self.files)):
+        if all(map(exists, self._files)):
             return
 
         # Check if the zip files have already been downloaded

View file

@@ -72,9 +72,17 @@ class GeoDataset(Dataset[dict[str, Any]], abc.ABC):
         dataset = landsat7 | landsat8
     """
 
+    paths: Union[str, Iterable[str]]
     _crs = CRS.from_epsg(4326)
     _res = 0.0
 
+    #: Glob expression used to search for files.
+    #:
+    #: This expression should be specific enough that it will not pick up files from
+    #: other datasets. It should not include a file extension, as the dataset may be in
+    #: a different file format than what it was originally downloaded as.
+    filename_glob = "*"
+
     # NOTE: according to the Python docs:
     #
     # * https://docs.python.org/3/library/exceptions.html#NotImplementedError
@@ -269,17 +277,36 @@
         print(f"Converting {self.__class__.__name__} res from {self.res} to {new_res}")
         self._res = new_res
 
+    @property
+    def files(self) -> set[str]:
+        """A list of all files in the dataset.
+
+        Returns:
+            All files in the dataset.
+
+        .. versionadded:: 0.5
+        """
+        # Make iterable
+        if isinstance(self.paths, str):
+            paths: Iterable[str] = [self.paths]
+        else:
+            paths = self.paths
+
+        # Using set to remove any duplicates if directories are overlapping
+        files: set[str] = set()
+        for path in paths:
+            if os.path.isdir(path):
+                pathname = os.path.join(path, "**", self.filename_glob)
+                files |= set(glob.iglob(pathname, recursive=True))
+            else:
+                files.add(path)
+
+        return files
+
 
 class RasterDataset(GeoDataset):
     """Abstract base class for :class:`GeoDataset` stored as raster files."""
 
-    #: Glob expression used to search for files.
-    #:
-    #: This expression should be specific enough that it will not pick up files from
-    #: other datasets. It should not include a file extension, as the dataset may be in
-    #: a different file format than what it was originally downloaded as.
-    filename_glob = "*"
-
     #: Regular expression used to extract date from filename.
     #:
     #: The expression should use named groups. The expression may contain any number of
@@ -423,32 +450,6 @@
         self._crs = cast(CRS, crs)
         self._res = cast(float, res)
 
-    @property
-    def files(self) -> set[str]:
-        """A list of all files in the dataset.
-
-        Returns:
-            All files in the dataset.
-
-        .. versionadded:: 0.5
-        """
-        # Make iterable
-        if isinstance(self.paths, str):
-            paths: Iterable[str] = [self.paths]
-        else:
-            paths = self.paths
-
-        # Using set to remove any duplicates if directories are overlapping
-        files: set[str] = set()
-        for path in paths:
-            if os.path.isdir(path):
-                pathname = os.path.join(path, "**", self.filename_glob)
-                files |= set(glob.iglob(pathname, recursive=True))
-            else:
-                files.add(path)
-
-        return files
-
     def __getitem__(self, query: BoundingBox) -> dict[str, Any]:
         """Retrieve image/mask and metadata indexed by query.
@@ -571,16 +572,9 @@
 class VectorDataset(GeoDataset):
     """Abstract base class for :class:`GeoDataset` stored as vector files."""
 
-    #: Glob expression used to search for files.
-    #:
-    #: This expression should be specific enough that it will not pick up files from
-    #: other datasets. It should not include a file extension, as the dataset may be in
-    #: a different file format than what it was originally downloaded as.
-    filename_glob = "*"
-
     def __init__(
         self,
-        root: str = "data",
+        paths: Union[str, Iterable[str]] = "data",
         crs: Optional[CRS] = None,
         res: float = 0.0001,
         transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None,
@@ -589,7 +583,7 @@
         """Initialize a new Dataset instance.
 
         Args:
-            root: root directory where dataset can be found
+            paths: one or more root directories to search or files to load
             crs: :term:`coordinate reference system (CRS)` to warp to
                 (defaults to the CRS of the first file found)
             res: resolution of the dataset in units of CRS
@@ -603,16 +597,18 @@
 
         .. versionadded:: 0.4
            The *label_name* parameter.
+
+        .. versionchanged:: 0.5
+           *root* was renamed to *paths*.
         """
         super().__init__(transforms)
 
-        self.root = root
+        self.paths = paths
         self.label_name = label_name
 
         # Populate the dataset index
         i = 0
-        pathname = os.path.join(root, "**", self.filename_glob)
-        for filepath in glob.iglob(pathname, recursive=True):
+        for filepath in self.files:
             try:
                 with fiona.open(filepath) as src:
                     if crs is None:
@@ -633,7 +629,7 @@
                 i += 1
 
         if i == 0:
-            msg = f"No {self.__class__.__name__} data was found in `root='{root}'`"
+            msg = f"No {self.__class__.__name__} data was found in `root='{paths}'`"
             raise FileNotFoundError(msg)
 
         self._crs = crs
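
For context (not part of the diff): a hedged sketch of how the new `GeoDataset.files` property behaves, assuming torchgeo 0.5 and hypothetical paths. Directories are searched recursively with `filename_glob`, plain file paths are kept as-is, and the `set` return type removes duplicates when directories overlap:

```python
from torchgeo.datasets import Sentinel2  # any RasterDataset subclass

ds = Sentinel2(paths=["/data/s2/2022", "/data/s2", "/data/s2/scene.tif"])
# Files found under both /data/s2 and /data/s2/2022 appear only once
print(sorted(ds.files))
```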

View file

@@ -7,7 +7,8 @@ import glob
 import json
 import os
 import sys
-from typing import Any, Callable, Optional, cast
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union, cast
 
 import fiona
 import fiona.transform
@@ -205,7 +206,7 @@ class OpenBuildings(VectorDataset):
 
     def __init__(
         self,
-        root: str = "data",
+        paths: Union[str, Iterable[str]] = "data",
        crs: Optional[CRS] = None,
        res: float = 0.0001,
        transforms: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None,
@@ -214,7 +215,7 @@
         """Initialize a new Dataset instance.
 
         Args:
-            root: root directory where dataset can be found
+            paths: one or more root directories to search or files to load
             crs: :term:`coordinate reference system (CRS)` to warp to
                 (defaults to the CRS of the first file found)
             res: resolution of the dataset in units of CRS
@@ -224,11 +225,13 @@
 
         Raises:
             FileNotFoundError: if no files are found in ``root``
+
+        .. versionchanged:: 0.5
+           *root* was renamed to *paths*.
         """
-        self.root = root
+        self.paths = paths
         self.res = res
         self.checksum = checksum
-        self.root = root
-        self.res = res
         self.transforms = transforms
@@ -237,7 +240,8 @@
 
         # Create an R-tree to index the dataset using the polygon centroid as bounds
         self.index = Index(interleaved=False, properties=Property(dimension=3))
-        with open(os.path.join(root, "tiles.geojson")) as f:
+        assert isinstance(self.paths, str)
+        with open(os.path.join(self.paths, "tiles.geojson")) as f:
             data = json.load(f)
 
         features = data["features"]
@@ -245,7 +249,7 @@
             feature["properties"]["tile_url"].split("/")[-1] for feature in features
         ]  # get csv filename
 
-        polygon_files = glob.glob(os.path.join(self.root, self.zipfile_glob))
+        polygon_files = glob.glob(os.path.join(self.paths, self.zipfile_glob))
         polygon_filenames = [f.split(os.sep)[-1] for f in polygon_files]
 
         matched_features = [
@@ -274,14 +278,14 @@
                 coords = (minx, maxx, miny, maxy, mint, maxt)
                 filepath = os.path.join(
-                    self.root, feature["properties"]["tile_url"].split("/")[-1]
+                    self.paths, feature["properties"]["tile_url"].split("/")[-1]
                 )
                 self.index.insert(i, coords, filepath)
                 i += 1
 
         if i == 0:
             raise FileNotFoundError(
-                f"No {self.__class__.__name__} data was found in '{self.root}'"
+                f"No {self.__class__.__name__} data was found in '{self.paths}'"
             )
 
         self._crs = crs
@@ -398,7 +402,8 @@
             FileNotFoundError: if metadata file is not found in root
         """
         # Check if the zip files have already been downloaded and checksum
-        pathname = os.path.join(self.root, self.zipfile_glob)
+        assert isinstance(self.paths, str)
+        pathname = os.path.join(self.paths, self.zipfile_glob)
         i = 0
         for zipfile in glob.iglob(pathname):
             filename = os.path.basename(zipfile)
@@ -410,14 +415,14 @@
             return
 
         # check if the metadata file has been downloaded
-        if not os.path.exists(os.path.join(self.root, self.meta_data_filename)):
+        if not os.path.exists(os.path.join(self.paths, self.meta_data_filename)):
             raise FileNotFoundError(
                 f"Meta data file {self.meta_data_filename} "
-                f"not found in in `root={self.root}`."
+                f"not found in in `root={self.paths}`."
             )
 
         raise RuntimeError(
-            f"Dataset not found in `root={self.root}` "
+            f"Dataset not found in `root={self.paths}` "
             "either specify a different `root` directory or make sure you "
             "have manually downloaded the dataset as suggested in the documentation."
         )