Add CanadianBuildingFootprints dataset

This commit is contained in:
Adam J. Stewart 2021-08-03 17:07:03 +00:00 коммит произвёл Caleb Robinson
Родитель b5db261dce
Коммит 54d42b9022
7 изменённых файлов: 218 добавлений и 0 удалений

Просмотреть файл

@ -10,6 +10,11 @@ Geospatial Datasets
:class:`GeoDataset` is designed for datasets that contain geospatial information, like latitude, longitude, coordinate system, and projection. Datasets containing this kind of information can be combined using :class:`ZipDataset`.
Canadian Building Footprints
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: CanadianBuildingFootprints
Chesapeake Bay High-Resolution Land Cover Project
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Просмотреть файл

@ -15,6 +15,7 @@ dependencies:
- pip:
- affine
- black[colorama]>=21b
- fiona
- flake8
- isort[colors]>=4.3.5
- mypy>=0.900

Просмотреть файл

@ -1,5 +1,6 @@
affine
black[colorama]>=21b
fiona
flake8
h5py
isort[colors]>=4.3.5

Просмотреть файл

@ -26,6 +26,7 @@ setup_requires =
setuptools>=42
install_requires =
affine
fiona
matplotlib
numpy
pillow

Просмотреть файл

@ -5,6 +5,7 @@ spack:
- "python@3.7:+bz2"
- py-affine
- "py-black@21:+colorama"
- py-fiona
- py-flake8
- py-h5py
- "py-isort@4.3.5:+colors"

Просмотреть файл

@ -1,6 +1,7 @@
"""TorchGeo datasets."""
from .benin_cashews import BeninSmallHolderCashews
from .cbf import CanadianBuildingFootprints
from .cdl import CDL
from .chesapeake import (
Chesapeake,
@ -42,6 +43,7 @@ from .utils import BoundingBox, collate_dict
__all__ = (
"BeninSmallHolderCashews",
"BoundingBox",
"CanadianBuildingFootprints",
"CDL",
"collate_dict",
"Chesapeake",

207
torchgeo/datasets/cbf.py Normal file
Просмотреть файл

@ -0,0 +1,207 @@
"""Canadian Building Footprints dataset."""
import glob
import os
import sys
from typing import Any, Callable, Dict, Optional
import fiona
import matplotlib.pyplot as plt
import torch
from fiona.transform import transform, transform_geom
from rasterio.crs import CRS
from rasterio.features import rasterize
from rtree.index import Index, Property
from torch import Tensor
from .geo import GeoDataset
from .utils import BoundingBox, check_integrity, download_and_extract_archive
_crs = CRS.from_epsg(4326)
class CanadianBuildingFootprints(GeoDataset):
"""Canadian Building Footprints dataset.
The `Canadian Building Footprints
<https://github.com/Microsoft/CanadianBuildingFootprints>`_ dataset contains
11,842,186 computer generated building footprints in all Canadian provinces and
territories in GeoJSON format. This data is freely available for download and use.
"""
# TODO: how does one cite this dataset?
# https://github.com/microsoft/CanadianBuildingFootprints/issues/11
url = "https://usbuildingdata.blob.core.windows.net/canadian-buildings-v2/"
provinces_territories = [
"Alberta",
"BritishColumbia",
"Manitoba",
"NewBrunswick",
"NewfoundlandAndLabrador",
"NorthwestTerritories",
"NovaScotia",
"Nunavut",
"Ontario",
"PrinceEdwardIsland",
"Quebec",
"Saskatchewan",
"YukonTerritory",
]
md5s = [
"8b4190424e57bb0902bd8ecb95a9235b",
"fea05d6eb0006710729c675de63db839",
"adf11187362624d68f9c69aaa693c46f",
"44269d4ec89521735389ef9752ee8642",
"65dd92b1f3f5f7222ae5edfad616d266",
"346d70a682b95b451b81b47f660fd0e2",
"bd57cb1a7822d72610215fca20a12602",
"c1f29b73cdff9a6a9dd7d086b31ef2cf",
"76ba4b7059c5717989ce34977cad42b2",
"2e4a3fa47b3558503e61572c59ac5963",
"9ff4417ae00354d39a0cf193c8df592c",
"a51078d8e60082c7d3a3818240da6dd5",
"c11f3bd914ecabd7cac2cb2871ec0261",
]
def __init__(
self,
root: str = "data",
crs: CRS = _crs,
transforms: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
download: bool = False,
checksum: bool = False,
) -> None:
"""Initialize a new Canadian Building Footprints dataset.
Args:
root: root directory where dataset can be found
crs: :term:`coordinate reference system (CRS)` to project to
transforms: a function/transform that takes input sample and its target as
entry and returns a transformed version
download: if True, download dataset and store it in the root directory
checksum: if True, check the MD5 of the downloaded files (may be slow)
Raises:
RuntimeError: if ``download=False`` and data is not found, or
``checksum=True`` and checksums don't match
"""
self.root = root
self.crs = crs
self.transforms = transforms
self.checksum = checksum
if download:
self._download()
if not self._check_integrity():
raise RuntimeError(
"Dataset not found or corrupted. "
+ "You can use download=True to download it"
)
# Create an R-tree to index the dataset
self.index = Index(interleaved=False, properties=Property(dimension=3))
fileglob = os.path.join(root, "**.geojson")
for i, filename in enumerate(glob.iglob(fileglob, recursive=True)):
with fiona.open(filename) as src:
minx, miny, maxx, maxy = src.bounds
(minx, maxx), (miny, maxy) = transform(
src.crs, crs.to_dict(), [minx, maxx], [miny, maxy]
)
mint = 0
maxt = sys.maxsize
coords = (minx, maxx, miny, maxy, mint, maxt)
self.index.insert(i, coords, filename)
def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
"""Retrieve image and metadata indexed by query.
Args:
query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index
Returns:
sample of labels and metadata at that index
Raises:
IndexError: if query is not within bounds of the index
"""
if not query.intersects(self.bounds):
raise IndexError(
f"query: {query} is not within bounds of the index: {self.bounds}"
)
hits = self.index.intersection(query, objects=True)
filename = next(hits).object # TODO: this assumes there is only a single hit
shapes = []
with fiona.open(filename) as src:
# We need to know the bounding box of the query in the source CRS
(minx, maxx), (miny, maxy) = transform(
self.crs.to_dict(),
src.crs,
[query.minx, query.maxx],
[query.miny, query.maxy],
)
# Filter geometries to those that intersect with the bounding box
for feature in src.filter((minx, miny, maxx, maxy)):
# Warp geometries to requested CRS
shape = transform_geom(src.crs, self.crs.to_dict(), feature["geometry"])
shapes.append(shape)
# Rasterize geometries
masks = rasterize(shapes)
# Clip to bounding box
# TODO: how to do this without creating a new dataset?
sample = {
"masks": torch.tensor(masks), # type: ignore[attr-defined]
"crs": self.crs,
"bbox": query,
}
if self.transforms is not None:
sample = self.transforms(sample)
return sample
def _check_integrity(self) -> bool:
"""Check integrity of dataset.
Returns:
True if dataset files are found and/or MD5s match, else False
"""
for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
filepath = os.path.join(self.root, prov_terr + ".zip")
if not check_integrity(filepath, md5 if self.checksum else None):
return False
return True
def _download(self) -> None:
"""Download the dataset and extract it."""
if self._check_integrity():
print("Files already downloaded and verified")
return
for prov_terr, md5 in zip(self.provinces_territories, self.md5s):
download_and_extract_archive(
self.url + prov_terr + ".zip",
self.root,
md5=md5 if self.checksum else None,
)
def plot(self, image: Tensor) -> None:
"""Plot an image on a map.
Args:
image: the image to plot
"""
array = image.squeeze().numpy()
# Plot the image
ax = plt.axes()
ax.imshow(array)
ax.axis("off")
plt.show()
plt.close()