This commit is contained in:
Caleb Robinson 2021-09-03 22:32:40 +00:00 committed by Adam J. Stewart
Parent b8ba2ebc2b
Commit 04355ecc2f
5 changed files: 216 additions and 270 deletions

View file

@@ -13,7 +13,7 @@ dependencies:
- pytorch-gpu>=1.7
- rarfile>=3
- rasterio>=1.0.16
- shapely>1.3.0
- shapely>=1.3.0
- torchvision>=0.3
- pip:
- black>=21.4b0
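
The change above relaxes the shapely pin from an exclusive to an inclusive lower bound, so version 1.3.0 itself is now accepted. A quick sketch with the `packaging` library (not part of this commit) illustrates the difference between the two specifiers:

# Sketch only: '>' excludes the pinned version itself, '>=' includes it.
from packaging.specifiers import SpecifierSet

print(SpecifierSet(">1.3.0").contains("1.3.0"))   # False
print(SpecifierSet(">=1.3.0").contains("1.3.0"))  # True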

View file

@@ -1,39 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import math

import pyproj
import shapely.geometry
import shapely.ops


def test_crs_with_pyproj() -> None:
    src_crs = pyproj.CRS("epsg:4326")
    dst_crs = pyproj.CRS(src_crs)
    project = pyproj.Transformer.from_crs(src_crs, dst_crs, always_xy=True).transform
    geom = {
        "type": "Polygon",
        "coordinates": [
            [
                [-125.068359375, 45.920587344733654],
                [-116.56494140625001, 45.920587344733654],
                [-116.56494140625001, 49.095452162534826],
                [-125.068359375, 49.095452162534826],
                [-125.068359375, 45.920587344733654],
            ]
        ],
    }
    geom_transformed = shapely.ops.transform(project, shapely.geometry.shape(geom))
    bounds = geom_transformed.bounds
    expected_bounds = (
        -125.068359375,
        45.920587344733654,
        -116.56494140625001,
        49.095452162534826,
    )
    for i in range(4):
        assert math.isclose(bounds[i], expected_bounds[i])
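
The deleted test exercises an identity transform (epsg:4326 to itself), so the bounds come back unchanged. The same `pyproj.Transformer.from_crs(..., always_xy=True).transform` callable composes with `shapely.ops.transform` for real reprojection as well; a minimal sketch (hypothetical coordinates) projecting a lon/lat box into UTM zone 18N, the same epsg:26918 CRS that appears in the `p_transformers` mapping below:

import pyproj
import shapely.geometry
import shapely.ops

# Reproject a lon/lat box into UTM zone 18N (epsg:26918).
project = pyproj.Transformer.from_crs(
    pyproj.CRS("epsg:4326"), pyproj.CRS("epsg:26918"), always_xy=True
).transform
box = shapely.geometry.box(-76.0, 38.0, -75.5, 38.5)  # hypothetical lon/lat extent
print(shapely.ops.transform(project, box).bounds)  # bounds now in meters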

View file

@@ -10,6 +10,7 @@ from .chesapeake import (
    Chesapeake,
    Chesapeake7,
    Chesapeake13,
    ChesapeakeCVPR,
    ChesapeakeDC,
    ChesapeakeDE,
    ChesapeakeMD,
@@ -20,7 +21,6 @@ from .chesapeake import (
)
from .cowc import COWC, COWCCounting, COWCDetection
from .cv4a_kenya_crop_type import CV4AKenyaCropType
from .cvpr_chesapeake import CVPRChesapeake
from .cyclone import TropicalCycloneWindEstimation
from .geo import GeoDataset, RasterDataset, VectorDataset, VisionDataset, ZipDataset
from .landcoverai import LandCoverAI
@@ -58,7 +58,7 @@ __all__ = (
    "ChesapeakePA",
    "ChesapeakeVA",
    "ChesapeakeWV",
    "CVPRChesapeake",
    "ChesapeakeCVPR",
    "Landsat",
    "Landsat1",
    "Landsat2",

View file

@@ -1,16 +1,23 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Chesapeake Bay High-Resolution Land Cover Project dataset."""
"""Chesapeake Bay High-Resolution Land Cover Project datasets."""

import abc
import os
from typing import Any, Callable, Dict, Optional
import sys
from typing import Any, Callable, Dict, List, Optional

import fiona
import pyproj
import rasterio
import rasterio.mask
import shapely.geometry
import shapely.ops
from rasterio.crs import CRS

from .geo import RasterDataset
from .utils import check_integrity, download_and_extract_archive
from .geo import GeoDataset, RasterDataset
from .utils import BoundingBox, check_integrity, download_and_extract_archive


class Chesapeake(RasterDataset, abc.ABC):
@@ -262,3 +269,205 @@ class ChesapeakeWV(Chesapeake):
    filename = "WV_STATEWIDE.tif"
    zipfile = "_WV_STATEWIDE.zip"
    md5 = "350621ea293651fbc557a1c3e3c64cc3"
class ChesapeakeCVPR(GeoDataset):
    """CVPR 2019 Chesapeake Land Cover dataset.

    The `CVPR 2019 Chesapeake Land Cover
    <https://lila.science/datasets/chesapeakelandcover>`_ dataset contains two layers
    of NAIP aerial imagery, Landsat 8 leaf-on and leaf-off imagery, Chesapeake Bay
    land cover labels, NLCD land cover labels, and Microsoft building footprint labels.

    This dataset was organized to accompany the 2019 CVPR paper, "Large Scale
    High-Resolution Land Cover Mapping with Multi-Resolution Data".

    If you use this dataset in your research, please cite the following paper:

    * https://doi.org/10.1109/cvpr.2019.01301
    """

    url = "https://lilablobssc.blob.core.windows.net/lcmcvpr2019/cvpr_chesapeake_landcover.zip"  # noqa: E501
    filename = "cvpr_chesapeake_landcover.zip"
    md5 = "0ea5e7cb861be3fb8a06fedaaaf91af9"

    valid_layers = [
        "naip-new",
        "naip-old",
        "landsat-leaf-on",
        "landsat-leaf-off",
        "nlcd",
        "lc",
        "buildings",
    ]
    states = ["de", "md", "va", "wv", "pa", "ny"]
    splits = (
        [f"{state}-train" for state in states]
        + [f"{state}-val" for state in states]
        + [f"{state}-test" for state in states]
    )

    p_src_crs = pyproj.CRS("epsg:3857")
    p_transformers = {
        "epsg:26917": pyproj.Transformer.from_crs(
            p_src_crs, pyproj.CRS("epsg:26917"), always_xy=True
        ).transform,
        "epsg:26918": pyproj.Transformer.from_crs(
            p_src_crs, pyproj.CRS("epsg:26918"), always_xy=True
        ).transform,
    }

    def __init__(
        self,
        root: str = "data",
        split: str = "de-train",
        layers: List[str] = ["naip-new", "lc"],
        transforms: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
        cache: bool = True,
        download: bool = False,
        checksum: bool = False,
    ) -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found
            split: a string in the format "{state}-{train,val,test}" indicating the
                subset of data to use, for example "ny-train"
            layers: a list containing a subset of "naip-new", "naip-old", "lc", "nlcd",
                "landsat-leaf-on", "landsat-leaf-off", "buildings" indicating which
                layers to load
            transforms: a function/transform that takes an input sample
                and returns a transformed version
            cache: if True, cache file handle to speed up repeated sampling
            download: if True, download dataset and store it in the root directory
            checksum: if True, check the MD5 of the downloaded files (may be slow)

        Raises:
            FileNotFoundError: if no files are found in ``root``
            RuntimeError: if ``download=False`` but dataset is missing or checksum fails
        """
        assert split in self.splits
        assert all([layer in self.valid_layers for layer in layers])
        super().__init__(transforms)  # creates self.index and self.transform
        self.root = root
        self.layers = layers
        self.cache = cache
        self.checksum = checksum

        if download:
            self._download()

        if not self._check_integrity():
            raise RuntimeError(
                "Dataset not found or corrupted. "
                + "You can use download=True to download it"
            )

        # Add all tiles into the index in epsg:3857 based on the included geojson
        mint: float = 0
        maxt: float = sys.maxsize
        with fiona.open(os.path.join(root, "spatial_index.geojson"), "r") as f:
            for i, row in enumerate(f):
                if row["properties"]["split"] == split:
                    box = shapely.geometry.shape(row["geometry"])
                    minx, miny, maxx, maxy = box.bounds
                    coords = (minx, maxx, miny, maxy, mint, maxt)
                    self.index.insert(
                        i,
                        coords,
                        {
                            "naip-new": row["properties"]["naip-new"],
                            "naip-old": row["properties"]["naip-old"],
                            "landsat-leaf-on": row["properties"]["landsat-leaf-on"],
                            "landsat-leaf-off": row["properties"]["landsat-leaf-off"],
                            "lc": row["properties"]["lc"],
                            "nlcd": row["properties"]["nlcd"],
                            "buildings": row["properties"]["buildings"],
                        },
                    )

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve image/mask and metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of image/mask and metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(query, objects=True)
        filepaths = [hit.object for hit in hits]

        sample = {
            "crs": self.crs,
            "bbox": query,
        }

        if len(filepaths) == 0:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )
        elif len(filepaths) == 1:
            filenames = filepaths[0]
            query_geom_transformed = None  # is set by the first layer

            minx, maxx, miny, maxy, mint, maxt = query
            query_box = shapely.geometry.box(minx, miny, maxx, maxy)

            for layer in self.layers:
                fn = filenames[layer]
                with rasterio.open(os.path.join(self.root, fn)) as f:
                    dst_crs = f.crs.to_string().lower()

                    if query_geom_transformed is None:
                        query_box_transformed = shapely.ops.transform(
                            self.p_transformers[dst_crs], query_box
                        ).envelope
                        query_geom_transformed = shapely.geometry.mapping(
                            query_box_transformed
                        )

                    data, _ = rasterio.mask.mask(
                        f, [query_geom_transformed], crop=True, all_touched=True
                    )

                sample[layer] = data.squeeze()
        else:
            raise IndexError(f"query: {query} spans multiple tiles which is not valid")

        if self.transforms is not None:
            sample = self.transforms(sample)

        return sample

    def _check_integrity(self) -> bool:
        """Check integrity of dataset.

        Returns:
            True if dataset files are found and/or MD5s match, else False
        """
        integrity: bool = check_integrity(
            os.path.join(self.root, self.filename),
            self.md5 if self.checksum else None,
        )
        return integrity

    def _download(self) -> None:
        """Download the dataset and extract it."""
        if self._check_integrity():
            print("Files already downloaded and verified")
            return

        download_and_extract_archive(
            self.url,
            self.root,
            filename=self.filename,
            md5=self.md5,
        )
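
A minimal usage sketch for the new class (hypothetical root path and query coordinates; assumes the archive has already been extracted under `root`):

import sys

from torchgeo.datasets import ChesapeakeCVPR
from torchgeo.datasets.utils import BoundingBox

ds = ChesapeakeCVPR(root="data", split="de-train", layers=["naip-new", "lc"])
# Queries are in epsg:3857 meters; mint/maxt of (0, sys.maxsize) spans all time.
query = BoundingBox(
    minx=-8418000, maxx=-8417000, miny=4717000, maxy=4718000, mint=0, maxt=sys.maxsize
)
sample = ds[query]  # raises IndexError if the box misses (or spans) tiles
print(sample["naip-new"].shape, sample["lc"].shape)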

View file

@@ -1,224 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""CVPR 2019 Chesapeake Land Cover dataset."""

import os
import sys
from typing import Any, Callable, Dict, List, Optional

import fiona
import pyproj
import rasterio
import rasterio.mask
import shapely.geometry
import shapely.ops
from rasterio.crs import CRS

from .geo import GeoDataset
from .utils import BoundingBox, check_integrity, download_and_extract_archive
class CVPRChesapeake(GeoDataset):
    """CVPR 2019 Chesapeake Land Cover dataset.

    The `CVPR 2019 Chesapeake Land Cover
    <https://lila.science/datasets/chesapeakelandcover>`_ dataset contains two layers
    of NAIP aerial imagery, Landsat 8 leaf-on and leaf-off imagery, Chesapeake Bay
    land cover labels, NLCD land cover labels, and Microsoft building footprint labels.

    This dataset was organized to accompany the 2019 CVPR paper, "Large Scale
    High-Resolution Land Cover Mapping with Multi-Resolution Data".

    If you use this dataset in your research, please cite the following paper:

    * https://doi.org/10.1109/cvpr.2019.01301
    """

    url = "https://lilablobssc.blob.core.windows.net/lcmcvpr2019/cvpr_chesapeake_landcover.zip"  # noqa: E501
    filename = "cvpr_chesapeake_landcover.zip"
    md5 = "0ea5e7cb861be3fb8a06fedaaaf91af9"
    crs = CRS.from_epsg(3857)
    res = 1

    valid_layers = [
        "naip-new",
        "naip-old",
        "landsat-leaf-on",
        "landsat-leaf-off",
        "nlcd",
        "lc",
        "buildings",
    ]
    states = ["de", "md", "va", "wv", "pa", "ny"]
    splits = (
        [f"{state}-train" for state in states]
        + [f"{state}-val" for state in states]
        + [f"{state}-test" for state in states]
    )

    p_src_crs = pyproj.CRS("epsg:3857")
    p_transformers = {
        "epsg:26917": pyproj.Transformer.from_crs(
            p_src_crs, pyproj.CRS("epsg:26917"), always_xy=True
        ).transform,
        "epsg:26918": pyproj.Transformer.from_crs(
            p_src_crs, pyproj.CRS("epsg:26918"), always_xy=True
        ).transform,
    }

    def __init__(
        self,
        root: str = "data",
        split: str = "de-train",
        layers: List[str] = ["naip-new", "lc"],
        transforms: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
        cache: bool = True,
        download: bool = False,
        checksum: bool = False,
    ) -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found
            split: a string in the format "{state}-{train,val,test}" indicating the
                subset of data to use
            layers: a list containing a subset of "naip-new", "naip-old", "lc", "nlcd",
                "landsat-leaf-on", "landsat-leaf-off", "buildings" indicating which
                layers to load
            transforms: a function/transform that takes an input sample
                and returns a transformed version
            cache: if True, cache file handle to speed up repeated sampling
            download: if True, download dataset and store it in the root directory
            checksum: if True, check the MD5 of the downloaded files (may be slow)

        Raises:
            FileNotFoundError: if no files are found in ``root``
            RuntimeError: if ``download=False`` but dataset is missing or checksum fails
        """
        assert split in self.splits
        assert all([layer in self.valid_layers for layer in layers])
        super().__init__(transforms)  # creates self.index and self.transform
        self.root = root
        self.layers = layers
        self.cache = cache
        self.checksum = checksum

        if download:
            self._download()

        if not self._check_integrity():
            raise RuntimeError(
                "Dataset not found or corrupted. "
                + "You can use download=True to download it"
            )

        # Add all tiles into the index in epsg:3857 based on the included geojson
        mint: float = 0
        maxt: float = sys.maxsize
        with fiona.open(os.path.join(root, "spatial_index.geojson"), "r") as f:
            for i, row in enumerate(f):
                if row["properties"]["split"] == split:
                    box = shapely.geometry.shape(row["geometry"])
                    minx, miny, maxx, maxy = box.bounds
                    coords = (minx, maxx, miny, maxy, mint, maxt)
                    self.index.insert(
                        i,
                        coords,
                        {
                            "naip-new": row["properties"]["naip-new"],
                            "naip-old": row["properties"]["naip-old"],
                            "landsat-leaf-on": row["properties"]["landsat-leaf-on"],
                            "landsat-leaf-off": row["properties"]["landsat-leaf-off"],
                            "lc": row["properties"]["lc"],
                            "nlcd": row["properties"]["nlcd"],
                            "buildings": row["properties"]["buildings"],
                        },
                    )

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve image/mask and metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of image/mask and metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(query, objects=True)
        filepaths = [hit.object for hit in hits]

        sample = {
            "crs": self.crs,
            "bbox": query,
        }

        if len(filepaths) == 0:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )
        elif len(filepaths) == 1:
            filenames = filepaths[0]
            query_geom_transformed = None  # is set by the first layer

            minx, maxx, miny, maxy, mint, maxt = query
            query_box = shapely.geometry.box(minx, miny, maxx, maxy)

            for layer in self.layers:
                fn = filenames[layer]
                with rasterio.open(os.path.join(self.root, fn)) as f:
                    dst_crs = f.crs.to_string().lower()

                    if query_geom_transformed is None:
                        query_box_transformed = shapely.ops.transform(
                            self.p_transformers[dst_crs], query_box
                        ).envelope
                        query_geom_transformed = shapely.geometry.mapping(
                            query_box_transformed
                        )

                    data, _ = rasterio.mask.mask(
                        f, [query_geom_transformed], crop=True, all_touched=True
                    )

                sample[layer] = data.squeeze()
        else:
            raise IndexError(f"query: {query} spans multiple tiles which is not valid")

        if self.transforms is not None:
            sample = self.transforms(sample)

        return sample

    def _check_integrity(self) -> bool:
        """Check integrity of dataset.

        Returns:
            True if dataset files are found and/or MD5s match, else False
        """
        integrity: bool = check_integrity(
            os.path.join(self.root, self.filename),
            self.md5 if self.checksum else None,
        )
        return integrity

    def _download(self) -> None:
        """Download the dataset and extract it."""
        if self._check_integrity():
            print("Files already downloaded and verified")
            return

        download_and_extract_archive(
            self.url,
            self.root,
            filename=self.filename,
            md5=self.md5,
        )