Add CMS Global Mangrove Canopy dataset (#391)

* CMS dataset

* dynamically set filename

* add warning in documentation

* requested changes and data.py

* single zip file and camel case

* md5 check added

* correct error messages

* compression smaller test file

Co-authored-by: Caleb Robinson <calebrob6@gmail.com>
This commit is contained in:
Nils Lehmann 2022-02-20 21:07:20 +01:00 committed by GitHub
Parent 89277dc325
Commit 9cf36fac12
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
9 changed files: 419 additions and 0 deletions

View file

@ -32,6 +32,11 @@ Chesapeake Bay High-Resolution Land Cover Project
.. autoclass:: ChesapeakeWV
.. autoclass:: ChesapeakeCVPR
CMS Global Mangrove Canopy Dataset
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: CMSGlobalMangroveCanopy
Cropland Data Layer (CDL)
^^^^^^^^^^^^^^^^^^^^^^^^^

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,68 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import hashlib
import os
import random
import shutil
import numpy as np
import rasterio
np.random.seed(0)
random.seed(0)
SIZE = 64
files = [
{"image": "Mangrove_agb_Angola.tif"},
{"image": "Mangrove_hba95_Angola.tif"},
{"image": "Mangrove_hmax95_Angola.tif"},
]
def create_file(path: str, dtype: str, num_channels: int) -> None:
profile = {}
profile["driver"] = "GTiff"
profile["dtype"] = dtype
profile["count"] = num_channels
profile["crs"] = "epsg:4326"
profile["transform"] = rasterio.transform.from_bounds(0, 0, 1, 1, 1, 1)
profile["height"] = SIZE
profile["width"] = SIZE
profile["compress"] = "lzw"
profile["predictor"] = 2
Z = np.random.randint(
np.iinfo(profile["dtype"]).max, size=(1, SIZE, SIZE), dtype=profile["dtype"]
)
src = rasterio.open(path, "w", **profile)
src.write(Z)
if __name__ == "__main__":
directory = "CMS_Global_Map_Mangrove_Canopy_1665"
# Remove old data
if os.path.isdir(directory):
shutil.rmtree(directory)
os.makedirs(os.path.join(directory, "data"), exist_ok=True)
for file_dict in files:
# Create mask file
path = file_dict["image"]
create_file(
os.path.join(directory, "data", path), dtype="int32", num_channels=1
)
# Compress data
shutil.make_archive(directory.replace(".zip", ""), "zip", ".", directory)
# Compute checksums
with open(directory + ".zip", "rb") as f:
md5 = hashlib.md5(f.read()).hexdigest()
print(f"{directory}: {md5}")

View file

@ -0,0 +1,93 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import shutil
from pathlib import Path
from typing import Generator
import pytest
import torch
import torch.nn as nn
from _pytest.monkeypatch import MonkeyPatch
from rasterio.crs import CRS
from torchgeo.datasets import CMSGlobalMangroveCanopy, IntersectionDataset, UnionDataset
def download_url(url: str, root: str, *args: str, **kwargs: str) -> None:
shutil.copy(url, root)
class TestCMSGlobalMangroveCanopy:
@pytest.fixture
def dataset(
self, monkeypatch: Generator[MonkeyPatch, None, None], tmp_path: Path
) -> CMSGlobalMangroveCanopy:
zipfile = "CMS_Global_Map_Mangrove_Canopy_1665.zip"
monkeypatch.setattr( # type: ignore[attr-defined]
CMSGlobalMangroveCanopy, "zipfile", zipfile
)
md5 = "d6894fa6293cc9c0f3f95a810e842de5"
monkeypatch.setattr( # type: ignore[attr-defined]
CMSGlobalMangroveCanopy, "md5", md5
)
root = os.path.join("tests", "data", "cms_mangrove_canopy")
transforms = nn.Identity() # type: ignore[attr-defined]
country = "Angola"
return CMSGlobalMangroveCanopy(
root, country=country, transforms=transforms, checksum=True
)
def test_getitem(self, dataset: CMSGlobalMangroveCanopy) -> None:
x = dataset[dataset.bounds]
assert isinstance(x, dict)
assert isinstance(x["crs"], CRS)
assert isinstance(x["mask"], torch.Tensor)
def test_no_dataset(self) -> None:
with pytest.raises(RuntimeError, match="Dataset not found in."):
CMSGlobalMangroveCanopy(root="/test")
def test_already_downloaded(self, tmp_path: Path) -> None:
pathname = os.path.join(
"tests",
"data",
"cms_mangrove_canopy",
"CMS_Global_Map_Mangrove_Canopy_1665.zip",
)
root = str(tmp_path)
shutil.copy(pathname, root)
CMSGlobalMangroveCanopy(root, country="Angola")
def test_corrupted(self, tmp_path: Path) -> None:
with open(
os.path.join(tmp_path, "CMS_Global_Map_Mangrove_Canopy_1665.zip"), "w"
) as f:
f.write("bad")
with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
CMSGlobalMangroveCanopy(root=str(tmp_path), country="Angola", checksum=True)
def test_invalid_country(self) -> None:
with pytest.raises(AssertionError):
CMSGlobalMangroveCanopy(country="fakeCountry")
def test_invalid_measurement(self) -> None:
with pytest.raises(AssertionError):
CMSGlobalMangroveCanopy(measurement="wrongMeasurement")
def test_and(self, dataset: CMSGlobalMangroveCanopy) -> None:
ds = dataset & dataset
assert isinstance(ds, IntersectionDataset)
def test_or(self, dataset: CMSGlobalMangroveCanopy) -> None:
ds = dataset | dataset
assert isinstance(ds, UnionDataset)
def test_plot(self, dataset: CMSGlobalMangroveCanopy) -> None:
query = dataset.bounds
x = dataset[query]
dataset.plot(x["mask"])

View file

@ -21,6 +21,7 @@ from .chesapeake import (
ChesapeakeVA,
ChesapeakeWV,
)
from .cms_mangrove_canopy import CMSGlobalMangroveCanopy
from .cowc import COWC, COWCCounting, COWCDetection
from .cv4a_kenya_crop_type import CV4AKenyaCropType
from .cyclone import TropicalCycloneWindEstimation
@ -97,6 +98,7 @@ __all__ = (
"ChesapeakeVA",
"ChesapeakeWV",
"ChesapeakeCVPR",
"CMSGlobalMangroveCanopy",
"Esri2020",
"Landsat",
"Landsat1",

View file

@ -0,0 +1,251 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""CMS Global Mangrove Canopy dataset."""
import glob
import os
from typing import Any, Callable, Dict, Optional
from rasterio.crs import CRS
from .geo import RasterDataset
from .utils import check_integrity, extract_archive
class CMSGlobalMangroveCanopy(RasterDataset):
"""CMS Global Mangrove Canopy dataset.
The `CMS Global Mangrove Canopy dataset
<https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=1665>`_
consists of a single band map at 30m resolution of either aboveground biomass (agb),
basal area weighted height (hba95), or maximum canopy height (hmax95).
The dataset needs to be manually dowloaded from the above link, where you can make
an account and subsequently download the dataset.
.. versionadded:: 0.3
"""
is_image = False
filename_regex = r"""^
(?P<mangrove>[A-Za-z]{8})
_(?P<variable>[a-z0-9]*)
_(?P<country>[A-Za-z][^.]*)
"""
zipfile = "CMS_Global_Map_Mangrove_Canopy_1665.zip"
md5 = "3e7f9f23bf971c25e828b36e6c5496e3"
all_countries = [
"AndamanAndNicobar",
"Angola",
"Anguilla",
"AntiguaAndBarbuda",
"Aruba",
"Australia",
"Bahamas",
"Bahrain",
"Bangladesh",
"Barbados",
"Belize",
"Benin",
"Brazil",
"BritishVirginIslands",
"Brunei",
"Cambodia",
"Cameroon",
"CarribeanCaymanIslands",
"China",
"Colombia",
"Comoros",
"CostaRica",
"Cote",
"CoteDivoire",
"CotedIvoire",
"Cuba",
"DemocraticRepublicOfCongo",
"Djibouti",
"DominicanRepublic",
"EcuadorWithGalapagos",
"Egypt",
"ElSalvador",
"EquatorialGuinea",
"Eritrea",
"EuropaIsland",
"Fiji",
"Fiji2",
"FrenchGuiana",
"FrenchGuyana",
"FrenchPolynesia",
"Gabon",
"Gambia",
"Ghana",
"Grenada",
"Guadeloupe",
"Guam",
"Guatemala",
"Guinea",
"GuineaBissau",
"Guyana",
"Haiti",
"Hawaii",
"Honduras",
"HongKong",
"India",
"Indonesia",
"Iran",
"Jamaica",
"Japan",
"Kenya",
"Liberia",
"Macau",
"Madagascar",
"Malaysia",
"Martinique",
"Mauritania",
"Mayotte",
"Mexico",
"Micronesia",
"Mozambique",
"Myanmar",
"NewCaledonia",
"NewZealand",
"Newzealand",
"Nicaragua",
"Nigeria",
"NorthernMarianaIslands",
"Oman",
"Pakistan",
"Palau",
"Panama",
"PapuaNewGuinea",
"Peru",
"Philipines",
"PuertoRico",
"Qatar",
"ReunionAndMauritius",
"SaintKittsAndNevis",
"SaintLucia",
"SaintVincentAndTheGrenadines",
"Samoa",
"SaudiArabia",
"Senegal",
"Seychelles",
"SierraLeone",
"Singapore",
"SolomonIslands",
"Somalia",
"Somalia2",
"Soudan",
"SouthAfrica",
"SriLanka",
"Sudan",
"Suriname",
"Taiwan",
"Tanzania",
"Thailand",
"TimorLeste",
"Togo",
"Tonga",
"TrinidadAndTobago",
"TurksAndCaicosIslands",
"Tuvalu",
"UnitedArabEmirates",
"UnitedStates",
"Vanuatu",
"Venezuela",
"Vietnam",
"VirginIslandsUs",
"WallisAndFutuna",
"Yemen",
]
measurements = ["agb", "hba95", "hmax95"]
def __init__(
self,
root: str = "data",
crs: Optional[CRS] = None,
res: Optional[float] = None,
measurement: str = "agb",
country: str = all_countries[0],
transforms: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
cache: bool = True,
checksum: bool = False,
) -> None:
"""Initialize a new Dataset instance.
Args:
root: root directory where dataset can be found
crs: :term:`coordinate reference system (CRS)` to warp to
(defaults to the CRS of the first file found)
res: resolution of the dataset in units of CRS
(defaults to the resolution of the first file found)
measurement: which of the three measurements, 'agb', 'hba95', or 'hmax95'
country: country for which to retrieve data
transforms: a function/transform that takes an input sample
and returns a transformed version
cache: if True, cache file handle to speed up repeated sampling
checksum: if True, check the MD5 of the downloaded files (may be slow)
Raises:
FileNotFoundError: if no files are found in ``root``
RuntimeError: if dataset is missing or checksum fails
AssertionError: if country or measurement arg are not str or invalid
"""
self.root = root
self.checksum = checksum
assert isinstance(country, str), "Country argument must be a str."
assert (
country in self.all_countries
), "You have selected an invalid country, please choose one of {}".format(
self.all_countries
)
self.country = country
assert isinstance(measurement, str), "Measurement must be a string."
assert (
measurement in self.measurements
), "You have entered an invalid measurement, please choose one of {}.".format(
self.measurements
)
self.measurement = measurement
self.filename_glob = "**/Mangrove_{}_{}*".format(self.measurement, self.country)
self._verify()
super().__init__(root, crs, res, transforms, cache)
def _verify(self) -> None:
"""Verify the integrity of the dataset.
Raises:
RuntimeError: if dataset is missing or checksum fails
"""
# Check if the extracted files already exist
pathname = os.path.join(self.root, "**", self.filename_glob)
if glob.glob(pathname):
return
# Check if the zip file has already been downloaded
pathname = os.path.join(self.root, self.zipfile)
if os.path.exists(pathname):
if self.checksum and not check_integrity(pathname, self.md5):
raise RuntimeError("Dataset found, but corrupted.")
self._extract()
return
raise RuntimeError(
f"Dataset not found in `root={self.root}` "
"either specify a different `root` directory or make sure you "
"have manually downloaded the dataset as instructed in the documentation."
)
def _extract(self) -> None:
"""Extract the dataset."""
pathname = os.path.join(self.root, self.zipfile)
extract_archive(pathname)