#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Generate the fake EDDMapS ``mappings.csv`` file used by the unit tests."""

from typing import Any, Dict, List

import pandas as pd

filename = "mappings.csv"

size = 3


def build_data(num_rows: int = size) -> Dict[str, List[Any]]:
    """Build the column -> values mapping for a fake EDDMapS export.

    Every column has exactly ``num_rows`` entries. The first row has no
    longitude and only the last row has an observation date, so the file
    contains both a row without coordinates and rows without a date.

    Args:
        num_rows: number of rows to generate, must be at least 1

    Returns:
        mapping of column name to list of values, in CSV column order

    Raises:
        ValueError: if ``num_rows`` is less than 1
    """
    if num_rows < 1:
        raise ValueError(f"num_rows must be >= 1, got {num_rows}")

    # Short alias: every column below is sized from the same row count.
    size = num_rows
    return {
        "gbifID": [""] * size,
        "decimalLatitude": [41.881832] * size,
        "decimalLongitude": [""] + [-87.623177] * (size - 1),
        "objectid": [""] * size,
        "reporter": [""] * size,
        "RecOwner": [""] * size,
        "SciName": ["Homo sapiens"] * size,
        "ComName": ["human"] * size,
        "Nativity": ["Native"] * size,
        "OccStatus": ["Detected"] * size,
        "Status": ["Positive"] * size,
        # Previously hard-coded to three entries; now scales with `size`
        # (identical to ["", "", "05-07-22"] when size == 3).
        "ObsDate": [""] * (size - 1) + ["05-07-22"],
        "DateEnt": ["05-07-22"] * size,
        "DateUp": ["05-07-22"] * size,
        "Location": ["Chicago, Illinois, United States"] * size,
        "Latitude": [41.881832] * size,
        "Longitude": [""] + [-87.623177] * (size - 1),
        "Datum": ["WGS84"] * size,
        "Method": [""] * size,
        "CoordAcc": [""] * size,
        "DataType": [""] * size,
        "Centroid": [""] * size,
        "Abundance": [""] * size,
        "InfestAcre": [""] * size,
        "GrossAcre": [""] * size,
        "Percentcov": [""] * size,
        "Density": [""] * size,
        "Quantity": [""] * size,
        "QuantityU": [""] * size,
        "APPXQuant": [""] * size,
        "NumCollect": [""] * size,
        "Smallest": [""] * size,
        "Largest": [""] * size,
        "Incidence": [""] * size,
        "Severity": [""] * size,
        "Host": [""] * size,
        "Host_Name": [""] * size,
        "HostPheno": [""] * size,
        "HostDamage": [""] * size,
        "ManageStat": ["Unknown"] * size,
        "PopStat": [""] * size,
        "Habitat": [""] * size,
        "LocalOwner": [""] * size,
        "Site": [""] * size,
        "RecBasis": [""] * size,
        "Museum": [""] * size,
        "MuseumRec": [""] * size,
        "Voucher": [""] * size,
        "ObsIDer": [""] * size,
        "CollectTme": [""] * size,
        "UUID": [""] * size,
        "OrgSrcID": [""] * size,
        "OrigName": ["Homo sapiens"] * size,
        "RecSrcTyp": ["Bulk Data"] * size,
        "Surveyor": [""] * size,
        "DateAcc": [""] * size,
        "VisitType": [""] * size,
        "DataMthd": [""] * size,
        "TrapType": [""] * size,
        "NumTraps": [""] * size,
        "TargetName": [""] * size,
        "TargetCnt": [""] * size,
        "TargetRnge": [""] * size,
        "Phenology": [""] * size,
        "LifeStatus": [""] * size,
        "Sex": [""] * size,
        "PID": [""] * size,
        "WaterName": [""] * size,
        "WaterType": [""] * size,
        "Substrate": [""] * size,
        "TreatArea": [""] * size,
        "PlantTreat": [""] * size,
        "TreatComm": [""] * size,
        "Reference": [""] * size,
        "Locality": [""] * size,
        "Comments": [""] * size,
        "ReviewDate": ["05-07-22"] * size,
        "Reviewer": ["Charles Darwin"] * size,
        "VerifyMthd": ["Bulk Verified"] * size,
        "Verified": ["Verified"] * size,
        "IDCred": ["Credible"] * size,
        "ReviewComm": [""] * size,
    }


if __name__ == "__main__":
    # Only write the CSV when run as a script, not as an import side effect.
    data = build_data(size)
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
+gbifID,decimalLatitude,decimalLongitude,objectid,reporter,RecOwner,SciName,ComName,Nativity,OccStatus,Status,ObsDate,DateEnt,DateUp,Location,Latitude,Longitude,Datum,Method,CoordAcc,DataType,Centroid,Abundance,InfestAcre,GrossAcre,Percentcov,Density,Quantity,QuantityU,APPXQuant,NumCollect,Smallest,Largest,Incidence,Severity,Host,Host_Name,HostPheno,HostDamage,ManageStat,PopStat,Habitat,LocalOwner,Site,RecBasis,Museum,MuseumRec,Voucher,ObsIDer,CollectTme,UUID,OrgSrcID,OrigName,RecSrcTyp,Surveyor,DateAcc,VisitType,DataMthd,TrapType,NumTraps,TargetName,TargetCnt,TargetRnge,Phenology,LifeStatus,Sex,PID,WaterName,WaterType,Substrate,TreatArea,PlantTreat,TreatComm,Reference,Locality,Comments,ReviewDate,Reviewer,VerifyMthd,Verified,IDCred,ReviewComm +,41.881832,,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible, +,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible, +,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,05-07-22,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible, diff --git a/tests/datasets/test_eddmaps.py b/tests/datasets/test_eddmaps.py new file mode 100644 index 000000000..e505bfc1f --- /dev/null +++ b/tests/datasets/test_eddmaps.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
import builtins
import os
from pathlib import Path
from typing import Any

import pytest
from _pytest.monkeypatch import MonkeyPatch

from torchgeo.datasets import BoundingBox, EDDMapS, IntersectionDataset, UnionDataset

pytest.importorskip("pandas", minversion="0.23.2")


class TestEDDMapS:
    """Tests for the EDDMapS dataset, run against the fake ``mappings.csv``."""

    @pytest.fixture(scope="class")
    def dataset(self) -> EDDMapS:
        """Load the dataset once per class from the checked-in test data."""
        return EDDMapS(os.path.join("tests", "data", "eddmaps"))

    def test_getitem(self, dataset: EDDMapS) -> None:
        """Querying the full bounds of the dataset yields a sample dict."""
        sample = dataset[dataset.bounds]
        assert isinstance(sample, dict)

    def test_len(self, dataset: EDDMapS) -> None:
        """Only rows with both latitude and longitude are indexed."""
        # The fixture has three rows; the first one has no longitude.
        assert len(dataset) == 2

    def test_and(self, dataset: EDDMapS) -> None:
        """``&`` produces an IntersectionDataset."""
        combined = dataset & dataset
        assert isinstance(combined, IntersectionDataset)

    def test_or(self, dataset: EDDMapS) -> None:
        """``|`` produces a UnionDataset."""
        combined = dataset | dataset
        assert isinstance(combined, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        """An empty root directory is reported as a missing dataset."""
        empty_root = str(tmp_path)
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            EDDMapS(empty_root)

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Make ``import pandas`` fail while leaving other imports intact."""
        real_import = builtins.__import__

        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
            if name != "pandas":
                return real_import(name, *args, **kwargs)
            raise ImportError()

        monkeypatch.setattr(builtins, "__import__", mocked_import)

    def test_mock_missing_module(
        self, dataset: EDDMapS, mock_missing_module: None
    ) -> None:
        """Constructing the dataset without pandas raises a helpful error."""
        expected = "pandas is not installed and is required to use this dataset"
        with pytest.raises(ImportError, match=expected):
            EDDMapS(dataset.root)

    def test_invalid_query(self, dataset: EDDMapS) -> None:
        """A query that intersects nothing in the index raises IndexError."""
        empty_query = BoundingBox(0, 0, 0, 0, 0, 0)
        expected = "query: .* not found in index with bounds:"
        with pytest.raises(IndexError, match=expected):
            dataset[empty_query]
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Dataset for EDDMapS."""

import os
import sys
from typing import Any, Dict

import numpy as np
from rasterio.crs import CRS

from .geo import GeoDataset
from .utils import BoundingBox, disambiguate_timestamp


class EDDMapS(GeoDataset):
    """Dataset for EDDMapS.

    `EDDMapS <https://www.eddmaps.org/>`_, Early Detection and Distribution Mapping
    System, is a web-based mapping system for documenting invasive species and pest
    distribution. Launched in 2005 by the Center for Invasive Species and Ecosystem
    Health at the University of Georgia, it was originally designed as a tool for
    state Exotic Pest Plant Councils to develop more complete distribution data of
    invasive species. Since then, the program has expanded to include the entire US
    and Canada as well as to document certain native pest species.

    EDDMapS query results can be downloaded in CSV, KML, or Shapefile format. This
    dataset currently only supports CSV files.

    If you use an EDDMapS dataset in your research, please cite it like so:

    * EDDMapS. *YEAR*. Early Detection & Distribution Mapping System. The University of
      Georgia - Center for Invasive Species and Ecosystem Health. Available online at
      http://www.eddmaps.org/; last accessed *DATE*.

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pandas.pydata.org/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        filepath = os.path.join(root, "mappings.csv")
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd
        except ImportError as err:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            ) from err

        # Read CSV file. ``usecols`` ignores element order (the frame keeps the
        # file's column order), so select the columns explicitly to guarantee
        # the (date, lat, lon) order the loop below unpacks positionally.
        columns = ["ObsDate", "Latitude", "Longitude"]
        data = pd.read_csv(filepath, engine="c", usecols=columns)[columns]

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for date, y, x in data.itertuples(index=False, name=None):
            # Skip rows without lat/lon
            if np.isnan(y) or np.isnan(x):
                continue

            if not pd.isna(date):
                mint, maxt = disambiguate_timestamp(date, "%m-%d-%y")
            else:
                # No observation date: use the widest time range
                mint, maxt = 0, sys.maxsize

            # Each record is a point, so min and max coincide in x and y
            coords = (x, x, y, y, mint, maxt)
            self.index.insert(i, coords)
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        bboxes = [hit.bbox for hit in hits]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        sample = {"crs": self.crs, "bbox": bboxes}

        return sample