Add EDDMapS dataset (#533)

* Add EDDMapS dataset * Mypy hack * Test fix
2022-05-14 21:29:47 -05:00 · 2022-05-14 21:29:47 -05:00 · 827985ad0a
--- a/docs/api/datasets.rst
+++ b/docs/api/datasets.rst
@ -52,6 +52,11 @@ Cropland Data Layer (CDL)

 .. autoclass:: CDL

+EDDMapS
+^^^^^^^
+
+.. autoclass:: EDDMapS
+
 EnviroAtlas
 ^^^^^^^^^^^

--- a/tests/data/eddmaps/data.py
+++ b/tests/data/eddmaps/data.py
@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import pandas as pd
+
+filename = "mappings.csv"  # relative path of the CSV fixture this script writes
+
+size = 3  # number of rows; every column list below has this many entries
+data = {  # column name -> per-row values; key order becomes the CSV header order
+    "gbifID": [""] * size,
+    "decimalLatitude": [41.881832] * size,
+    "decimalLongitude": [""] + [-87.623177] * (size - 1),  # first row left empty
+    "objectid": [""] * size,
+    "reporter": [""] * size,
+    "RecOwner": [""] * size,
+    "SciName": ["Homo sapiens"] * size,
+    "ComName": ["human"] * size,
+    "Nativity": ["Native"] * size,
+    "OccStatus": ["Detected"] * size,
+    "Status": ["Positive"] * size,
+    "ObsDate": ["", "", "05-07-22"],  # only the last row has an observation date
+    "DateEnt": ["05-07-22"] * size,
+    "DateUp": ["05-07-22"] * size,
+    "Location": ["Chicago, Illinois, United States"] * size,
+    "Latitude": [41.881832] * size,
+    "Longitude": [""] + [-87.623177] * (size - 1),  # first row left empty
+    "Datum": ["WGS84"] * size,
+    "Method": [""] * size,
+    "CoordAcc": [""] * size,
+    "DataType": [""] * size,
+    "Centroid": [""] * size,
+    "Abundance": [""] * size,
+    "InfestAcre": [""] * size,
+    "GrossAcre": [""] * size,
+    "Percentcov": [""] * size,
+    "Density": [""] * size,
+    "Quantity": [""] * size,
+    "QuantityU": [""] * size,
+    "APPXQuant": [""] * size,
+    "NumCollect": [""] * size,
+    "Smallest": [""] * size,
+    "Largest": [""] * size,
+    "Incidence": [""] * size,
+    "Severity": [""] * size,
+    "Host": [""] * size,
+    "Host_Name": [""] * size,
+    "HostPheno": [""] * size,
+    "HostDamage": [""] * size,
+    "ManageStat": ["Unknown"] * size,
+    "PopStat": [""] * size,
+    "Habitat": [""] * size,
+    "LocalOwner": [""] * size,
+    "Site": [""] * size,
+    "RecBasis": [""] * size,
+    "Museum": [""] * size,
+    "MuseumRec": [""] * size,
+    "Voucher": [""] * size,
+    "ObsIDer": [""] * size,
+    "CollectTme": [""] * size,
+    "UUID": [""] * size,
+    "OrgSrcID": [""] * size,
+    "OrigName": ["Homo sapiens"] * size,
+    "RecSrcTyp": ["Bulk Data"] * size,
+    "Surveyor": [""] * size,
+    "DateAcc": [""] * size,
+    "VisitType": [""] * size,
+    "DataMthd": [""] * size,
+    "TrapType": [""] * size,
+    "NumTraps": [""] * size,
+    "TargetName": [""] * size,
+    "TargetCnt": [""] * size,
+    "TargetRnge": [""] * size,
+    "Phenology": [""] * size,
+    "LifeStatus": [""] * size,
+    "Sex": [""] * size,
+    "PID": [""] * size,
+    "WaterName": [""] * size,
+    "WaterType": [""] * size,
+    "Substrate": [""] * size,
+    "TreatArea": [""] * size,
+    "PlantTreat": [""] * size,
+    "TreatComm": [""] * size,
+    "Reference": [""] * size,
+    "Locality": [""] * size,
+    "Comments": [""] * size,
+    "ReviewDate": ["05-07-22"] * size,
+    "Reviewer": ["Charles Darwin"] * size,
+    "VerifyMthd": ["Bulk Verified"] * size,
+    "Verified": ["Verified"] * size,
+    "IDCred": ["Credible"] * size,
+    "ReviewComm": [""] * size,
+}
+
+df = pd.DataFrame(data)
+df.to_csv(filename, index=False)  # omit the DataFrame index column from the CSV
--- a/tests/data/eddmaps/mappings.csv
+++ b/tests/data/eddmaps/mappings.csv
@ -0,0 +1,4 @@
+gbifID,decimalLatitude,decimalLongitude,objectid,reporter,RecOwner,SciName,ComName,Nativity,OccStatus,Status,ObsDate,DateEnt,DateUp,Location,Latitude,Longitude,Datum,Method,CoordAcc,DataType,Centroid,Abundance,InfestAcre,GrossAcre,Percentcov,Density,Quantity,QuantityU,APPXQuant,NumCollect,Smallest,Largest,Incidence,Severity,Host,Host_Name,HostPheno,HostDamage,ManageStat,PopStat,Habitat,LocalOwner,Site,RecBasis,Museum,MuseumRec,Voucher,ObsIDer,CollectTme,UUID,OrgSrcID,OrigName,RecSrcTyp,Surveyor,DateAcc,VisitType,DataMthd,TrapType,NumTraps,TargetName,TargetCnt,TargetRnge,Phenology,LifeStatus,Sex,PID,WaterName,WaterType,Substrate,TreatArea,PlantTreat,TreatComm,Reference,Locality,Comments,ReviewDate,Reviewer,VerifyMthd,Verified,IDCred,ReviewComm
+,41.881832,,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
+,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
+,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,05-07-22,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
--- a/tests/datasets/test_eddmaps.py
+++ b/tests/datasets/test_eddmaps.py
@ -0,0 +1,67 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import builtins
+import os
+from pathlib import Path
+from typing import Any
+
+import pytest
+from _pytest.monkeypatch import MonkeyPatch
+
+from torchgeo.datasets import BoundingBox, EDDMapS, IntersectionDataset, UnionDataset
+
+pytest.importorskip("pandas", minversion="0.23.2")
+
+
+class TestEDDMapS:
+    @pytest.fixture(scope="class")
+    def dataset(self) -> EDDMapS:
+        root = os.path.join("tests", "data", "eddmaps")
+        return EDDMapS(root)
+
+    def test_getitem(self, dataset: EDDMapS) -> None:
+        x = dataset[dataset.bounds]
+        assert isinstance(x, dict)
+
+    def test_len(self, dataset: EDDMapS) -> None:
+        assert len(dataset) == 2
+
+    def test_and(self, dataset: EDDMapS) -> None:
+        ds = dataset & dataset
+        assert isinstance(ds, IntersectionDataset)
+
+    def test_or(self, dataset: EDDMapS) -> None:
+        ds = dataset | dataset
+        assert isinstance(ds, UnionDataset)
+
+    def test_no_data(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError, match="Dataset not found"):
+            EDDMapS(str(tmp_path))
+
+    @pytest.fixture
+    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
+        import_orig = builtins.__import__
+
+        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
+            if name == "pandas":
+                raise ImportError()
+            return import_orig(name, *args, **kwargs)
+
+        monkeypatch.setattr(builtins, "__import__", mocked_import)
+
+    def test_mock_missing_module(
+        self, dataset: EDDMapS, mock_missing_module: None
+    ) -> None:
+        with pytest.raises(
+            ImportError,
+            match="pandas is not installed and is required to use this dataset",
+        ):
+            EDDMapS(dataset.root)
+
+    def test_invalid_query(self, dataset: EDDMapS) -> None:
+        query = BoundingBox(0, 0, 0, 0, 0, 0)
+        with pytest.raises(
+            IndexError, match="query: .* not found in index with bounds:"
+        ):
+            dataset[query]
--- a/torchgeo/datasets/init.py
+++ b/torchgeo/datasets/init.py
@ -28,6 +28,7 @@ from .cowc import COWC, COWCCounting, COWCDetection
 from .cv4a_kenya_crop_type import CV4AKenyaCropType
 from .cyclone import TropicalCycloneWindEstimation
 from .dfc2022 import DFC2022
+from .eddmaps import EDDMapS
 from .enviroatlas import EnviroAtlas
 from .esri2020 import Esri2020
 from .etci2021 import ETCI2021
@ -118,6 +119,7 @@ __all__ = (
    "ChesapeakeWV",
    "ChesapeakeCVPR",
    "CMSGlobalMangroveCanopy",
+    "EDDMapS",
    "Esri2020",
    "EUDEM",
    "GBIF",
--- a/torchgeo/datasets/eddmaps.py
+++ b/torchgeo/datasets/eddmaps.py
@ -0,0 +1,116 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Dataset for EDDMapS."""
+
+import os
+import sys
+from typing import Any, Dict
+
+import numpy as np
+from rasterio.crs import CRS
+
+from .geo import GeoDataset
+from .utils import BoundingBox, disambiguate_timestamp
+
+
+class EDDMapS(GeoDataset):
+    """Dataset for EDDMapS.
+
+    `EDDMapS <https://www.eddmaps.org/>`_, Early Detection and Distribution Mapping
+    System, is a web-based mapping system for documenting invasive species and pest
+    distribution. Launched in 2005 by the Center for Invasive Species and Ecosystem
+    Health at the University of Georgia, it was originally designed as a tool for
+    state Exotic Pest Plant Councils to develop more complete distribution data of
+    invasive species. Since then, the program has expanded to include the entire US
+    and Canada as well as to document certain native pest species.
+
+    EDDMapS query results can be downloaded in CSV, KML, or Shapefile format. This
+    dataset currently only supports CSV files.
+
+    If you use an EDDMapS dataset in your research, please cite it like so:
+
+    * EDDMapS. *YEAR*. Early Detection & Distribution Mapping System. The University of
+      Georgia - Center for Invasive Species and Ecosystem Health. Available online at
+      http://www.eddmaps.org/; last accessed *DATE*.
+
+    .. note::
+       This dataset requires the following additional library to be installed:
+
+       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files
+
+    .. versionadded:: 0.3
+    """
+
+    res = 0
+    _crs = CRS.from_epsg(4326)  # Lat/Lon
+
+    def __init__(self, root: str = "data") -> None:
+        """Initialize a new Dataset instance.
+
+        Args:
+            root: root directory where dataset can be found
+
+        Raises:
+            FileNotFoundError: if no files are found in ``root``
+            ImportError: if pandas is not installed
+        """
+        super().__init__()
+
+        self.root = root
+
+        filepath = os.path.join(root, "mappings.csv")
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")
+
+        try:
+            import pandas as pd  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "pandas is not installed and is required to use this dataset"
+            )
+
+        # Read CSV file
+        data = pd.read_csv(
+            filepath, engine="c", usecols=["ObsDate", "Latitude", "Longitude"]
+        )
+
+        # Convert from pandas DataFrame to rtree Index
+        i = 0
+        for date, y, x in data.itertuples(index=False, name=None):
+            # Skip rows without lat/lon
+            if np.isnan(y) or np.isnan(x):
+                continue
+
+            if not pd.isna(date):
+                mint, maxt = disambiguate_timestamp(date, "%m-%d-%y")
+            else:
+                mint, maxt = 0, sys.maxsize
+
+            coords = (x, x, y, y, mint, maxt)
+            self.index.insert(i, coords)
+            i += 1
+
+    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
+        """Retrieve metadata indexed by query.
+
+        Args:
+            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index
+
+        Returns:
+            sample of metadata at that index
+
+        Raises:
+            IndexError: if query is not found in the index
+        """
+        hits = self.index.intersection(tuple(query), objects=True)
+        bboxes = [hit.bbox for hit in hits]
+
+        if not bboxes:
+            raise IndexError(
+                f"query: {query} not found in index with bounds: {self.bounds}"
+            )
+
+        sample = {"crs": self.crs, "bbox": bboxes}
+
+        return sample