Mirror of https://github.com/microsoft/torchgeo.git

Add EDDMapS dataset (#533)

* Add EDDMapS dataset
* Mypy hack
* Test fix

Parent: 369b36122a
Commit: 827985ad0a
docs/api/datasets.rst
@@ -52,6 +52,11 @@ Cropland Data Layer (CDL)
.. autoclass:: CDL

EDDMapS
^^^^^^^

.. autoclass:: EDDMapS

EnviroAtlas
^^^^^^^^^^^
tests/data/eddmaps/data.py
@@ -0,0 +1,97 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pandas as pd

filename = "mappings.csv"

size = 3
data = {
    "gbifID": [""] * size,
    "decimalLatitude": [41.881832] * size,
    "decimalLongitude": [""] + [-87.623177] * (size - 1),
    "objectid": [""] * size,
    "reporter": [""] * size,
    "RecOwner": [""] * size,
    "SciName": ["Homo sapiens"] * size,
    "ComName": ["human"] * size,
    "Nativity": ["Native"] * size,
    "OccStatus": ["Detected"] * size,
    "Status": ["Positive"] * size,
    "ObsDate": ["", "", "05-07-22"],
    "DateEnt": ["05-07-22"] * size,
    "DateUp": ["05-07-22"] * size,
    "Location": ["Chicago, Illinois, United States"] * size,
    "Latitude": [41.881832] * size,
    "Longitude": [""] + [-87.623177] * (size - 1),
    "Datum": ["WGS84"] * size,
    "Method": [""] * size,
    "CoordAcc": [""] * size,
    "DataType": [""] * size,
    "Centroid": [""] * size,
    "Abundance": [""] * size,
    "InfestAcre": [""] * size,
    "GrossAcre": [""] * size,
    "Percentcov": [""] * size,
    "Density": [""] * size,
    "Quantity": [""] * size,
    "QuantityU": [""] * size,
    "APPXQuant": [""] * size,
    "NumCollect": [""] * size,
    "Smallest": [""] * size,
    "Largest": [""] * size,
    "Incidence": [""] * size,
    "Severity": [""] * size,
    "Host": [""] * size,
    "Host_Name": [""] * size,
    "HostPheno": [""] * size,
    "HostDamage": [""] * size,
    "ManageStat": ["Unknown"] * size,
    "PopStat": [""] * size,
    "Habitat": [""] * size,
    "LocalOwner": [""] * size,
    "Site": [""] * size,
    "RecBasis": [""] * size,
    "Museum": [""] * size,
    "MuseumRec": [""] * size,
    "Voucher": [""] * size,
    "ObsIDer": [""] * size,
    "CollectTme": [""] * size,
    "UUID": [""] * size,
    "OrgSrcID": [""] * size,
    "OrigName": ["Homo sapiens"] * size,
    "RecSrcTyp": ["Bulk Data"] * size,
    "Surveyor": [""] * size,
    "DateAcc": [""] * size,
    "VisitType": [""] * size,
    "DataMthd": [""] * size,
    "TrapType": [""] * size,
    "NumTraps": [""] * size,
    "TargetName": [""] * size,
    "TargetCnt": [""] * size,
    "TargetRnge": [""] * size,
    "Phenology": [""] * size,
    "LifeStatus": [""] * size,
    "Sex": [""] * size,
    "PID": [""] * size,
    "WaterName": [""] * size,
    "WaterType": [""] * size,
    "Substrate": [""] * size,
    "TreatArea": [""] * size,
    "PlantTreat": [""] * size,
    "TreatComm": [""] * size,
    "Reference": [""] * size,
    "Locality": [""] * size,
    "Comments": [""] * size,
    "ReviewDate": ["05-07-22"] * size,
    "Reviewer": ["Charles Darwin"] * size,
    "VerifyMthd": ["Bulk Verified"] * size,
    "Verified": ["Verified"] * size,
    "IDCred": ["Credible"] * size,
    "ReviewComm": [""] * size,
}

df = pd.DataFrame(data)
df.to_csv(filename, index=False)
tests/data/eddmaps/mappings.csv
@@ -0,0 +1,4 @@
gbifID,decimalLatitude,decimalLongitude,objectid,reporter,RecOwner,SciName,ComName,Nativity,OccStatus,Status,ObsDate,DateEnt,DateUp,Location,Latitude,Longitude,Datum,Method,CoordAcc,DataType,Centroid,Abundance,InfestAcre,GrossAcre,Percentcov,Density,Quantity,QuantityU,APPXQuant,NumCollect,Smallest,Largest,Incidence,Severity,Host,Host_Name,HostPheno,HostDamage,ManageStat,PopStat,Habitat,LocalOwner,Site,RecBasis,Museum,MuseumRec,Voucher,ObsIDer,CollectTme,UUID,OrgSrcID,OrigName,RecSrcTyp,Surveyor,DateAcc,VisitType,DataMthd,TrapType,NumTraps,TargetName,TargetCnt,TargetRnge,Phenology,LifeStatus,Sex,PID,WaterName,WaterType,Substrate,TreatArea,PlantTreat,TreatComm,Reference,Locality,Comments,ReviewDate,Reviewer,VerifyMthd,Verified,IDCred,ReviewComm
,41.881832,,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,05-07-22,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
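In this fixture the first data row has no Longitude value, so the dataset constructor later in this diff skips it and indexes only two of the three records (only the last row also carries an ObsDate, which exercises the timestamp branch). A short sketch of that filtering, assuming pandas is installed and this file is saved as mappings.csv in the working directory:

import pandas as pd

# Read only the columns the dataset itself uses.
df = pd.read_csv("mappings.csv", usecols=["ObsDate", "Latitude", "Longitude"])

# Rows missing a latitude or longitude are skipped, which is why test_len
# below expects a length of 2.
valid = df.dropna(subset=["Latitude", "Longitude"])
print(len(valid))  # 2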
tests/datasets/test_eddmaps.py
@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import builtins
import os
from pathlib import Path
from typing import Any

import pytest
from _pytest.monkeypatch import MonkeyPatch

from torchgeo.datasets import BoundingBox, EDDMapS, IntersectionDataset, UnionDataset

pytest.importorskip("pandas", minversion="0.23.2")


class TestEDDMapS:
    @pytest.fixture(scope="class")
    def dataset(self) -> EDDMapS:
        root = os.path.join("tests", "data", "eddmaps")
        return EDDMapS(root)

    def test_getitem(self, dataset: EDDMapS) -> None:
        x = dataset[dataset.bounds]
        assert isinstance(x, dict)

    def test_len(self, dataset: EDDMapS) -> None:
        assert len(dataset) == 2

    def test_and(self, dataset: EDDMapS) -> None:
        ds = dataset & dataset
        assert isinstance(ds, IntersectionDataset)

    def test_or(self, dataset: EDDMapS) -> None:
        ds = dataset | dataset
        assert isinstance(ds, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            EDDMapS(str(tmp_path))

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        import_orig = builtins.__import__

        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
            if name == "pandas":
                raise ImportError()
            return import_orig(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", mocked_import)

    def test_mock_missing_module(
        self, dataset: EDDMapS, mock_missing_module: None
    ) -> None:
        with pytest.raises(
            ImportError,
            match="pandas is not installed and is required to use this dataset",
        ):
            EDDMapS(dataset.root)

    def test_invalid_query(self, dataset: EDDMapS) -> None:
        query = BoundingBox(0, 0, 0, 0, 0, 0)
        with pytest.raises(
            IndexError, match="query: .* not found in index with bounds:"
        ):
            dataset[query]
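The mock_missing_module fixture works by wrapping builtins.__import__ so that any attempt to import pandas fails, letting test_mock_missing_module exercise the dataset's friendly ImportError message. A standalone sketch of that interception pattern, using only the standard library and illustrative names:

import builtins

real_import = builtins.__import__

def fake_import(name, *args, **kwargs):
    # Pretend pandas is missing; defer every other import to the real one.
    if name == "pandas":
        raise ImportError(name)
    return real_import(name, *args, **kwargs)

builtins.__import__ = fake_import
try:
    try:
        import pandas  # noqa: F401
    except ImportError:
        print("pandas import was intercepted")
finally:
    # Restore the real import, just as monkeypatch does on fixture teardown.
    builtins.__import__ = real_import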
torchgeo/datasets/__init__.py
@@ -28,6 +28,7 @@ from .cowc import COWC, COWCCounting, COWCDetection
from .cv4a_kenya_crop_type import CV4AKenyaCropType
from .cyclone import TropicalCycloneWindEstimation
from .dfc2022 import DFC2022
from .eddmaps import EDDMapS
from .enviroatlas import EnviroAtlas
from .esri2020 import Esri2020
from .etci2021 import ETCI2021
@@ -118,6 +119,7 @@ __all__ = (
    "ChesapeakeWV",
    "ChesapeakeCVPR",
    "CMSGlobalMangroveCanopy",
    "EDDMapS",
    "Esri2020",
    "EUDEM",
    "GBIF",
torchgeo/datasets/eddmaps.py
@@ -0,0 +1,116 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Dataset for EDDMapS."""

import os
import sys
from typing import Any, Dict

import numpy as np
from rasterio.crs import CRS

from .geo import GeoDataset
from .utils import BoundingBox, disambiguate_timestamp


class EDDMapS(GeoDataset):
    """Dataset for EDDMapS.

    `EDDMapS <https://www.eddmaps.org/>`_, Early Detection and Distribution Mapping
    System, is a web-based mapping system for documenting invasive species and pest
    distribution. Launched in 2005 by the Center for Invasive Species and Ecosystem
    Health at the University of Georgia, it was originally designed as a tool for
    state Exotic Pest Plant Councils to develop more complete distribution data of
    invasive species. Since then, the program has expanded to include the entire US
    and Canada as well as to document certain native pest species.

    EDDMapS query results can be downloaded in CSV, KML, or Shapefile format. This
    dataset currently only supports CSV files.

    If you use an EDDMapS dataset in your research, please cite it like so:

    * EDDMapS. *YEAR*. Early Detection & Distribution Mapping System. The University of
      Georgia - Center for Invasive Species and Ecosystem Health. Available online at
      http://www.eddmaps.org/; last accessed *DATE*.

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        filepath = os.path.join(root, "mappings.csv")
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd  # noqa: F401
        except ImportError:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            )

        # Read CSV file
        data = pd.read_csv(
            filepath, engine="c", usecols=["ObsDate", "Latitude", "Longitude"]
        )

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for date, y, x in data.itertuples(index=False, name=None):
            # Skip rows without lat/lon
            if np.isnan(y) or np.isnan(x):
                continue

            if not pd.isna(date):
                mint, maxt = disambiguate_timestamp(date, "%m-%d-%y")
            else:
                mint, maxt = 0, sys.maxsize

            coords = (x, x, y, y, mint, maxt)
            self.index.insert(i, coords)
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        bboxes = [hit.bbox for hit in hits]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        sample = {"crs": self.crs, "bbox": bboxes}

        return sample
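A minimal usage sketch for the new dataset, assuming torchgeo with this module is importable and that root points at a directory containing an EDDMapS CSV export saved as mappings.csv (the test fixture above works):

import os

from torchgeo.datasets import EDDMapS

# Any directory containing an EDDMapS-format "mappings.csv"; the test
# fixture shipped with this change is used here as an example.
root = os.path.join("tests", "data", "eddmaps")
ds = EDDMapS(root)

print(len(ds))    # number of records with usable coordinates
print(ds.bounds)  # BoundingBox covering every indexed record

# Indexing with a BoundingBox returns the CRS plus the bounding boxes of all
# matching records; the dataset's own bounds match everything.
sample = ds[ds.bounds]
print(sample["crs"], len(sample["bbox"]))

Like other GeoDataset subclasses, EDDMapS can be combined with & or | to build IntersectionDataset and UnionDataset objects, as the tests above exercise.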