зеркало из https://github.com/microsoft/torchgeo.git
Add iNaturalist dataset (#532)
This commit is contained in:
Родитель
1f2006e62c
Коммит
369b36122a
|
@ -77,6 +77,11 @@ GlobBiomass
|
|||
|
||||
.. autoclass:: GlobBiomass
|
||||
|
||||
iNaturalist
|
||||
^^^^^^^^^^^
|
||||
|
||||
.. autoclass:: INaturalist
|
||||
|
||||
Landsat
|
||||
^^^^^^^
|
||||
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pandas as pd
|
||||
|
||||
filename = "observations-012345.csv"

# iNaturalist lets the user choose which columns to export; the columns below
# are the defaults. A real export may therefore lack some of them.
size = 4  # number of fake observations (rows)


def repeat(value):
    """Return a column that holds ``value`` in each of the ``size`` rows."""
    return [value] * size


timestamp = "2022-05-07 11:02:53 +0100"
image_url = "https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg"

# Insertion order of this dict is the column order of the written CSV.
columns = {
    "id": repeat(""),
    "observed_on_string": repeat(""),
    # Rows differ in which timestamp fields are populated ...
    "observed_on": ["", "", "2022-05-07", "2022-05-07"],
    "time_observed_at": ["", "", "", "2022-05-07 11:02:53 +0100"],
    "time_zone": repeat("Central Time (US & Canada)"),
    "user_id": repeat(123),
    "user_login": repeat("darwin"),
    "created_at": repeat(timestamp),
    "updated_at": repeat(timestamp),
    "quality_grade": repeat("research"),
    "license": repeat("CCO"),
    "url": repeat("https://inaturalist.org/observations/123"),
    "image_url": repeat(image_url),
    "sound_url": repeat("https://static.inaturalist.org/sounds/123.m4a?123"),
    "tag_list": repeat("Chicago"),
    "description": repeat(""),
    "num_identification_agreements": repeat(1),
    "num_identification_disagreements": repeat(0),
    "captive_cultivated": repeat("false"),
    "oauth_application_id": repeat(""),
    "place_guess": repeat("Chicago"),
    "latitude": repeat(41.881832),
    # ... and the first row has no longitude at all.
    "longitude": [""] + [-87.623177] * (size - 1),
    "positional_accuracy": repeat(5),
    "private_place_guess": repeat(""),
    "private_latitude": repeat(""),
    "private_longitude": repeat(""),
    "public_positional_accuracy": repeat(5),
    "geoprivacy": repeat(""),
    "taxon_geoprivacy": repeat(""),
    "coordinates_obscured": repeat("false"),
    "positioning_method": repeat("gps"),
    "positioning_device": repeat("gps"),
    "species_guess": repeat("Homo sapiens"),
    "scientific_name": repeat("Homo sapiens"),
    "common_name": repeat("human"),
    "iconic_taxon_name": repeat("Animalia"),
    "taxon_id": repeat(123),
}

pd.DataFrame(columns).to_csv(filename, index=False)
|
|
@ -0,0 +1,5 @@
|
|||
id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,created_at,updated_at,quality_grade,license,url,image_url,sound_url,tag_list,description,num_identification_agreements,num_identification_disagreements,captive_cultivated,oauth_application_id,place_guess,latitude,longitude,positional_accuracy,private_place_guess,private_latitude,private_longitude,public_positional_accuracy,geoprivacy,taxon_geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id
|
||||
,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
|
||||
,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
|
||||
,,2022-05-07,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
|
||||
,,2022-05-07,2022-05-07 11:02:53 +0100,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
|
|
|
@ -0,0 +1,72 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import builtins
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
|
||||
from torchgeo.datasets import (
|
||||
BoundingBox,
|
||||
INaturalist,
|
||||
IntersectionDataset,
|
||||
UnionDataset,
|
||||
)
|
||||
|
||||
# Skip every test in this module when pandas (>= 0.23.2) cannot be imported.
pytest.importorskip("pandas", minversion="0.23.2")
|
||||
|
||||
|
||||
class TestINaturalist:
    """Tests for the :class:`INaturalist` dataset."""

    @pytest.fixture(scope="class")
    def dataset(self) -> INaturalist:
        """Build the dataset once per class from the checked-in test data."""
        return INaturalist(os.path.join("tests", "data", "inaturalist"))

    def test_getitem(self, dataset: INaturalist) -> None:
        """Querying with the dataset's own bounds yields a dict sample."""
        sample = dataset[dataset.bounds]
        assert isinstance(sample, dict)

    def test_len(self, dataset: INaturalist) -> None:
        """The dataset reports three entries for the test data."""
        assert len(dataset) == 3

    def test_and(self, dataset: INaturalist) -> None:
        """``&`` of two datasets produces an IntersectionDataset."""
        assert isinstance(dataset & dataset, IntersectionDataset)

    def test_or(self, dataset: INaturalist) -> None:
        """``|`` of two datasets produces a UnionDataset."""
        assert isinstance(dataset | dataset, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        """An empty root directory raises FileNotFoundError."""
        empty_root = str(tmp_path)
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            INaturalist(empty_root)

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Make every ``import pandas`` fail for the duration of a test."""
        real_import = builtins.__import__

        def fake_import(name: str, *args: Any, **kwargs: Any) -> Any:
            # Delegate everything except pandas to the genuine importer.
            if name != "pandas":
                return real_import(name, *args, **kwargs)
            raise ImportError()

        monkeypatch.setattr(builtins, "__import__", fake_import)

    def test_mock_missing_module(
        self, dataset: INaturalist, mock_missing_module: None
    ) -> None:
        """Constructing the dataset without pandas raises a helpful error."""
        expected = "pandas is not installed and is required to use this dataset"
        with pytest.raises(ImportError, match=expected):
            INaturalist(dataset.root)

    def test_invalid_query(self, dataset: INaturalist) -> None:
        """A query that hits nothing in the index raises IndexError."""
        bad_query = BoundingBox(0, 0, 0, 0, 0, 0)
        expected = "query: .* not found in index with bounds:"
        with pytest.raises(IndexError, match=expected):
            dataset[bad_query]
|
|
@ -48,6 +48,7 @@ from .geo import (
|
|||
from .gid15 import GID15
|
||||
from .globbiomass import GlobBiomass
|
||||
from .idtrees import IDTReeS
|
||||
from .inaturalist import INaturalist
|
||||
from .inria import InriaAerialImageLabeling
|
||||
from .landcoverai import LandCoverAI
|
||||
from .landsat import (
|
||||
|
@ -121,6 +122,7 @@ __all__ = (
|
|||
"EUDEM",
|
||||
"GBIF",
|
||||
"GlobBiomass",
|
||||
"INaturalist",
|
||||
"Landsat",
|
||||
"Landsat1",
|
||||
"Landsat2",
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Dataset for iNaturalist."""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
from typing import Any, Dict
|
||||
|
||||
from rasterio.crs import CRS
|
||||
|
||||
from .geo import GeoDataset
|
||||
from .utils import BoundingBox, disambiguate_timestamp
|
||||
|
||||
|
||||
class INaturalist(GeoDataset):
    """Dataset for iNaturalist.

    `iNaturalist <https://www.inaturalist.org/>`_ is a joint initiative of the
    California Academy of Sciences and the National Geographic Society. It allows
    citizen scientists to upload observations of organisms that can be downloaded by
    scientists and researchers.

    If you use an iNaturalist dataset in your research, please cite it according to:

    * https://www.inaturalist.org/pages/help#cite

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Only one CSV file is read: the first one, in sorted order, found directly
        inside ``root``. Rows without a latitude or longitude are skipped.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        # Without ``recursive=True`` the "**" pattern matches like "*", so only
        # CSV files directly inside ``root`` are found. glob returns matches in
        # arbitrary order, so sort to make the selected file deterministic.
        files = sorted(glob.glob(os.path.join(root, "**.csv")))
        if not files:
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd
        except ImportError as err:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            ) from err

        # Read CSV file
        columns = ["observed_on", "time_observed_at", "latitude", "longitude"]
        data = pd.read_csv(files[0], engine="c", usecols=columns)

        # ``usecols`` ignores the order of the list and keeps the file's column
        # order. The loop below unpacks rows positionally, so reorder explicitly
        # instead of relying on the layout of the exported CSV.
        data = data[columns]

        # Dataset contains many possible timestamps:
        #
        # * observed_on_string: no consistent format (can't use)
        # * observed_on: day precision (better)
        # * time_observed_at: second precision (best)
        # * created_at: when observation was submitted (shouldn't use)
        # * updated_at: when submission was updated (shouldn't use)
        #
        # The created_at/updated_at timestamps can be years after the actual
        # observation, so they shouldn't be used, even if
        # observed_on/time_observed_at are missing.

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for date, time, y, x in data.itertuples(index=False, name=None):
            # Skip rows without lat/lon
            if pd.isna(y) or pd.isna(x):
                continue

            # Prefer the most precise timestamp that is available
            if not pd.isna(time):
                mint, maxt = disambiguate_timestamp(time, "%Y-%m-%d %H:%M:%S %z")
            elif not pd.isna(date):
                mint, maxt = disambiguate_timestamp(date, "%Y-%m-%d")
            else:
                # No usable timestamp: use the widest possible time range
                mint, maxt = 0, sys.maxsize

            # Each observation is a single point, so min and max coincide on
            # both spatial axes.
            coords = (x, x, y, y, mint, maxt)
            self.index.insert(i, coords)
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index: the dataset ``crs`` and, under
            ``bbox``, the bounding boxes of all index entries that intersect
            ``query``

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        bboxes = [hit.bbox for hit in hits]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        sample = {"crs": self.crs, "bbox": bboxes}

        return sample
|
Загрузка…
Ссылка в новой задаче