* populate index attempt

* added tests

* correct plot method

* fix test

* fix documentation

* fix docs

* name changes

* lazy import pandas and Any instead of Tensor

* requested changes

* mypy fixes

* Close plot filehandles

Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com>
Nils Lehmann 2022-02-27 21:33:39 +01:00 committed by GitHub
Parent 4c221dfc49
Commit 06ec364b5f
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
7 changed files: 739 additions and 0 deletions

View file

@@ -92,6 +92,11 @@ National Agriculture Imagery Program (NAIP)
.. autoclass:: NAIP
Open Buildings
^^^^^^^^^^^^^^
.. autoclass:: OpenBuildings
Sentinel
^^^^^^^^

Binary data
tests/data/openbuildings/000_buildings.csv.gz Normal file

Binary file not shown.

View file

@@ -0,0 +1,105 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import csv
import gzip
import hashlib
import json
import os
import random
import shutil
import numpy as np
from shapely.geometry import Polygon
SIZE = 0.05
np.random.seed(0)
random.seed(0)
def create_meta_data_file(zipfilename):
meta_data = {
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"geometry": {
"type": "Polygon",
"coordinates": [
[[0.0, 0.0], [0.0, SIZE], [SIZE, SIZE], [SIZE, 0.0], [0.0, 0.0]]
],
},
"properties": {
"tile_id": "025",
"tile_url": "polygons_s2_level_4_gzip/{}".format(zipfilename),
"size_mb": 0.2,
},
}
],
}
return meta_data
def create_csv_data_row(lat, long):
width, height = SIZE / 10, SIZE / 10
minx = long - 0.5 * width
maxx = long + 0.5 * width
miny = lat - 0.5 * height
maxy = lat + 0.5 * height
coordinates = [(minx, miny), (minx, maxy), (maxx, maxy), (maxx, miny), (minx, miny)]
polygon = Polygon(coordinates)
data_row = {
"latitude": lat,
"longitude": long,
"area_in_meters": 1.0,
"confidence": 1.0,
"geometry": polygon.wkt,
"full_plus_code": "ABC",
}
return data_row
def create_buildings_data():
fourth = SIZE / 4
# two example building rows inside the metadata tile
dict_data = [
create_csv_data_row(fourth, fourth),
create_csv_data_row(SIZE - fourth, SIZE - fourth),
]
return dict_data
if __name__ == "__main__":
csvname = "000_buildings.csv"
zipfilename = csvname + ".gz"
# create and save metadata
meta_data = create_meta_data_file(zipfilename)
with open("tiles.geojson", "w") as fp:
json.dump(meta_data, fp)
# create and archive buildings data
buildings_data = create_buildings_data()
keys = buildings_data[0].keys()
with open(csvname, "w") as f:
w = csv.DictWriter(f, keys)
w.writeheader()
w.writerows(buildings_data)
# compress the csv file with gzip
with open(csvname, "rb") as f_in:
with gzip.open(zipfilename, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
# Compute checksums
with open(zipfilename, "rb") as f:
md5 = hashlib.md5(f.read()).hexdigest()
print(f"{zipfilename}: {md5}")
# remove csv file
os.remove(csvname)
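
For reference, the MD5 that this script prints is the value the test fixture further down monkeypatches into OpenBuildings.md5s. A minimal sketch of re-checking the generated archive the same way the dataset's _verify method does (check_integrity comes from torchgeo.datasets.utils, which the dataset module below imports; the hash is the one used in the test fixture):

from torchgeo.datasets.utils import check_integrity

# Mirrors OpenBuildings._verify with checksum=True: compare the generated
# archive against the MD5 printed by the script above.
assert check_integrity("000_buildings.csv.gz", "20aeeec9d45a0ce4d772a26e0bcbc25f")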

View file

@@ -0,0 +1 @@
{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [[[0.0, 0.0], [0.0, 0.05], [0.05, 0.05], [0.05, 0.0], [0.0, 0.0]]]}, "properties": {"tile_id": "025", "tile_url": "polygons_s2_level_4_gzip/000_buildings.csv.gz", "size_mb": 0.2}}]}

View file

@@ -0,0 +1,156 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import builtins
import json
import os
import shutil
from pathlib import Path
from typing import Any, Generator
import matplotlib.pyplot as plt
import pandas as pd
import pytest
import torch
import torch.nn as nn
from _pytest.fixtures import SubRequest
from _pytest.monkeypatch import MonkeyPatch
from rasterio.crs import CRS
from torchgeo.datasets import (
BoundingBox,
IntersectionDataset,
OpenBuildings,
UnionDataset,
)
pytest.importorskip("pandas", minversion="0.19.1")
class TestOpenBuildings:
@pytest.fixture
def dataset(
self, monkeypatch: Generator[MonkeyPatch, None, None], tmp_path: Path
) -> OpenBuildings:
root = str(tmp_path)
shutil.copy(
os.path.join("tests", "data", "openbuildings", "tiles.geojson"), root
)
shutil.copy(
os.path.join("tests", "data", "openbuildings", "000_buildings.csv.gz"), root
)
md5s = {"000_buildings.csv.gz": "20aeeec9d45a0ce4d772a26e0bcbc25f"}
monkeypatch.setattr(OpenBuildings, "md5s", md5s) # type: ignore[attr-defined]
transforms = nn.Identity() # type: ignore[attr-defined]
return OpenBuildings(root=root, transforms=transforms)
@pytest.fixture(params=["pandas"])
def mock_missing_module(
self, monkeypatch: Generator[MonkeyPatch, None, None], request: SubRequest
) -> str:
import_orig = builtins.__import__
package = str(request.param)
def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
if name == package:
raise ImportError()
return import_orig(name, *args, **kwargs)
monkeypatch.setattr( # type: ignore[attr-defined]
builtins, "__import__", mocked_import
)
return package
def test_mock_missing_module(
self, dataset: OpenBuildings, mock_missing_module: str
) -> None:
package = mock_missing_module
with pytest.raises(
ImportError,
match=f"{package} is not installed and is required to use this dataset",
):
OpenBuildings(root=dataset.root)
def test_no_shapes_to_rasterize(
self, dataset: OpenBuildings, tmp_path: Path
) -> None:
# empty csv buildings file
path = os.path.join(tmp_path, "000_buildings.csv.gz")
df = pd.read_csv(path)
df = pd.DataFrame(columns=df.columns)
df.to_csv(path, compression="gzip")
x = dataset[dataset.bounds]
assert isinstance(x, dict)
assert isinstance(x["crs"], CRS)
assert isinstance(x["mask"], torch.Tensor)
def test_no_building_data_found(self, tmp_path: Path) -> None:
false_root = os.path.join(tmp_path, "empty")
os.makedirs(false_root)
shutil.copy(
os.path.join("tests", "data", "openbuildings", "tiles.geojson"), false_root
)
with pytest.raises(
RuntimeError, match="have manually downloaded the dataset as suggested "
):
OpenBuildings(root=false_root)
def test_corrupted(self, dataset: OpenBuildings, tmp_path: Path) -> None:
with open(os.path.join(tmp_path, "000_buildings.csv.gz"), "w") as f:
f.write("bad")
with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
OpenBuildings(dataset.root, checksum=True)
def test_no_meta_data_found(self, tmp_path: Path) -> None:
false_root = os.path.join(tmp_path, "empty")
os.makedirs(false_root)
with pytest.raises(FileNotFoundError, match="Meta data file"):
OpenBuildings(root=false_root)
def test_nothing_in_index(self, dataset: OpenBuildings, tmp_path: Path) -> None:
# change metadata to another 'tile_url' so that there is no match found
with open(os.path.join(tmp_path, "tiles.geojson"), "r") as f:
content = json.load(f)
content["features"][0]["properties"]["tile_url"] = "mismatch.csv.gz"
with open(os.path.join(tmp_path, "tiles.geojson"), "w") as f:
json.dump(content, f)
with pytest.raises(FileNotFoundError, match="data was found in"):
OpenBuildings(dataset.root)
def test_getitem(self, dataset: OpenBuildings) -> None:
x = dataset[dataset.bounds]
assert isinstance(x, dict)
assert isinstance(x["crs"], CRS)
assert isinstance(x["mask"], torch.Tensor)
def test_and(self, dataset: OpenBuildings) -> None:
ds = dataset & dataset
assert isinstance(ds, IntersectionDataset)
def test_or(self, dataset: OpenBuildings) -> None:
ds = dataset | dataset
assert isinstance(ds, UnionDataset)
def test_invalid_query(self, dataset: OpenBuildings) -> None:
query = BoundingBox(100, 100, 100, 100, 0, 0)
with pytest.raises(
IndexError, match="query: .* not found in index with bounds:"
):
dataset[query]
def test_plot(self, dataset: OpenBuildings) -> None:
x = dataset[dataset.bounds]
dataset.plot(x, suptitle="test")
plt.close()
def test_plot_prediction(self, dataset: OpenBuildings) -> None:
x = dataset[dataset.bounds]
x["prediction"] = x["mask"].clone()
dataset.plot(x, suptitle="Prediction")
plt.close()

View file

@@ -66,6 +66,7 @@ from .loveda import LoveDA
from .naip import NAIP
from .nasa_marine_debris import NASAMarineDebris
from .nwpu import VHR10
from .openbuildings import OpenBuildings
from .oscd import OSCD
from .patternnet import PatternNet
from .potsdam import Potsdam2D
@@ -121,6 +122,7 @@ __all__ = (
"Landsat8",
"Landsat9",
"NAIP",
"OpenBuildings",
"Sentinel",
"Sentinel2",
# VisionDataset

View file

@@ -0,0 +1,470 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Open Buildings datasets."""
import glob
import json
import os
import sys
from typing import Any, Callable, Dict, List, Optional
import fiona
import fiona.transform
import matplotlib.pyplot as plt
import rasterio
import shapely
import shapely.wkt as wkt
import torch
from rasterio.crs import CRS
from rtree.index import Index, Property
from .geo import VectorDataset
from .utils import BoundingBox, check_integrity
class OpenBuildings(VectorDataset):
r"""Open Buildings dataset.
The `Open Buildings
<https://sites.research.google/open-buildings/#download>`_ dataset
consists of computer-generated building detections across the African continent.
Dataset features:
* 516M building detections as polygons with centroid lat/long
* covering an area of 19.4M km\ :sup:`2`\ (64% of the African continent)
* confidence score and
`Plus Code <https://maps.google.com/pluscodes/>`_
Dataset format:
* csv files containing building detections compressed as csv.gz
* metadata geojson file
The data can be downloaded from `here
<https://sites.research.google/open-buildings/#download>`__. Additionally, the
`metadata geometry file
<https://sites.research.google/open-buildings/tiles.geojson>`_ also needs to be
placed in `root` as `tiles.geojson`.
If you use this dataset in your research, please cite the following technical
report:
* https://arxiv.org/abs/2107.12283
.. versionadded:: 0.3
"""
md5s = {
"025_buildings.csv.gz": "41db2572bfd08628d01475a2ee1a2f17",
"04f_buildings.csv.gz": "3232c1c6d45c1543260b77e5689fc8b1",
"05b_buildings.csv.gz": "4fc57c63bbbf9a21a3902da7adc3a670",
"093_buildings.csv.gz": "00fce146dadf0b30255e750c4c5ac2de",
"095_buildings.csv.gz": "f5765b0936f7ccbd0b4abed60d994f08",
"0c3_buildings.csv.gz": "013b130fe872387e0cff842399b423de",
"0c3_buildings.csv": "a697ad2433e9a9f6001de25b4664651a",
"0c5_buildings.csv.gz": "16ca283e9344e9da8b47acaf03c1c6e4",
"0c7_buildings.csv.gz": "b3774930006497a80c8a2fbf33056610",
"0d1_buildings.csv.gz": "41e652218ca5964d297d9cd1d84b831c",
"0d7_buildings.csv.gz": "d365fe47d10b0756dd54ceca24598d8e",
"0d9_buildings.csv.gz": "3ebd47fa4f86857266e9a7346d6aa163",
"0db_buildings.csv.gz": "368213e9caa7ee229ef9403b0ca8c80d",
"0dd_buildings.csv.gz": "8f5fcefff262fdfd82800092d2e9d841",
"0df_buildings.csv.gz": "cbb5f63b10daa25568bdde8d9f66f8a4",
"0e1_buildings.csv.gz": "a9b9bf1e541b62c8a34d2f6f2ae71e1c",
"0e3_buildings.csv.gz": "3d9c2ffc11c02aec2bd008699f9c4bd1",
"0e5_buildings.csv.gz": "1e1b2bf63dfc520e62e4b68db23fe64c",
"0e7_buildings.csv.gz": "c96797588c90e66268367cb56b4b9af8",
"0e9_buildings.csv.gz": "c53bb7bbc8140034d1be2c49ff49af68",
"0eb_buildings.csv.gz": "407c771f614a15d69d78f1e25decf694",
"0ed_buildings.csv.gz": "bddd10992d291677019d7106ce1f4fac",
"0ef_buildings.csv.gz": "d1b91936e7ac06c661878ef9eb5dba7b",
"0f1_buildings.csv.gz": "9d86eb10d2d8766e1385b6c52c11d5e2",
"0f9_buildings.csv.gz": "1c6775131214b26f4a27b4c42d6e9fca",
"0fb_buildings.csv.gz": "d39528cb4e0cbff589ca89dc86d9b5db",
"0fd_buildings.csv.gz": "304fe4a60e950c900697d975098f7536",
"0ff_buildings.csv.gz": "266ca7ed1ad0251b3999b0e2e9b54648",
"103_buildings.csv.gz": "8d3cafab5f1e02b2a0a6180eb34d1cac",
"105_buildings.csv.gz": "dd61cc74239aa9a1b30f10859122807b",
"107_buildings.csv.gz": "823c05984f859a1bf17af8ce78bf2892",
"109_buildings.csv.gz": "cfdee0e807168cd1c183d9c01535369b",
"10b_buildings.csv.gz": "d8ecaf406abd864b641ba34985f3042e",
"10d_buildings.csv.gz": "af584a542a17942ff7e94653322dba87",
"10f_buildings.csv.gz": "3d5369e15c4d1f59fb38cf61f4e6290b",
"111_buildings.csv.gz": "47504e43d1b67101bed5d924225328dc",
"113_buildings.csv.gz": "3f991c831569f91f34eaa8fc3882b2fd",
"117_buildings.csv.gz": "a4145fa6e458480e30c807f80ae5cd65",
"119_buildings.csv.gz": "5661b7ac23f266542c7e0d962a8cae58",
"11b_buildings.csv.gz": "41b6d036610d0bddac069ec72e68710e",
"11d_buildings.csv.gz": "1ef75e9d176dd8d6bfa6012d36b1d25c",
"11f_buildings.csv.gz": "f004873d1ef3933c1716ab6409565b7d",
"121_buildings.csv.gz": "0c7e7a9043ed069fbdefdcfcfc437482",
"123_buildings.csv.gz": "c46bd53b67025c3de11657220cce0aec",
"125_buildings.csv.gz": "33253ae1a82656f4eedca9bd86f981a3",
"127_buildings.csv.gz": "2f827f8fc93485572178e9ad0c65e22d",
"129_buildings.csv.gz": "74f98346990a1d1e41241ce8f4bb201a",
"12f_buildings.csv.gz": "b1b0777296df2bfef512df0945ca3e14",
"131_buildings.csv.gz": "8362825b10c9396ecbb85c49cd210bc6",
"137_buildings.csv.gz": "96da7389df820405b0010db4a6c98c61",
"139_buildings.csv.gz": "c41e26fc6f3565c3d7c66ab977dc8159",
"13b_buildings.csv.gz": "981d4ccb0f41a103bdad8ef949eb4ffe",
"13d_buildings.csv.gz": "d15585d06ee74b0095842dd887197035",
"141_buildings.csv.gz": "ae0bf17778d45119c74e50e06a04020d",
"143_buildings.csv.gz": "9699809e57eb097dfaf9d484f1d9c5fa",
"145_buildings.csv.gz": "81e74e0165ea358278ce18507dddfdb0",
"147_buildings.csv.gz": "39edad15fa16c432f5d460f0a8166032",
"149_buildings.csv.gz": "94bf8f8fa221744fb1d57c7d4065e69e",
"14f_buildings.csv.gz": "ca8410be89b5cf868c2a67861712e4ea",
"15b_buildings.csv.gz": "8c0071c0ae20a60e8dd4d7aa6aac5a99",
"15d_buildings.csv.gz": "35f044a323556adda5f31e8fc9307c85",
"161_buildings.csv.gz": "ba08b70a26f07b5e2cd4eafd9d6f826b",
"163_buildings.csv.gz": "2bec83a2504b531cd1cb0311fcb6c952",
"165_buildings.csv.gz": "48f934733dd3054164f9b09abee63312",
"167_buildings.csv.gz": "bba8657024d80d44e475759b65adc969",
"169_buildings.csv.gz": "13e142e48597ee7a8b0b812e226dfa72",
"16b_buildings.csv.gz": "9c62351d6cc8eaf761ab89d4586d26d6",
"16d_buildings.csv.gz": "a33c23da3f603c8c3eacc5e6a47aaf66",
"16f_buildings.csv.gz": "4850dd7c8f0fb628ba5864ea9f47647b",
"171_buildings.csv.gz": "4217f1b025db869c8bed1014704c2a79",
"173_buildings.csv.gz": "5a5f3f07e261a9dc58c6180b69130e4a",
"175_buildings.csv.gz": "5bbf7a7c8f57d28e024ddf8f4039b575",
"177_buildings.csv.gz": "76cd4b17d68d62e1f088f229b65f8acf",
"179_buildings.csv.gz": "a5a1c6609483336ddff91b2385e70eb9",
"17b_buildings.csv.gz": "a47c1145a3b0bcdaba18c153b7b92b87",
"17d_buildings.csv.gz": "3226d0abf396f44c1a436be83538dfd8",
"17f_buildings.csv.gz": "3e18d4fc5837ee89274d30f2126b92b2",
"181_buildings.csv.gz": "c87639d7f6d6a85a3fa6b06910b0e145",
"183_buildings.csv.gz": "e94438ebf19b3b25035954d23a0e90cf",
"185_buildings.csv.gz": "8de8d1d50c16c575f85b96dee474cb56",
"189_buildings.csv.gz": "da94cd495a99496fd687bbb4a1715c90",
"18b_buildings.csv.gz": "9ab353335fe6ff694e834889be2b305d",
"18d_buildings.csv.gz": "e37e0f868ce96f7d14f7bf1a301da1d3",
"18f_buildings.csv.gz": "e9000b9ef9bb0f838088e96becfc95a1",
"191_buildings.csv.gz": "c00bb4d6b2b12615d576c06fe545cbfa",
"193_buildings.csv.gz": "d48d4c03ef053f6987b3e6e9e78a8b03",
"195_buildings.csv.gz": "d93ab833e74480f07a5ccf227067db5a",
"197_buildings.csv.gz": "8667e040f9863e43924aafe6071fabc7",
"199_buildings.csv.gz": "04ba65a4caf16cc1e0d5c4e1322c5885",
"19b_buildings.csv.gz": "e49412e3e1bccceb0bdb4df5201288f4",
"19d_buildings.csv.gz": "92b5fb4e96529d90e99c788e3e8696d4",
"19f_buildings.csv.gz": "c023f6c37d0026b56f530b841517a6cd",
"1a1_buildings.csv.gz": "471483b50c722af104af8a582e780c04",
"1a3_buildings.csv.gz": "0a453053f1ff53f9e165e16c7f97354a",
"1a5_buildings.csv.gz": "1f6a823e223d5f29c66aa728933de684",
"1a7_buildings.csv.gz": "6130b724501fa16e6d84e484c4091f1f",
"1a9_buildings.csv.gz": "73022e8e7b994e76a58cc763a057d542",
"1b9_buildings.csv.gz": "48dea4af9d12b755e75b76c68c47de6b",
"1bb_buildings.csv.gz": "dfb9ee4d3843d81722b70f7582c775a4",
"1bd_buildings.csv.gz": "fdea2898fc50ae25b6196048373d8244",
"1bf_buildings.csv.gz": "96ef27d6128d0bcdfa896fed6f27cdd0",
"1c1_buildings.csv.gz": "32e3667d939e7f95316eb75a6ffdb603",
"1c3_buildings.csv.gz": "ed94b543da1bbe3101ed66f7d7727d24",
"1c5_buildings.csv.gz": "ce527ab33e564f0cc1b63ae467932a18",
"1c7_buildings.csv.gz": "d5fb474466d6a11d3b08e3a011984ada",
"1dd_buildings.csv.gz": "9e7e50e3f95b3f2ceff6351b75ca1e75",
"1e5_buildings.csv.gz": "f95ea85fce47ce7edf5729086d43f922",
"1e7_buildings.csv.gz": "2bca5682c48134e69b738d70dfe7d516",
"1e9_buildings.csv.gz": "f049ad06dbbb200f524b4f50d1df8c2e",
"1eb_buildings.csv.gz": "6822d7f202b453ec3cc03fb8f04691ad",
"1ed_buildings.csv.gz": "9dfc560e2c3d135ebdcd46fa09c47169",
"1ef_buildings.csv.gz": "506e7772c35b09cfd3b6f8691dc2947d",
"1f1_buildings.csv.gz": "b74f2b585cfad3b881fe4f124080440a",
"1f3_buildings.csv.gz": "12896642315320e11ed9ed2d3f0e5995",
"1f5_buildings.csv.gz": "334aea21e532e178bf5c54d028158906",
"1f7_buildings.csv.gz": "0e8c3d2e005eb04c6852a8aa993f5a76",
"217_buildings.csv.gz": "296e9ba121fea752b865a48e5c0fe8a5",
"219_buildings.csv.gz": "1d19b6626d738f7706f75c2935aaaff4",
"21d_buildings.csv.gz": "28bfca1f8668f59db021d3a195994768",
"21f_buildings.csv.gz": "06325c8b0a8f6ed598b7dc6f0bb5adf2",
"221_buildings.csv.gz": "a354ffc1f7226d525c7cf53848975da1",
"223_buildings.csv.gz": "3bda1339d561b3bc749220877f1384d9",
"225_buildings.csv.gz": "8eb02ad77919d9e551138a14d3ad1bbc",
"227_buildings.csv.gz": "c07aceb7c81f83a653810befa0695b61",
"22f_buildings.csv.gz": "97d63e30e008ec4424f6b0641b75377c",
"231_buildings.csv.gz": "f4bc384ed74552ddcfe2e69107b91345",
"233_buildings.csv.gz": "081756e7bdcfdc2aee9114c4cfe62bd8",
"23b_buildings.csv.gz": "75776d3dcbc90cf3a596664747880134",
"23d_buildings.csv.gz": "e5d0b9b7b14601f58cfdb9ea170e9520",
"23f_buildings.csv.gz": "77f38466419b4d391be8e4f05207fdf5",
"3d1_buildings.csv.gz": "6659c97bd765250b0dee4b1b7ff583a9",
"3d5_buildings.csv.gz": "c27d8f6b2808549606f00bc04d8b42bc",
"3d7_buildings.csv.gz": "abdef2e68cc31c67dbb6e60c4c40483e",
"3d9_buildings.csv.gz": "4c06ae37d8e76626345a52a32f989de9",
"3db_buildings.csv.gz": "e83ca0115eaf4ec0a72aaf932b00442a",
"b5b_buildings.csv.gz": "5e5f59cb17b81137d89c4bab8107e837",
}
filename_glob = "*_buildings.csv"
zipfile_glob = "*_buildings.csv.gz"
meta_data_url = "https://sites.research.google/open-buildings/tiles.geojson"
meta_data_filename = "tiles.geojson"
def __init__(
self,
root: str = "data",
crs: Optional[CRS] = None,
res: float = 0.0001,
transforms: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
checksum: bool = False,
) -> None:
"""Initialize a new Dataset instance.
Args:
root: root directory where dataset can be found
crs: :term:`coordinate reference system (CRS)` to warp to
(defaults to the CRS of the first file found)
res: resolution of the dataset in units of CRS
transforms: a function/transform that takes input sample and its target as
entry and returns a transformed version
checksum: if True, check the MD5 of the downloaded files (may be slow)
Raises:
FileNotFoundError: if no files are found in ``root``
"""
self.root = root
self.res = res
self.checksum = checksum
self.transforms = transforms
self._verify()
try:
import pandas as pd # noqa: F401
except ImportError:
raise ImportError(
"pandas is not installed and is required to use this dataset"
)
# Create an R-tree to index the dataset using the tile polygon bounds
self.index = Index(interleaved=False, properties=Property(dimension=3))
with open(os.path.join(root, "tiles.geojson")) as f:
data = json.load(f)
features = data["features"]
features_filenames = [
feature["properties"]["tile_url"].split("/")[-1] for feature in features
] # get csv filename
polygon_files = glob.glob(os.path.join(self.root, self.zipfile_glob))
polygon_filenames = [f.split(os.sep)[-1] for f in polygon_files]
matched_features = [
feature
for filename, feature in zip(features_filenames, features)
if filename in polygon_filenames
]
i = 0
source_crs = CRS.from_dict({"init": "epsg:4326"})
for feature in matched_features:
if crs is None:
crs = CRS.from_dict(source_crs)
c = feature["geometry"]["coordinates"][0]
xs = [x[0] for x in c]
ys = [x[1] for x in c]
minx, miny, maxx, maxy = min(xs), min(ys), max(xs), max(ys)
(minx, maxx), (miny, maxy) = fiona.transform.transform(
source_crs.to_dict(), crs.to_dict(), [minx, maxx], [miny, maxy]
)
mint = 0
maxt = sys.maxsize
coords = (minx, maxx, miny, maxy, mint, maxt)
filepath = os.path.join(
self.root, feature["properties"]["tile_url"].split("/")[-1]
)
self.index.insert(i, coords, filepath)
i += 1
if i == 0:
raise FileNotFoundError(
f"No {self.__class__.__name__} data was found in '{self.root}'"
)
self._crs = crs
self._source_crs = source_crs
def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
"""Retrieve image/mask and metadata indexed by query.
Args:
query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index
Returns:
sample of image/mask and metadata for the given query. If no
matching shapes are found within the query, an empty raster is returned
Raises:
IndexError: if query is not found in the index
"""
hits = self.index.intersection(tuple(query), objects=True)
filepaths = [hit.object for hit in hits]
if not filepaths:
raise IndexError(
f"query: {query} not found in index with bounds: {self.bounds}"
)
shapes = self._filter_geometries(query, filepaths)
# Rasterize geometries
width = (query.maxx - query.minx) / self.res
height = (query.maxy - query.miny) / self.res
transform = rasterio.transform.from_bounds(
query.minx, query.miny, query.maxx, query.maxy, width, height
)
if shapes:
masks = rasterio.features.rasterize(
shapes, out_shape=(int(height), int(width)), transform=transform
)
masks = torch.tensor(masks).unsqueeze(0) # type: ignore[attr-defined]
else:
masks = torch.zeros( # type: ignore[attr-defined]
size=(1, int(height), int(width))
)
sample = {"mask": masks, "crs": self.crs, "bbox": query}
if self.transforms is not None:
sample = self.transforms(sample)
return sample
def _filter_geometries(
self, query: BoundingBox, filepaths: List[str]
) -> List[Dict[str, Any]]:
"""Filters a df read from the polygon csv file based on query and conf thresh.
Args:
query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index
filepaths: filepaths of files that were hits in the rtree index
Returns:
List with all polygons from all hit filepaths
"""
import pandas as pd
# We need to know the bounding box of the query in the source CRS
(minx, maxx), (miny, maxy) = fiona.transform.transform(
self._crs.to_dict(),
self._source_crs.to_dict(),
[query.minx, query.maxx],
[query.miny, query.maxy],
)
df_query = (
"longitude >= {} & longitude <= {} & " "latitude >= {} & latitude <= {}"
).format(minx, maxx, miny, maxy)
shapes = []
for f in filepaths:
csv_chunks = pd.read_csv(f, chunksize=200000, compression="gzip")
for chunk in csv_chunks:
df = chunk.query(df_query)
# Warp geometries to requested CRS
polygon_series = df["geometry"].map(self._wkt_fiona_geom_transform)
shapes.extend(polygon_series.values.tolist())
return shapes
def _wkt_fiona_geom_transform(self, x: str) -> Dict[str, Any]:
"""Function to transform a geometry string into new crs.
Args:
x: Polygon string
Returns:
transformed geometry in geojson format
"""
x = json.dumps(shapely.geometry.mapping(wkt.loads(x)))
x = json.loads(x.replace("'", '"'))
transformed: Dict[str, Any] = fiona.transform.transform_geom(
self._source_crs.to_dict(), self._crs.to_dict(), x
)
return transformed
def _verify(self) -> None:
"""Verify the integrity of the dataset.
Raises:
RuntimeError: if dataset is missing or checksum fails
FileNotFoundError: if metadata file is not found in root
"""
# Check if the gzip files have already been downloaded and verify their checksums
pathname = os.path.join(self.root, self.zipfile_glob)
i = 0
for zipfile in glob.iglob(pathname):
filename = os.path.basename(zipfile)
if self.checksum and not check_integrity(zipfile, self.md5s[filename]):
raise RuntimeError("Dataset found, but corrupted: {}.".format(filename))
i += 1
if i != 0:
return
# check if the metadata file has been downloaded
if not os.path.exists(os.path.join(self.root, self.meta_data_filename)):
raise FileNotFoundError(
f"Meta data file {self.meta_data_filename} "
f"not found in in `root={self.root}`."
)
raise RuntimeError(
f"Dataset not found in `root={self.root}` "
"either specify a different `root` directory or make sure you "
"have manually downloaded the dataset as suggested in the documentation."
)
def plot( # type: ignore[override]
self,
sample: Dict[str, Any],
show_titles: bool = True,
suptitle: Optional[str] = None,
) -> plt.Figure:
"""Plot a sample from the dataset.
Args:
sample: a sample returned by :meth:`__getitem__`
show_titles: flag indicating whether to show titles above each panel
suptitle: optional string to use as a suptitle
Returns:
a matplotlib Figure with the rendered sample
"""
mask = sample["mask"].permute(1, 2, 0)
showing_predictions = "prediction" in sample
if showing_predictions:
pred = sample["prediction"].permute(1, 2, 0)
ncols = 2
else:
ncols = 1
fig, axs = plt.subplots(nrows=1, ncols=ncols, figsize=(ncols * 4, 4))
if showing_predictions:
axs[0].imshow(mask)
axs[0].axis("off")
axs[1].imshow(pred)
axs[1].axis("off")
if show_titles:
axs[0].set_title("Mask")
axs[1].set_title("Prediction")
else:
axs.imshow(mask)
axs.axis("off")
if show_titles:
axs.set_title("Mask")
if suptitle is not None:
plt.suptitle(suptitle)
return fig
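
To close the loop, a minimal usage sketch of the class added in this commit (the root path is an assumption; it must already contain the manually downloaded csv.gz tiles and tiles.geojson, as the class docstring above describes):

from torchgeo.datasets import OpenBuildings

# root holds the *_buildings.csv.gz tiles plus the tiles.geojson metadata file
ds = OpenBuildings(root="data", checksum=True)

# query the full extent of the indexed tiles and plot the rasterized mask
sample = ds[ds.bounds]
fig = ds.plot(sample, suptitle="Open Buildings")
fig.savefig("open_buildings.png")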