Add script to compress dataset files (#1326)

* Add script to compress dataset files

* Fix type annotation

* Add script for L5 L1
This commit is contained in:
Adam J. Stewart 2023-05-14 11:01:31 -05:00 коммит произвёл GitHub
Родитель dddd723d8d
Коммит 777b68376f
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 220 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,68 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import glob
import os
import numpy as np
import rasterio as rio
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
def rescale_to_uint8(
    x: np.ndarray, minimum: np.ndarray, maximum: np.ndarray
) -> np.ndarray:
    """Linearly rescale an array from [minimum, maximum] to uint8.

    Values at or below ``minimum`` map to 0 and values at or above
    ``maximum`` map to 255.

    Args:
        x: Array of raw values.
        minimum: Value(s) mapped to 0; must broadcast against ``x``.
        maximum: Value(s) mapped to 255; must broadcast against ``x``.

    Returns:
        Array with the same shape as ``x`` and dtype uint8.
    """
    x = (x - minimum) / (maximum - minimum)

    # 0-1 -> 0-255
    # The upper clip bound must be 255, not 256: 256 does not fit in uint8 and
    # wraps around to 0, turning saturated pixels black.
    return np.clip(x * 2**8, 0, 2**8 - 1).astype(np.uint8)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Can be same directory for in-place compression
    parser.add_argument("src_dir", help="directory to recursively search for files")
    parser.add_argument("dst_dir", help="directory to save compressed files in")
    parser.add_argument("--suffix", default=".tif", help="file suffix")

    # Could be min/max, 2%/98%, mean ± 2 * std, etc.
    parser.add_argument(
        "--min", nargs="+", type=float, required=True, help="minimum range"
    )
    parser.add_argument(
        "--max", nargs="+", type=float, required=True, help="maximum range"
    )
    parser.add_argument("--num-workers", type=int, default=10, help="number of threads")

    args = parser.parse_args()

    # Reshape to (N, 1, 1) so that each value broadcasts over one band of the
    # array returned by src.read(); a single value applies to every band.
    args.min = np.array(args.min)[:, np.newaxis, np.newaxis]
    args.max = np.array(args.max)[:, np.newaxis, np.newaxis]

    def compress(src_path: str) -> None:
        """Rescale, convert to uint8, and compress an image.

        The output is written to the same relative location under
        ``args.dst_dir`` that ``src_path`` has under ``args.src_dir``.

        Args:
            src_path: Path to an image file.
        """
        # Map the path by its location relative to src_dir rather than by
        # substring replacement, which breaks on trailing slashes and on
        # directory names that reoccur deeper in the path.
        rel_path = os.path.relpath(src_path, args.src_dir)
        dst_path = os.path.join(args.dst_dir, rel_path)
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)

        # Read everything and close the source before writing, so that in-place
        # compression never writes to a file that is still open for reading.
        with rio.open(src_path, "r") as src:
            x = rescale_to_uint8(src.read(), args.min, args.max)
            profile = src.profile

        profile["dtype"] = "uint8"
        profile["compress"] = "lzw"
        profile["predictor"] = 2

        with rio.open(dst_path, "w", **profile) as dst:
            for i, band in enumerate(dst.indexes):
                dst.write(x[i], band)

    paths = glob.glob(
        os.path.join(args.src_dir, "**", f"*{args.suffix}"), recursive=True
    )

    if args.num_workers > 0:
        thread_map(compress, paths, max_workers=args.num_workers)
    else:
        # Sequential fallback with a plain progress bar
        for path in tqdm(paths):
            compress(path)

Просмотреть файл

@ -0,0 +1,32 @@
#!/usr/bin/env bash

# Compress the dataset in $ROOT_DIR/ssl4eo-l5-l1 into $ROOT_DIR/ssl4eo-l5-l1-v2
# by calling compress_dataset.py (located next to this script) with
# per-band rescaling ranges.

set -euo pipefail

# User-specific parameters
ROOT_DIR=data
SRC_DIR="$ROOT_DIR/ssl4eo-l5-l1"
DST_DIR="$ROOT_DIR/ssl4eo-l5-l1-v2"
NUM_WORKERS=40

# Satellite-specific parameters
# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LT05_C02_T1_TOA
R_MIN=0
R_MAX=0.4

# https://earthobservatory.nasa.gov/global-maps/MOD_LSTD_M
# NOTE(review): presumably -25 °C and +45 °C expressed in kelvin -- confirm.
T_MIN=$(echo "273.15 - 25" | bc -l)
T_MAX=$(echo "273.15 + 45" | bc -l)

# One entry per band, in file band order.
# NOTE(review): presumably the T_* slot is the thermal band -- confirm against
# the dataset's band layout.
MIN=("$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$T_MIN" "$R_MIN")
MAX=("$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$T_MAX" "$R_MAX")

# Generic parameters
# Quote the inner substitution so a script path containing spaces still works.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

time python3 "$SCRIPT_DIR/compress_dataset.py" \
    "$SRC_DIR" \
    "$DST_DIR" \
    --min "${MIN[@]}" \
    --max "${MAX[@]}" \
    --num-workers "$NUM_WORKERS"

Просмотреть файл

@ -0,0 +1,32 @@
#!/usr/bin/env bash

# Compress the dataset in $ROOT_DIR/ssl4eo-l7-l1 into $ROOT_DIR/ssl4eo-l7-l1-v2
# by calling compress_dataset.py (located next to this script) with
# per-band rescaling ranges.

set -euo pipefail

# User-specific parameters
ROOT_DIR=data
SRC_DIR="$ROOT_DIR/ssl4eo-l7-l1"
DST_DIR="$ROOT_DIR/ssl4eo-l7-l1-v2"
NUM_WORKERS=40

# Satellite-specific parameters
# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LE07_C02_T1_TOA
R_MIN=0
R_MAX=0.4

# https://earthobservatory.nasa.gov/global-maps/MOD_LSTD_M
# NOTE(review): presumably -25 °C and +45 °C expressed in kelvin -- confirm.
T_MIN=$(echo "273.15 - 25" | bc -l)
T_MAX=$(echo "273.15 + 45" | bc -l)

# One entry per band, in file band order.
# NOTE(review): presumably the two T_* slots are the thermal bands -- confirm
# against the dataset's band layout.
MIN=("$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$T_MIN" "$T_MIN" "$R_MIN" "$R_MIN")
MAX=("$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$T_MAX" "$T_MAX" "$R_MAX" "$R_MAX")

# Generic parameters
# Quote the inner substitution so a script path containing spaces still works.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

time python3 "$SCRIPT_DIR/compress_dataset.py" \
    "$SRC_DIR" \
    "$DST_DIR" \
    --min "${MIN[@]}" \
    --max "${MAX[@]}" \
    --num-workers "$NUM_WORKERS"

Просмотреть файл

@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Compress the dataset in $ROOT_DIR/ssl4eo-l7-l2 into $ROOT_DIR/ssl4eo-l7-l2-v2
# by calling compress_dataset.py (located next to this script) with a single
# rescaling range shared by all bands.

set -euo pipefail

# User-specific parameters
ROOT_DIR=data
SRC_DIR="$ROOT_DIR/ssl4eo-l7-l2"
DST_DIR="$ROOT_DIR/ssl4eo-l7-l2-v2"
NUM_WORKERS=40

# Satellite-specific parameters
# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LE07_C02_T1_L2
# NOTE(review): presumably this inverts the scaling described at the USGS link
# (value = DN * 0.0000275 - 0.2), i.e. the range 0 to 0.3 expressed as raw
# digital numbers -- confirm.
R_MIN=$(echo "(0 + 0.2) / 0.0000275" | bc -l)
R_MAX=$(echo "(0.3 + 0.2) / 0.0000275" | bc -l)

# Single-element arrays: one value is passed and applied to every band.
MIN=("$R_MIN")
MAX=("$R_MAX")

# Generic parameters
# Quote the inner substitution so a script path containing spaces still works.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

time python3 "$SCRIPT_DIR/compress_dataset.py" \
    "$SRC_DIR" \
    "$DST_DIR" \
    --min "${MIN[@]}" \
    --max "${MAX[@]}" \
    --num-workers "$NUM_WORKERS"

Просмотреть файл

@ -0,0 +1,32 @@
#!/usr/bin/env bash

# Compress the dataset in $ROOT_DIR/ssl4eo-l8-l1 into $ROOT_DIR/ssl4eo-l8-l1-v2
# by calling compress_dataset.py (located next to this script) with
# per-band rescaling ranges.

set -euo pipefail

# User-specific parameters
ROOT_DIR=data
SRC_DIR="$ROOT_DIR/ssl4eo-l8-l1"
DST_DIR="$ROOT_DIR/ssl4eo-l8-l1-v2"
NUM_WORKERS=40

# Satellite-specific parameters
# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LC08_C02_T1_TOA
R_MIN=0
R_MAX=0.4

# https://earthobservatory.nasa.gov/global-maps/MOD_LSTD_M
# NOTE(review): presumably -25 °C and +45 °C expressed in kelvin -- confirm.
T_MIN=$(echo "273.15 - 25" | bc -l)
T_MAX=$(echo "273.15 + 45" | bc -l)

# One entry per band, in file band order.
# NOTE(review): presumably the two trailing T_* slots are the thermal bands --
# confirm against the dataset's band layout.
MIN=("$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$R_MIN" "$T_MIN" "$T_MIN")
MAX=("$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$R_MAX" "$T_MAX" "$T_MAX")

# Generic parameters
# Quote the inner substitution so a script path containing spaces still works.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

time python3 "$SCRIPT_DIR/compress_dataset.py" \
    "$SRC_DIR" \
    "$DST_DIR" \
    --min "${MIN[@]}" \
    --max "${MAX[@]}" \
    --num-workers "$NUM_WORKERS"

Просмотреть файл

@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Compress the dataset in $ROOT_DIR/ssl4eo-l8-l2 into $ROOT_DIR/ssl4eo-l8-l2-v2
# by calling compress_dataset.py (located next to this script) with a single
# rescaling range shared by all bands.

set -euo pipefail

# User-specific parameters
ROOT_DIR=data
SRC_DIR="$ROOT_DIR/ssl4eo-l8-l2"
DST_DIR="$ROOT_DIR/ssl4eo-l8-l2-v2"
NUM_WORKERS=40

# Satellite-specific parameters
# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LC08_C02_T1_L2
# NOTE(review): presumably this inverts the scaling described at the USGS link
# (value = DN * 0.0000275 - 0.2), i.e. the range 0 to 0.3 expressed as raw
# digital numbers -- confirm.
R_MIN=$(echo "(0 + 0.2) / 0.0000275" | bc -l)
R_MAX=$(echo "(0.3 + 0.2) / 0.0000275" | bc -l)

# Single-element arrays: one value is passed and applied to every band.
MIN=("$R_MIN")
MAX=("$R_MAX")

# Generic parameters
# Quote the inner substitution so a script path containing spaces still works.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

time python3 "$SCRIPT_DIR/compress_dataset.py" \
    "$SRC_DIR" \
    "$DST_DIR" \
    --min "${MIN[@]}" \
    --max "${MAX[@]}" \
    --num-workers "$NUM_WORKERS"