From 777b68376f56638e77d74ca73af733f4559c5cec Mon Sep 17 00:00:00 2001
From: "Adam J. Stewart"
Date: Sun, 14 May 2023 11:01:31 -0500
Subject: [PATCH] Add script to compress dataset files (#1326)

* Add script to compress dataset files

* Fix type annotation

* Add script for L5 L1
---
 experiments/ssl4eo/compress_dataset.py | 68 ++++++++++++++++++++++++++
 experiments/ssl4eo/compress_l5_l1.sh   | 32 ++++++++++++
 experiments/ssl4eo/compress_l7_l1.sh   | 32 ++++++++++++
 experiments/ssl4eo/compress_l7_l2.sh   | 28 +++++++++++
 experiments/ssl4eo/compress_l8_l1.sh   | 32 ++++++++++++
 experiments/ssl4eo/compress_l8_l2.sh   | 28 +++++++++++
 6 files changed, 220 insertions(+)
 create mode 100755 experiments/ssl4eo/compress_dataset.py
 create mode 100755 experiments/ssl4eo/compress_l5_l1.sh
 create mode 100755 experiments/ssl4eo/compress_l7_l1.sh
 create mode 100755 experiments/ssl4eo/compress_l7_l2.sh
 create mode 100755 experiments/ssl4eo/compress_l8_l1.sh
 create mode 100755 experiments/ssl4eo/compress_l8_l2.sh

diff --git a/experiments/ssl4eo/compress_dataset.py b/experiments/ssl4eo/compress_dataset.py
new file mode 100755
index 000000000..2cc20d74d
--- /dev/null
+++ b/experiments/ssl4eo/compress_dataset.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import argparse
+import glob
+import os
+
+import numpy as np
+import rasterio as rio
+from tqdm import tqdm
+from tqdm.contrib.concurrent import thread_map
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # src_dir and dst_dir can be the same directory for in-place compression
+    parser.add_argument("src_dir", help="directory to recursively search for files")
+    parser.add_argument("dst_dir", help="directory to save compressed files in")
+    parser.add_argument("--suffix", default=".tif", help="file suffix")
+    # One value per band (or one for all): min/max, 2%/98%, mean ± 2 * std, etc.
+    parser.add_argument(
+        "--min", nargs="+", type=float, required=True, help="minimum range"
+    )
+    parser.add_argument(
+        "--max", nargs="+", type=float, required=True, help="maximum range"
+    )
+    parser.add_argument("--num-workers", type=int, default=10, help="number of threads")
+    args = parser.parse_args()
+
+    args.min = np.array(args.min)[:, np.newaxis, np.newaxis]
+    args.max = np.array(args.max)[:, np.newaxis, np.newaxis]
+
+    def compress(src_path: str) -> None:
+        """Rescale to [--min, --max], convert to uint8, and LZW-compress an image.
+
+        Args:
+            src_path: Path to an image file found under ``args.src_dir``.
+        """
+        # Mirror src_dir's layout under dst_dir; str.replace would hit every match
+        dst_path = os.path.join(args.dst_dir, os.path.relpath(src_path, args.src_dir))
+        dst_dir = os.path.dirname(dst_path)
+        os.makedirs(dst_dir, exist_ok=True)
+        with rio.open(src_path, "r") as src:
+            x = src.read()
+
+            x = (x - args.min) / (args.max - args.min)
+
+            # 0-1 -> 0-255 (a value of 256 does not fit in uint8 and would wrap)
+            x = np.clip(x * 255, 0, 255).astype(np.uint8)
+
+            profile = src.profile
+            profile["dtype"] = "uint8"
+            profile["compress"] = "lzw"
+            profile["predictor"] = 2
+            with rio.open(dst_path, "w", **profile) as dst:
+                for i, band in enumerate(dst.indexes):
+                    dst.write(x[i], band)
+
+    paths = glob.glob(
+        os.path.join(args.src_dir, "**", f"*{args.suffix}"), recursive=True
+    )
+
+    if args.num_workers > 0:
+        thread_map(compress, paths, max_workers=args.num_workers)
+    else:
+        for path in tqdm(paths):
+            compress(path)
diff --git a/experiments/ssl4eo/compress_l5_l1.sh b/experiments/ssl4eo/compress_l5_l1.sh
new file mode 100755
index 000000000..926eed7f2
--- /dev/null
+++ b/experiments/ssl4eo/compress_l5_l1.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# User-specific parameters
+ROOT_DIR=data
+SRC_DIR="$ROOT_DIR/ssl4eo-l5-l1"
+DST_DIR="$ROOT_DIR/ssl4eo-l5-l1-v2"
+NUM_WORKERS=40
+
+# Satellite-specific parameters
+# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
+# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LT05_C02_T1_TOA
+R_MIN=0
+R_MAX=0.4
+
+# https://earthobservatory.nasa.gov/global-maps/MOD_LSTD_M
+T_MIN=$(echo "273.15 - 25" | bc -l)
+T_MAX=$(echo "273.15 + 45" | bc -l)
+
+MIN=($R_MIN $R_MIN $R_MIN $R_MIN $R_MIN $T_MIN $R_MIN)
+MAX=($R_MAX $R_MAX $R_MAX $R_MAX $R_MAX $T_MAX $R_MAX)
+
+# Generic parameters
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+time python3 "$SCRIPT_DIR/compress_dataset.py" \
+    "$SRC_DIR" \
+    "$DST_DIR" \
+    --min "${MIN[@]}" \
+    --max "${MAX[@]}" \
+    --num-workers "$NUM_WORKERS"
diff --git a/experiments/ssl4eo/compress_l7_l1.sh b/experiments/ssl4eo/compress_l7_l1.sh
new file mode 100755
index 000000000..1171f1dde
--- /dev/null
+++ b/experiments/ssl4eo/compress_l7_l1.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# User-specific parameters
+ROOT_DIR=data
+SRC_DIR="$ROOT_DIR/ssl4eo-l7-l1"
+DST_DIR="$ROOT_DIR/ssl4eo-l7-l1-v2"
+NUM_WORKERS=40
+
+# Satellite-specific parameters
+# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
+# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LE07_C02_T1_TOA
+R_MIN=0
+R_MAX=0.4
+
+# https://earthobservatory.nasa.gov/global-maps/MOD_LSTD_M
+T_MIN=$(echo "273.15 - 25" | bc -l)
+T_MAX=$(echo "273.15 + 45" | bc -l)
+
+MIN=($R_MIN $R_MIN $R_MIN $R_MIN $R_MIN $T_MIN $T_MIN $R_MIN $R_MIN)
+MAX=($R_MAX $R_MAX $R_MAX $R_MAX $R_MAX $T_MAX $T_MAX $R_MAX $R_MAX)
+
+# Generic parameters
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+time python3 "$SCRIPT_DIR/compress_dataset.py" \
+    "$SRC_DIR" \
+    "$DST_DIR" \
+    --min "${MIN[@]}" \
+    --max "${MAX[@]}" \
+    --num-workers "$NUM_WORKERS"
diff --git a/experiments/ssl4eo/compress_l7_l2.sh b/experiments/ssl4eo/compress_l7_l2.sh
new file mode 100755
index 000000000..7d816d0e4
--- /dev/null
+++ b/experiments/ssl4eo/compress_l7_l2.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# User-specific parameters
+ROOT_DIR=data
+SRC_DIR="$ROOT_DIR/ssl4eo-l7-l2"
+DST_DIR="$ROOT_DIR/ssl4eo-l7-l2-v2"
+NUM_WORKERS=40
+
+# Satellite-specific parameters
+# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
+# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LE07_C02_T1_L2
+R_MIN=$(echo "(0 + 0.2) / 0.0000275" | bc -l)
+R_MAX=$(echo "(0.3 + 0.2) / 0.0000275" | bc -l)
+
+MIN=$R_MIN
+MAX=$R_MAX
+
+# Generic parameters
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+time python3 "$SCRIPT_DIR/compress_dataset.py" \
+    "$SRC_DIR" \
+    "$DST_DIR" \
+    --min "${MIN[@]}" \
+    --max "${MAX[@]}" \
+    --num-workers "$NUM_WORKERS"
diff --git a/experiments/ssl4eo/compress_l8_l1.sh b/experiments/ssl4eo/compress_l8_l1.sh
new file mode 100755
index 000000000..7f1f0c258
--- /dev/null
+++ b/experiments/ssl4eo/compress_l8_l1.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# User-specific parameters
+ROOT_DIR=data
+SRC_DIR="$ROOT_DIR/ssl4eo-l8-l1"
+DST_DIR="$ROOT_DIR/ssl4eo-l8-l1-v2"
+NUM_WORKERS=40
+
+# Satellite-specific parameters
+# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
+# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LC08_C02_T1_TOA
+R_MIN=0
+R_MAX=0.4
+
+# https://earthobservatory.nasa.gov/global-maps/MOD_LSTD_M
+T_MIN=$(echo "273.15 - 25" | bc -l)
+T_MAX=$(echo "273.15 + 45" | bc -l)
+
+MIN=($R_MIN $R_MIN $R_MIN $R_MIN $R_MIN $R_MIN $R_MIN $R_MIN $R_MIN $T_MIN $T_MIN)
+MAX=($R_MAX $R_MAX $R_MAX $R_MAX $R_MAX $R_MAX $R_MAX $R_MAX $R_MAX $T_MAX $T_MAX)
+
+# Generic parameters
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+time python3 "$SCRIPT_DIR/compress_dataset.py" \
+    "$SRC_DIR" \
+    "$DST_DIR" \
+    --min "${MIN[@]}" \
+    --max "${MAX[@]}" \
+    --num-workers "$NUM_WORKERS"
diff --git a/experiments/ssl4eo/compress_l8_l2.sh b/experiments/ssl4eo/compress_l8_l2.sh
new file mode 100755
index 000000000..01efa6731
--- /dev/null
+++ b/experiments/ssl4eo/compress_l8_l2.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# User-specific parameters
+ROOT_DIR=data
+SRC_DIR="$ROOT_DIR/ssl4eo-l8-l2"
+DST_DIR="$ROOT_DIR/ssl4eo-l8-l2-v2"
+NUM_WORKERS=40
+
+# Satellite-specific parameters
+# https://www.usgs.gov/faqs/how-do-i-use-scale-factor-landsat-level-2-science-products
+# https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LC08_C02_T1_L2
+R_MIN=$(echo "(0 + 0.2) / 0.0000275" | bc -l)
+R_MAX=$(echo "(0.3 + 0.2) / 0.0000275" | bc -l)
+
+MIN=$R_MIN
+MAX=$R_MAX
+
+# Generic parameters
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+time python3 "$SCRIPT_DIR/compress_dataset.py" \
+    "$SRC_DIR" \
+    "$DST_DIR" \
+    --min "${MIN[@]}" \
+    --max "${MAX[@]}" \
+    --num-workers "$NUM_WORKERS"