diff --git a/experiments/ssl4eo/delete_excess.py b/experiments/ssl4eo/delete_excess.py
new file mode 100755
index 000000000..ca7c1a346
--- /dev/null
+++ b/experiments/ssl4eo/delete_excess.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Delete all but the first ``--length`` scenes found under ``root``."""
+
+import argparse
+import glob
+import os
+import shutil
+
+from tqdm import tqdm
+from tqdm.contrib.concurrent import thread_map
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("root", help="directory to search for scenes")
+    parser.add_argument("--num-workers", type=int, default=10, help="number of threads")
+    parser.add_argument(
+        "--length", type=int, default=250000, help="number of scenes to keep"
+    )
+    args = parser.parse_args()
+
+    # A negative length would make the slice below count from the end of the
+    # list and delete the last |length| entries, so reject it up front.
+    if args.length < 0:
+        parser.error("--length must be non-negative")
+
+    # Sort so the kept/deleted split is deterministic; everything after the
+    # first `length` entries is excess.
+    paths = sorted(glob.glob(os.path.join(args.root, "*")))
+    excess = paths[args.length :]
+
+    if args.num_workers > 0:
+        # Delete the excess directory trees from a pool of worker threads.
+        thread_map(shutil.rmtree, excess, max_workers=args.num_workers)
+    else:
+        # Non-positive worker count: delete sequentially with a progress bar.
+        for path in tqdm(excess):
+            shutil.rmtree(path)
diff --git a/experiments/ssl4eo/landsat/README.md b/experiments/ssl4eo/landsat/README.md
index 550d5326c..ef56db35b 100644
--- a/experiments/ssl4eo/landsat/README.md
+++ b/experiments/ssl4eo/landsat/README.md
@@ -52,6 +52,12 @@ For each TOA and SR product, we want to create a parallel corpus. This can be do
 $ bash delete_mismatch.sh
 ```
 
+To chop this down to 250K locations, you can then run:
+
+```console
+$ bash delete_excess.sh
+```
+
 You may want to modify `ROOT_DIR`.
 
 ## Compression
diff --git a/experiments/ssl4eo/landsat/delete_excess.sh b/experiments/ssl4eo/landsat/delete_excess.sh
new file mode 100755
index 000000000..ad9dd4757
--- /dev/null
+++ b/experiments/ssl4eo/landsat/delete_excess.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+set -euo pipefail
+
+# User-specific parameters
+ROOT_DIR=data
+L5_L1="$ROOT_DIR/ssl4eo_l_tm_toa/imgs"
+L7_L1="$ROOT_DIR/ssl4eo_l_etm_toa/imgs"
+L7_L2="$ROOT_DIR/ssl4eo_l_etm_sr/imgs"
+L8_L1="$ROOT_DIR/ssl4eo_l_oli_tirs_toa/imgs"
+L8_L2="$ROOT_DIR/ssl4eo_l_oli_sr/imgs"
+NUM_WORKERS=10
+LENGTH=250000
+
+# Generic parameters: delete_excess.py lives one directory above this script
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
+
+# Run the deletion once per product directory; each run is timed separately
+# and, because of `set -e`, a failing run aborts the remaining ones
+for dir in "$L5_L1" "$L7_L1" "$L7_L2" "$L8_L1" "$L8_L2"; do
+    time python3 "$SCRIPT_DIR/delete_excess.py" "$dir" --num-workers "$NUM_WORKERS" --length "$LENGTH"
+done