From 5c09c280a88db286727b6a2cab7ef4005e01a5a7 Mon Sep 17 00:00:00 2001 From: Kenza Bouzid <37396332+kenza-bouzid@users.noreply.github.com> Date: Fri, 31 Mar 2023 15:10:16 +0100 Subject: [PATCH] ENH: Add diagnosis for Tiff Conversion (#862) --- .../health_cpath/preprocessing/tiff_conversion.py | 13 ++++++++++++- .../health_cpath/utils/tiff_conversion_config.py | 12 ++++++++++++ .../testhisto/preprocessing/test_tiff_conversion.py | 11 ++++++++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/hi-ml-cpath/src/health_cpath/preprocessing/tiff_conversion.py b/hi-ml-cpath/src/health_cpath/preprocessing/tiff_conversion.py index 7e924e80..a28b4057 100644 --- a/hi-ml-cpath/src/health_cpath/preprocessing/tiff_conversion.py +++ b/hi-ml-cpath/src/health_cpath/preprocessing/tiff_conversion.py @@ -55,6 +55,8 @@ class ConvertWSIToTiffd(MapTransform): replace_ampersand_by: str = UNDERSCORE, compression: COMPRESSION = COMPRESSION.ADOBE_DEFLATE, tile_size: int = 512, + min_file_size: int = 0, + verbose: bool = False, ) -> None: """ :param output_folder: The directory where the tiff file will be saved. @@ -76,6 +78,9 @@ class ConvertWSIToTiffd(MapTransform): aka ZLIB that is lossless compression. Make sure to use one of these options (RAW, LZW, JPEG, JPEG2000) so that the converted files are readable by cucim. :param tile_size: The size of the tiles that are used to write the tiff file, defaults to 512. + :param min_file_size: The minimum size of the tiff file in bytes. If the tiff file is smaller than this size, it + will get overwritten. Defaults to 0. + :param verbose: A flag to enable verbose logging, defaults to False. """ self.output_folder = output_folder self.image_key = image_key @@ -88,6 +93,8 @@ class ConvertWSIToTiffd(MapTransform): self.wsi_reader = WSIReader(backend=WSIBackend.OPENSLIDE) self.compression = compression self.tile_size = tile_size + self.min_file_size = min_file_size + self.verbose = verbose def get_tiff_path(self, src_path: Path) -> Path: """Returns the path to the tiff file that will be created from the src file. The tiff file is saved in the @@ -242,6 +249,10 @@ class ConvertWSIToTiffd(MapTransform): src_path = Path(data[self.image_key]) tiff_path = self.get_tiff_path(src_path) # if the tiff file does not exist or if it exists but is empty, we convert the wsi to tiff - if not tiff_path.exists() or (tiff_path.exists() and tiff_path.stat().st_size == 0): + if not tiff_path.exists() or (tiff_path.exists() and tiff_path.stat().st_size <= self.min_file_size): self.convert_wsi(src_path, tiff_path) + if self.verbose: + logging.info(f"Converted {src_path} to {tiff_path}") + logging.info(f"Source file size {src_path.stat().st_size / 1e6:.2f} MB") + logging.info(f"Tiff file size {tiff_path.stat().st_size / 1e6:.2f} MB") return data diff --git a/hi-ml-cpath/src/health_cpath/utils/tiff_conversion_config.py b/hi-ml-cpath/src/health_cpath/utils/tiff_conversion_config.py index cf7330ab..95c11abd 100644 --- a/hi-ml-cpath/src/health_cpath/utils/tiff_conversion_config.py +++ b/hi-ml-cpath/src/health_cpath/utils/tiff_conversion_config.py @@ -62,6 +62,16 @@ class TiffConversionConfig(param.Parameterized): doc="The name of the new dataset csv file that will be created for the converted data. If None, the default " "name of the original dataset will be used.", ) + min_file_size: int = param.Integer( + default=0, + doc="The minimum size of the tiff file in bytes. If the tiff file is smaller than this size, it will get " + "overwritten. Defaults to 0.", + ) + verbose: bool = param.Boolean( + default=False, + doc="If True, the progress of the conversion will be logged including src and tiff file sizes. " + "Defaults to False.", + ) def get_transform(self, output_folder: Path) -> ConvertWSIToTiffd: """Get the transform that will be used to convert the src files to tiff files.""" @@ -74,6 +84,8 @@ class TiffConversionConfig(param.Parameterized): replace_ampersand_by=self.replace_ampersand_by, compression=self.compression, tile_size=self.tile_size, + min_file_size=self.min_file_size, + verbose=self.verbose, ) def create_dataset_csv_for_converted_data(self, output_folder: Path) -> None: diff --git a/hi-ml-cpath/testhisto/testhisto/preprocessing/test_tiff_conversion.py b/hi-ml-cpath/testhisto/testhisto/preprocessing/test_tiff_conversion.py index 2882d9cd..97bc5882 100644 --- a/hi-ml-cpath/testhisto/testhisto/preprocessing/test_tiff_conversion.py +++ b/hi-ml-cpath/testhisto/testhisto/preprocessing/test_tiff_conversion.py @@ -7,6 +7,7 @@ import numpy as np import pytest from pathlib import Path +from pytest import LogCaptureFixture from monai.data.wsi_reader import WSIReader from health_cpath.datasets.panda_dataset import PandaDataset from health_cpath.preprocessing.loading import WSIBackend @@ -213,13 +214,17 @@ def test_convert_wsi_to_tiff(add_low_mag: bool, wsi_samples: WSISamplesType, tmp @pytest.mark.gpu @skipif_no_gpu() # cucim is not available on cpu -def test_convert_wsi_to_tiff_existing_empty_file(wsi_samples: WSISamplesType, tmp_path: Path) -> None: +def test_convert_wsi_to_tiff_existing_empty_file( + wsi_samples: WSISamplesType, tmp_path: Path, caplog: LogCaptureFixture +) -> None: target_mag = 2.5 transform = ConvertWSIToTiffd( output_folder=tmp_path, target_magnifications=[target_mag], default_base_objective_power=target_mag, tile_size=16, + min_file_size=0, + verbose=True, ) tiff_path = transform.get_tiff_path(wsi_samples[0][SlideKey.IMAGE]) # Create an empty file @@ -230,6 +235,10 @@ def test_convert_wsi_to_tiff_existing_empty_file(wsi_samples: WSISamplesType, tm for sample in wsi_samples: transform(sample) assert tiff_path.stat().st_size > 0 + messages = caplog.messages + assert "Converted" in messages[0] + assert "Source file size 0.02 MB" in messages[1] + assert "Tiff file size 0.01 MB" in messages[2] def test_tiff_conversion_config(mock_panda_slides_root_dir: Path, tmp_path: Path) -> None: