Enable tiling non-PANDA WSI datasets (#621)

* Add basic dataset and environment changes * Add loading/preproc utils * Back-up PANDA tiling scripts * Refactor and generalise tiling scripts * Remove Azure scripts * Add test WSI file * Add preprocessing tests * Update changelog * Add Linux condition for cuCIM in environment.yml * Use PANDA instead of TCGA-PRAD in test * Leave TcgaPradDataset as an example * Fix skipped InnerEye dataset tests * Create and test mock slides dataset * Remove Tests/ML/datasets from pytest discovery
2021-12-16 16:11:55 +00:00 · 2021-12-16 16:11:55 +00:00 · 6a4d334a99
--- a/.gitattributes
+++ b/.gitattributes
@ -15,3 +15,4 @@
 *.dcm filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -38,6 +38,7 @@ jobs that run in AzureML.
 - ([#614](https://github.com/microsoft/InnerEye-DeepLearning/pull/614)) Checkpoint downloading falls back to looking into AzureML if no checkpoints on disk
 - ([#613](https://github.com/microsoft/InnerEye-DeepLearning/pull/613)) Add additional tests for histopathology datasets
 - ([#616](https://github.com/microsoft/InnerEye-DeepLearning/pull/616)) Add more histopathology configs and tests
+- ([#621](https://github.com/microsoft/InnerEye-DeepLearning/pull/621)) Add WSI preprocessing functions and enable tiling more generic slide datasets

 ### Changed
 - ([#588](https://github.com/microsoft/InnerEye-DeepLearning/pull/588)) Replace SciPy with PIL.PngImagePlugin.PngImageFile to load png files.
--- a/InnerEye/ML/Histopathology/datasets/base_dataset.py
+++ b/InnerEye/ML/Histopathology/datasets/base_dataset.py
@ -4,7 +4,7 @@
 #  ------------------------------------------------------------------------------------------

 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union

 import numpy as np
 import pandas as pd
@ -12,6 +12,8 @@ import torch
 from sklearn.utils.class_weight import compute_class_weight
 from torch.utils.data import Dataset

+from InnerEye.ML.Histopathology.utils.naming import SlideKey
+

 class TilesDataset(Dataset):
    """Base class for datasets of WSI tiles, iterating dictionaries of image paths and metadata.
@ -71,7 +73,7 @@ class TilesDataset(Dataset):
            self.dataset_csv = dataset_csv or self.root_dir / self.DEFAULT_CSV_FILENAME
            dataset_df = pd.read_csv(self.dataset_csv)

-        columns = [self.SLIDE_ID_COLUMN, self.IMAGE_COLUMN, self.LABEL_COLUMN, self.LABEL_COLUMN,
+        columns = [self.SLIDE_ID_COLUMN, self.IMAGE_COLUMN, self.LABEL_COLUMN,
                   self.SPLIT_COLUMN, self.TILE_X_COLUMN, self.TILE_Y_COLUMN]
        for column in columns:
            if column is not None and column not in dataset_df.columns:
@ -110,3 +112,109 @@ class TilesDataset(Dataset):
        classes = np.unique(slide_labels)
        class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=slide_labels)
        return torch.as_tensor(class_weights)
+
+
+class SlidesDataset(Dataset):
+    """Base class for datasets of WSIs, iterating dictionaries of image paths and metadata.
+
+    The output dictionaries are indexed by `..utils.naming.SlideKey`.
+
+    :param SLIDE_ID_COLUMN: CSV column name for slide ID.
+    :param IMAGE_COLUMN: CSV column name for relative path to image file.
+    :param LABEL_COLUMN: CSV column name for tile label.
+    :param SPLIT_COLUMN: CSV column name for train/test split (optional).
+    :param TRAIN_SPLIT_LABEL: Value used to indicate the training split in `SPLIT_COLUMN`.
+    :param TEST_SPLIT_LABEL: Value used to indicate the test split in `SPLIT_COLUMN`.
+    :param DEFAULT_CSV_FILENAME: Default name of the dataset CSV at the dataset rood directory.
+    :param N_CLASSES: Number of classes indexed in `LABEL_COLUMN`.
+    """
+    SLIDE_ID_COLUMN: str = 'slide_id'
+    IMAGE_COLUMN: str = 'image'
+    LABEL_COLUMN: str = 'label'
+    MASK_COLUMN: Optional[str] = None
+    SPLIT_COLUMN: Optional[str] = None
+
+    TRAIN_SPLIT_LABEL: str = 'train'
+    TEST_SPLIT_LABEL: str = 'test'
+
+    METADATA_COLUMNS: Tuple[str, ...] = ()
+
+    DEFAULT_CSV_FILENAME: str = "dataset.csv"
+
+    N_CLASSES: int = 1  # binary classification by default
+
+    def __init__(self,
+                 root: Union[str, Path],
+                 dataset_csv: Optional[Union[str, Path]] = None,
+                 dataset_df: Optional[pd.DataFrame] = None,
+                 train: Optional[bool] = None,
+                 validate_columns: bool = True) -> None:
+        """
+        :param root: Root directory of the dataset.
+        :param dataset_csv: Full path to a dataset CSV file, containing at least
+        `TILE_ID_COLUMN`, `SLIDE_ID_COLUMN`, and `IMAGE_COLUMN`. If omitted, the CSV will be read
+        from `"{root}/{DEFAULT_CSV_FILENAME}"`.
+        :param dataset_df: A potentially pre-processed dataframe in the same format as would be read
+        from the dataset CSV file, e.g. after some filtering. If given, overrides `dataset_csv`.
+        :param train: If `True`, loads only the training split (resp. `False` for test split). By
+        default (`None`), loads the entire dataset as-is.
+        :param validate_columns: Whether to call `validate_columns()` at the end of `__init__()`.
+        """
+        if self.SPLIT_COLUMN is None and train is not None:
+            raise ValueError("Train/test split was specified but dataset has no split column")
+
+        self.root_dir = Path(root)
+
+        if dataset_df is not None:
+            self.dataset_csv = None
+        else:
+            self.dataset_csv = dataset_csv or self.root_dir / self.DEFAULT_CSV_FILENAME
+            dataset_df = pd.read_csv(self.dataset_csv)
+
+        dataset_df = dataset_df.set_index(self.SLIDE_ID_COLUMN)
+        if train is None:
+            self.dataset_df = dataset_df
+        else:
+            split = self.TRAIN_SPLIT_LABEL if train else self.TEST_SPLIT_LABEL
+            self.dataset_df = dataset_df[dataset_df[self.SPLIT_COLUMN] == split]
+
+        if validate_columns:
+            self.validate_columns()
+
+    def validate_columns(self) -> None:
+        """Check that loaded dataframe contains expected columns, raises `ValueError` otherwise.
+
+        If the constructor is overloaded in a subclass, you can pass `validate_columns=False` and
+        call `validate_columns()` after creating derived columns, for example.
+        """
+        columns = [self.IMAGE_COLUMN, self.LABEL_COLUMN, self.MASK_COLUMN,
+                   self.SPLIT_COLUMN] + list(self.METADATA_COLUMNS)
+        for column in columns:
+            if column is not None and column not in self.dataset_df.columns:
+                raise ValueError(f"Expected column '{column}' not found in the dataframe")
+
+    def __len__(self) -> int:
+        return self.dataset_df.shape[0]
+
+    def __getitem__(self, index: int) -> Dict[SlideKey, Any]:
+        slide_id = self.dataset_df.index[index]
+        slide_row = self.dataset_df.loc[slide_id]
+        sample = {SlideKey.SLIDE_ID: slide_id}
+
+        rel_image_path = slide_row[self.IMAGE_COLUMN]
+        sample[SlideKey.IMAGE] = str(self.root_dir / rel_image_path)
+        # we're replicating this column because we want to propagate the path to the batch
+        sample[SlideKey.IMAGE_PATH] = sample[SlideKey.IMAGE]
+
+        if self.MASK_COLUMN:
+            rel_mask_path = slide_row[self.MASK_COLUMN]
+            sample[SlideKey.MASK] = str(self.root_dir / rel_mask_path)
+            sample[SlideKey.MASK_PATH] = sample[SlideKey.MASK]
+
+        sample[SlideKey.LABEL] = slide_row[self.LABEL_COLUMN]
+        sample[SlideKey.METADATA] = {col: slide_row[col] for col in self.METADATA_COLUMNS}
+        return sample
+
+    @classmethod
+    def has_mask(cls) -> bool:
+        return cls.MASK_COLUMN is not None
--- a/InnerEye/ML/Histopathology/datasets/default_paths.py
+++ b/InnerEye/ML/Histopathology/datasets/default_paths.py
@ -3,11 +3,13 @@
 #  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 #  ------------------------------------------------------------------------------------------

+PANDA_DATASET_ID = "PANDA"
 PANDA_TILES_DATASET_ID = "PANDA_tiles"
 TCGA_CRCK_DATASET_ID = "TCGA-CRCk"
 TCGA_PRAD_DATASET_ID = "TCGA-PRAD"

 DEFAULT_DATASET_LOCATION = "/tmp/datasets/"
+PANDA_DATASET_DIR = DEFAULT_DATASET_LOCATION + PANDA_DATASET_ID
 PANDA_TILES_DATASET_DIR = DEFAULT_DATASET_LOCATION + PANDA_TILES_DATASET_ID
 TCGA_CRCK_DATASET_DIR = DEFAULT_DATASET_LOCATION + TCGA_CRCK_DATASET_ID
 TCGA_PRAD_DATASET_DIR = DEFAULT_DATASET_LOCATION + TCGA_PRAD_DATASET_ID
--- a/InnerEye/ML/Histopathology/datasets/panda_dataset.py
+++ b/InnerEye/ML/Histopathology/datasets/panda_dataset.py
@ -7,50 +7,42 @@ from pathlib import Path
 from typing import Any, Dict, Union, Optional

 import pandas as pd
+from cucim import CuImage
+from health_ml.utils import box_utils
 from monai.config import KeysCollection
 from monai.data.image_reader import ImageReader, WSIReader
 from monai.transforms import MapTransform
-from openslide import OpenSlide
-from torch.utils.data import Dataset

-from health_ml.utils import box_utils
+from InnerEye.ML.Histopathology.datasets.base_dataset import SlidesDataset


-class PandaDataset(Dataset):
+class PandaDataset(SlidesDataset):
    """Dataset class for loading files from the PANDA challenge dataset.

-    Iterating over this dataset returns a dictionary containing the `'image_id'`, paths to the `'image'`
-    and `'mask'` files, and the remaining meta-data from the original dataset (`'data_provider'`,
-    `'isup_grade'`, and `'gleason_score'`).
+    Iterating over this dataset returns a dictionary following the `SlideKey` schema plus meta-data
+    from the original dataset (`'data_provider'`, `'isup_grade'`, and `'gleason_score'`).

    Ref.: https://www.kaggle.com/c/prostate-cancer-grade-assessment/overview
    """
-    def __init__(self, root_dir: Union[str, Path], n_slides: Optional[int] = None,
-                 frac_slides: Optional[float] = None) -> None:
-        super().__init__()
-        self.root_dir = Path(root_dir)
-        self.train_df = pd.read_csv(self.root_dir / "train.csv", index_col='image_id')
-        if n_slides or frac_slides:
-            self.train_df = self.train_df.sample(n=n_slides, frac=frac_slides, replace=False,
-                                                          random_state=1234)
+    SLIDE_ID_COLUMN = 'image_id'
+    IMAGE_COLUMN = 'image'
+    MASK_COLUMN = 'mask'
+    LABEL_COLUMN = 'isup_grade'

-    def __len__(self) -> int:
-        return self.train_df.shape[0]
+    METADATA_COLUMNS = ('data_provider', 'isup_grade', 'gleason_score')

-    def _get_image_path(self, image_id: str) -> Path:
-        return self.root_dir / "train_images" / f"{image_id}.tiff"
+    DEFAULT_CSV_FILENAME = "train.csv"

-    def _get_mask_path(self, image_id: str) -> Path:
-        return self.root_dir / "train_label_masks" / f"{image_id}_mask.tiff"
-
-    def __getitem__(self, index: int) -> Dict:
-        image_id = self.train_df.index[index]
-        return {
-            'image_id': image_id,
-            'image': str(self._get_image_path(image_id).absolute()),
-            'mask': str(self._get_mask_path(image_id).absolute()),
-            **self.train_df.loc[image_id].to_dict()
-        }
+    def __init__(self,
+                 root: Union[str, Path],
+                 dataset_csv: Optional[Union[str, Path]] = None,
+                 dataset_df: Optional[pd.DataFrame] = None) -> None:
+        super().__init__(root, dataset_csv, dataset_df, validate_columns=False)
+        # PANDA CSV does not come with paths for image and mask files
+        slide_ids = self.dataset_df.index
+        self.dataset_df[self.IMAGE_COLUMN] = "train_images/" + slide_ids + ".tiff"
+        self.dataset_df[self.MASK_COLUMN] = "train_label_masks/" + slide_ids + "_mask.tiff"
+        self.validate_columns()


 # MONAI's convention is that dictionary transforms have a 'd' suffix in the class name
@ -96,10 +88,10 @@ class LoadPandaROId(MapTransform):
        self.margin = margin
        self.kwargs = kwargs

-    def _get_bounding_box(self, mask_obj: OpenSlide) -> box_utils.Box:
+    def _get_bounding_box(self, mask_obj: CuImage) -> box_utils.Box:
        # Estimate bounding box at the lowest resolution (i.e. highest level)
-        highest_level = mask_obj.level_count - 1
-        scale = mask_obj.level_downsamples[highest_level]
+        highest_level = mask_obj.resolutions['level_count'] - 1
+        scale = mask_obj.resolutions['level_downsamples'][highest_level]
        mask, _ = self.reader.get_data(mask_obj, level=highest_level)  # loaded as RGB PIL image

        foreground_mask = mask[0] > 0  # PANDA segmentation mask is in 'R' channel
@ -107,14 +99,14 @@ class LoadPandaROId(MapTransform):
        return bbox

    def __call__(self, data: Dict) -> Dict:
-        mask_obj: OpenSlide = self.reader.read(data[self.mask_key])
-        image_obj: OpenSlide = self.reader.read(data[self.image_key])
+        mask_obj: CuImage = self.reader.read(data[self.mask_key])
+        image_obj: CuImage = self.reader.read(data[self.image_key])

        level0_bbox = self._get_bounding_box(mask_obj)

-        # OpenSlide takes absolute location coordinates in the level 0 reference frame,
+        # cuCIM/OpenSlide take absolute location coordinates in the level 0 reference frame,
        # but relative region size in pixels at the chosen level
-        scale = mask_obj.level_downsamples[self.level]
+        scale = mask_obj.resolutions['level_downsamples'][self.level]
        scaled_bbox = level0_bbox / scale
        get_data_kwargs = dict(location=(level0_bbox.x, level0_bbox.y),
                               size=(scaled_bbox.w, scaled_bbox.h),
--- a/InnerEye/ML/Histopathology/datasets/tcga_prad_dataset.py
+++ b/InnerEye/ML/Histopathology/datasets/tcga_prad_dataset.py
@ -4,13 +4,14 @@
 #  ------------------------------------------------------------------------------------------

 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Optional, Union

 import pandas as pd
-from torch.utils.data import Dataset
+
+from InnerEye.ML.Histopathology.datasets.base_dataset import SlidesDataset


-class TcgaPradDataset(Dataset):
+class TcgaPradDataset(SlidesDataset):
    """Dataset class for loading TCGA-PRAD slides.

    Iterating over this dataset returns a dictionary containing:
@ -19,16 +20,14 @@ class TcgaPradDataset(Dataset):
    - `'image_path'` (str): absolute slide image path
    - `'label'` (int, 0 or 1): label for predicting positive or negative
    """
-    SLIDE_ID_COLUMN: str = 'slide_id'
-    CASE_ID_COLUMN: str = 'case_id'
    IMAGE_COLUMN: str = 'image_path'
    LABEL_COLUMN: str = 'label'

    DEFAULT_CSV_FILENAME: str = "dataset.csv"

-    def __init__(self, root_dir: Union[str, Path],
+    def __init__(self, root: Union[str, Path],
                 dataset_csv: Optional[Union[str, Path]] = None,
-                 dataset_df: Optional[pd.DataFrame] = None,) -> None:
+                 dataset_df: Optional[pd.DataFrame] = None) -> None:
        """
        :param root: Root directory of the dataset.
        :param dataset_csv: Full path to a dataset CSV file. If omitted, the CSV will be read from
@ -36,27 +35,8 @@ class TcgaPradDataset(Dataset):
        :param dataset_df: A potentially pre-processed dataframe in the same format as would be read
        from the dataset CSV file, e.g. after some filtering. If given, overrides `dataset_csv`.
        """
-        self.root_dir = Path(root_dir)
-
-        if dataset_df is not None:
-            self.dataset_csv = None
-        else:
-            self.dataset_csv = dataset_csv or self.root_dir / self.DEFAULT_CSV_FILENAME
-            dataset_df = pd.read_csv(self.dataset_csv)
-
-        dataset_df = dataset_df.set_index(self.SLIDE_ID_COLUMN)
-        dataset_df[self.LABEL_COLUMN] = (dataset_df['label1_mutation']
-                                         | dataset_df['label2_mutation']).astype(int)
-        self.dataset_df = dataset_df
-
-    def __len__(self) -> int:
-        return self.dataset_df.shape[0]
-
-    def __getitem__(self, index: int) -> Dict[str, Any]:
-        slide_id = self.dataset_df.index[index]
-        sample = {
-            self.SLIDE_ID_COLUMN: slide_id,
-            **self.dataset_df.loc[slide_id].to_dict()
-        }
-        sample[self.IMAGE_COLUMN] = str(self.root_dir / sample.pop(self.IMAGE_COLUMN))
-        return sample
+        super().__init__(root, dataset_csv, dataset_df, validate_columns=False)
+        # Example of how to define a custom label column from existing columns:
+        self.dataset_df[self.LABEL_COLUMN] = (self.dataset_df['label1']
+                                              | self.dataset_df['label2']).astype(int)
+        self.validate_columns()
--- a/InnerEye/ML/Histopathology/preprocessing/create_panda_tiles_dataset.py
+++ b/InnerEye/ML/Histopathology/preprocessing/create_panda_tiles_dataset.py
@ -0,0 +1,230 @@
+#  ------------------------------------------------------------------------------------------
+#  Copyright (c) Microsoft Corporation. All rights reserved.
+#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
+#  ------------------------------------------------------------------------------------------
+
+"""This script is specific to PANDA and is kept only for retrocompatibility.
+`create_tiles_dataset.py` is the new supported way to process slide datasets.
+"""
+import functools
+import os
+import logging
+import shutil
+import traceback
+import warnings
+from pathlib import Path
+from typing import Sequence, Tuple, Union
+
+import numpy as np
+import PIL
+from monai.data import Dataset
+from monai.data.image_reader import WSIReader
+from tqdm import tqdm
+
+from InnerEye.ML.Histopathology.preprocessing import tiling
+from InnerEye.ML.Histopathology.datasets.panda_dataset import PandaDataset, LoadPandaROId
+
+
+CSV_COLUMNS = ['slide_id', 'tile_id', 'image', 'mask', 'tile_x', 'tile_y', 'occupancy',
+               'data_provider', 'slide_isup_grade', 'slide_gleason_score']
+TMP_SUFFIX = "_tmp"
+
+logging.basicConfig(format='%(asctime)s %(message)s', filemode='w')
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+
+def select_tile(mask_tile: np.ndarray, occupancy_threshold: float) \
+        -> Union[Tuple[bool, float], Tuple[np.ndarray, np.ndarray]]:
+    if occupancy_threshold < 0. or occupancy_threshold > 1.:
+        raise ValueError("Tile occupancy threshold must be between 0 and 1")
+    foreground_mask = mask_tile > 0
+    occupancy = foreground_mask.mean(axis=(-2, -1))
+    return (occupancy > occupancy_threshold).squeeze(), occupancy.squeeze()
+
+
+def get_tile_descriptor(tile_location: Sequence[int]) -> str:
+    return f"{tile_location[0]:05d}x_{tile_location[1]:05d}y"
+
+
+def get_tile_id(slide_id: str, tile_location: Sequence[int]) -> str:
+    return f"{slide_id}.{get_tile_descriptor(tile_location)}"
+
+
+def save_image(array_chw: np.ndarray, path: Path) -> PIL.Image:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    array_hwc = np.moveaxis(array_chw, 0, -1).astype(np.uint8).squeeze()
+    pil_image = PIL.Image.fromarray(array_hwc)
+    pil_image.convert('RGB').save(path)
+    return pil_image
+
+
+def generate_tiles(sample: dict, tile_size: int, occupancy_threshold: float) \
+        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int]:
+    image_tiles, tile_locations = tiling.tile_array_2d(sample['image'], tile_size=tile_size,
+                                                       constant_values=255)
+    mask_tiles, _ = tiling.tile_array_2d(sample['mask'], tile_size=tile_size, constant_values=0)
+
+    selected: np.ndarray
+    occupancies: np.ndarray
+    selected, occupancies = select_tile(mask_tiles, occupancy_threshold)
+    n_discarded = (~selected).sum()
+    logging.info(f"Percentage tiles discarded: {round(selected.sum() / n_discarded * 100, 2)}")
+
+    image_tiles = image_tiles[selected]
+    mask_tiles = mask_tiles[selected]
+    tile_locations = tile_locations[selected]
+    occupancies = occupancies[selected]
+
+    abs_tile_locations = (sample['scale'] * tile_locations + sample['location']).astype(int)
+
+    return image_tiles, mask_tiles, abs_tile_locations, occupancies, n_discarded
+
+
+# TODO refactor this to separate metadata identification from saving. We might want the metadata
+# even if the saving fails
+def save_tile(sample: dict, image_tile: np.ndarray, mask_tile: np.ndarray,
+              tile_location: Sequence[int], output_dir: Path) -> dict:
+    slide_id = sample['image_id']
+    descriptor = get_tile_descriptor(tile_location)
+    image_tile_filename = f"train_images/{descriptor}.png"
+    mask_tile_filename = f"train_label_masks/{descriptor}_mask.png"
+
+    save_image(image_tile, output_dir / image_tile_filename)
+    save_image(mask_tile, output_dir / mask_tile_filename)
+
+    tile_metadata = {
+        'slide_id': slide_id,
+        'tile_id': get_tile_id(slide_id, tile_location),
+        'image': image_tile_filename,
+        'mask': mask_tile_filename,
+        'tile_x': tile_location[0],
+        'tile_y': tile_location[1],
+        'data_provider': sample['data_provider'],
+        'slide_isup_grade': sample['isup_grade'],
+        'slide_gleason_score': sample['gleason_score'],
+    }
+
+    return tile_metadata
+
+
+def process_slide(sample: dict, level: int, margin: int, tile_size: int, occupancy_threshold: int,
+                  output_dir: Path, tile_progress: bool = False) -> None:
+    slide_id = sample['image_id']
+    slide_dir: Path = output_dir / (slide_id + "/")
+    logging.info(f">>> Slide dir {slide_dir}")
+    if slide_dir.exists():  # already processed slide - skip
+        logging.info(f">>> Skipping {slide_dir} - already processed")
+        return
+    else:
+        try:
+            slide_dir.mkdir(parents=True)
+
+            dataset_csv_path = slide_dir / "dataset.csv"
+            dataset_csv_file = dataset_csv_path.open('w')
+            dataset_csv_file.write(','.join(CSV_COLUMNS) + '\n')  # write CSV header
+
+            tiles_failure = 0
+            failed_tiles_csv_path = slide_dir / "failed_tiles.csv"
+            failed_tiles_file = failed_tiles_csv_path.open('w')
+            failed_tiles_file.write('tile_id' + '\n')
+
+            logging.info(f"Loading slide {slide_id} ...")
+            loader = LoadPandaROId(WSIReader(), level=level, margin=margin)
+            sample = loader(sample)  # load 'image' and 'mask' from disk
+
+            logging.info(f"Tiling slide {slide_id} ...")
+            image_tiles, mask_tiles, tile_locations, occupancies, _ = \
+                generate_tiles(sample, tile_size, occupancy_threshold)
+            n_tiles = image_tiles.shape[0]
+
+            for i in tqdm(range(n_tiles), f"Tiles ({slide_id[:6]}…)", unit="img", disable=not tile_progress):
+                try:
+                    tile_metadata = save_tile(sample, image_tiles[i], mask_tiles[i], tile_locations[i],
+                                              slide_dir)
+                    tile_metadata['occupancy'] = occupancies[i]
+                    tile_metadata['image'] = os.path.join(slide_dir.name, tile_metadata['image'])
+                    tile_metadata['mask'] = os.path.join(slide_dir.name, tile_metadata['mask'])
+                    dataset_row = ','.join(str(tile_metadata[column]) for column in CSV_COLUMNS)
+                    dataset_csv_file.write(dataset_row + '\n')
+                except Exception as e:
+                    tiles_failure += 1
+                    descriptor = get_tile_descriptor(tile_locations[i]) + '\n'
+                    failed_tiles_file.write(descriptor)
+                    traceback.print_exc()
+                    warnings.warn(f"An error occurred while saving tile "
+                                  f"{get_tile_id(slide_id, tile_locations[i])}: {e}")
+
+            dataset_csv_file.close()
+            failed_tiles_file.close()
+            if tiles_failure > 0:
+                # TODO what we want to do with slides that have some failed tiles?
+                logging.warning(f"{slide_id} is incomplete. {tiles_failure} tiles failed.")
+        except Exception as e:
+            traceback.print_exc()
+            warnings.warn(f"An error occurred while processing slide {slide_id}: {e}")
+
+
+def merge_dataset_csv_files(dataset_dir: Path) -> Path:
+    full_csv = dataset_dir / "dataset.csv"
+    # TODO change how we retrieve these filenames, probably because mounted, the operation is slow
+    #  and it seems to find many more files
+    # print("List of files")
+    # print([str(file) + '\n' for file in dataset_dir.glob("*/dataset.csv")])
+    with full_csv.open('w') as full_csv_file:
+        # full_csv_file.write(','.join(CSV_COLUMNS) + '\n')  # write CSV header
+        first_file = True
+        for slide_csv in tqdm(dataset_dir.glob("*/dataset.csv"), desc="Merging dataset.csv", unit='file'):
+            logging.info(f"Merging slide {slide_csv}")
+            content = slide_csv.read_text()
+            if not first_file:
+                content = content[content.index('\n') + 1:]  # discard header row for all but the first file
+            full_csv_file.write(content)
+            first_file = False
+    return full_csv
+
+
+def main(panda_dir: Union[str, Path], root_output_dir: Union[str, Path], level: int, tile_size: int,
+         margin: int, occupancy_threshold: float, parallel: bool = False, overwrite: bool = False) -> None:
+
+    # Ignoring some types here because mypy is getting confused with the MONAI Dataset class
+    # to select a subsample use keyword n_slides
+    dataset = Dataset(PandaDataset(panda_dir))  # type: ignore
+
+    output_dir = Path(root_output_dir) / f"panda_tiles_level{level}_{tile_size}"
+    logging.info(f"Creating dataset of level-{level} {tile_size}x{tile_size} PANDA tiles at: {output_dir}")
+
+    if overwrite and output_dir.exists():
+        shutil.rmtree(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=not overwrite)
+
+    func = functools.partial(process_slide, level=level, margin=margin, tile_size=tile_size,
+                             occupancy_threshold=occupancy_threshold, output_dir=output_dir,
+                             tile_progress=not parallel)
+
+    if parallel:
+        import multiprocessing
+
+        pool = multiprocessing.Pool()
+        map_func = pool.imap_unordered  # type: ignore
+    else:
+        map_func = map  # type: ignore
+
+    list(tqdm(map_func(func, dataset), desc="Slides", unit="img", total=len(dataset)))  # type: ignore
+
+    if parallel:
+        pool.close()
+
+    logging.info("Merging slide files in a single file")
+    merge_dataset_csv_files(output_dir)
+
+
+if __name__ == '__main__':
+    main(panda_dir="/tmp/datasets/PANDA",
+         root_output_dir="/datadrive",
+         level=1,
+         tile_size=224,
+         margin=64,
+         occupancy_threshold=0.05,
+         parallel=True,
+         overwrite=False)
--- a/InnerEye/ML/Histopathology/preprocessing/create_tiles_dataset.py
+++ b/InnerEye/ML/Histopathology/preprocessing/create_tiles_dataset.py
@ -4,13 +4,12 @@
 #  ------------------------------------------------------------------------------------------

 import functools
-import os
 import logging
 import shutil
 import traceback
 import warnings
 from pathlib import Path
-from typing import Sequence, Tuple, Union
+from typing import Any, Dict, Iterable, Optional, Sequence, Tuple, Union

 import numpy as np
 import PIL
@ -18,37 +17,43 @@ from monai.data import Dataset
 from monai.data.image_reader import WSIReader
 from tqdm import tqdm

+from InnerEye.ML.Histopathology.datasets.base_dataset import SlidesDataset
 from InnerEye.ML.Histopathology.preprocessing import tiling
-from InnerEye.ML.Histopathology.datasets.panda_dataset import PandaDataset, LoadPandaROId
-
-
-CSV_COLUMNS = ['slide_id', 'tile_id', 'image', 'mask', 'tile_x', 'tile_y', 'occupancy',
-               'data_provider', 'slide_isup_grade', 'slide_gleason_score']
-TMP_SUFFIX = "_tmp"
+from InnerEye.ML.Histopathology.preprocessing.loading import LoadROId, segment_foreground
+from InnerEye.ML.Histopathology.utils.naming import SlideKey, TileKey

 logging.basicConfig(format='%(asctime)s %(message)s', filemode='w')
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)


-def select_tile(mask_tile: np.ndarray, occupancy_threshold: float) \
-        -> Union[Tuple[bool, float], Tuple[np.ndarray, np.ndarray]]:
+def select_tiles(foreground_mask: np.ndarray, occupancy_threshold: float) \
+        -> Tuple[np.ndarray, np.ndarray]:
+    """Exclude tiles that are mostly background based on estimated occupancy.
+
+    :param foreground_mask: Boolean array of shape (*, H, W).
+    :param occupancy_threshold: Tiles with lower occupancy (between 0 and 1) will be discarded.
+    :return: A tuple containing which tiles were selected and the estimated occupancies. These will
+    be boolean and float arrays of shape (*,), or scalars if `foreground_mask` is a single tile.
+    """
    if occupancy_threshold < 0. or occupancy_threshold > 1.:
        raise ValueError("Tile occupancy threshold must be between 0 and 1")
-    foreground_mask = mask_tile > 0
    occupancy = foreground_mask.mean(axis=(-2, -1))
-    return (occupancy > occupancy_threshold).squeeze(), occupancy.squeeze()
+    return (occupancy > occupancy_threshold).squeeze(), occupancy.squeeze()  # type: ignore


 def get_tile_descriptor(tile_location: Sequence[int]) -> str:
+    """Format the XY tile coordinates into a tile descriptor."""
    return f"{tile_location[0]:05d}x_{tile_location[1]:05d}y"


 def get_tile_id(slide_id: str, tile_location: Sequence[int]) -> str:
+    """Format the slide ID and XY tile coordinates into a unique tile ID."""
    return f"{slide_id}.{get_tile_descriptor(tile_location)}"


 def save_image(array_chw: np.ndarray, path: Path) -> PIL.Image:
+    """Save an image array in (C, H, W) format to disk."""
    path.parent.mkdir(parents=True, exist_ok=True)
    array_hwc = np.moveaxis(array_chw, 0, -1).astype(np.uint8).squeeze()
    pil_image = PIL.Image.fromarray(array_hwc)
@ -56,59 +61,102 @@ def save_image(array_chw: np.ndarray, path: Path) -> PIL.Image:
    return pil_image


-def generate_tiles(sample: dict, tile_size: int, occupancy_threshold: float) \
-        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int]:
-    image_tiles, tile_locations = tiling.tile_array_2d(sample['image'], tile_size=tile_size,
-                                                       constant_values=255)
-    mask_tiles, _ = tiling.tile_array_2d(sample['mask'], tile_size=tile_size, constant_values=0)
+def generate_tiles(slide_image: np.ndarray, tile_size: int, foreground_threshold: float,
+                   occupancy_threshold: float) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
+    """Split the foreground of an input slide image into tiles.

-    selected: np.ndarray
-    occupancies: np.ndarray
-    selected, occupancies = select_tile(mask_tiles, occupancy_threshold)
+    :param slide_image: The RGB image array in (C, H, W) format.
+    :param tile_size: Lateral dimensions of each tile, in pixels.
+    :param foreground_threshold: Luminance threshold (0 to 255) to determine tile occupancy.
+    :param occupancy_threshold: Threshold (between 0 and 1) to determine empty tiles to discard.
+    :return: A tuple containing the image tiles (N, C, H, W), tile coordinates (N, 2), occupancies
+    (N,), and total number of discarded empty tiles.
+    """
+    image_tiles, tile_locations = tiling.tile_array_2d(slide_image, tile_size=tile_size,
+                                                       constant_values=255)
+    foreground_mask, _ = segment_foreground(image_tiles, foreground_threshold)
+
+    selected, occupancies = select_tiles(foreground_mask, occupancy_threshold)
    n_discarded = (~selected).sum()
-    logging.info(f"Percentage tiles discarded: {round(selected.sum() / n_discarded * 100, 2)}")
+    logging.info(f"Percentage tiles discarded: {n_discarded / len(selected) * 100:.2f}")

    image_tiles = image_tiles[selected]
-    mask_tiles = mask_tiles[selected]
    tile_locations = tile_locations[selected]
    occupancies = occupancies[selected]

-    abs_tile_locations = (sample['scale'] * tile_locations + sample['location']).astype(int)
-
-    return image_tiles, mask_tiles, abs_tile_locations, occupancies, n_discarded
+    return image_tiles, tile_locations, occupancies, n_discarded


-# TODO refactor this to separate metadata identification from saving. We might want the metadata
-# even if the saving fails
-def save_tile(sample: dict, image_tile: np.ndarray, mask_tile: np.ndarray,
-              tile_location: Sequence[int], output_dir: Path) -> dict:
-    slide_id = sample['image_id']
+def get_tile_info(sample: Dict[SlideKey, Any], occupancy: float, tile_location: Sequence[int],
+                  rel_slide_dir: Path) -> Dict[TileKey, Any]:
+    """Map slide information and tiling outputs into tile-specific information dictionary.
+
+    :param sample: Slide dictionary.
+    :param occupancy: Estimated tile foreground occuppancy.
+    :param tile_location: Tile XY coordinates.
+    :param rel_slide_dir: Directory where tiles are saved, relative to dataset root.
+    :return: Tile information dictionary.
+    """
+    slide_id = sample[SlideKey.SLIDE_ID]
    descriptor = get_tile_descriptor(tile_location)
-    image_tile_filename = f"train_images/{descriptor}.png"
-    mask_tile_filename = f"train_label_masks/{descriptor}_mask.png"
+    rel_image_path = f"{rel_slide_dir}/{descriptor}.png"

-    save_image(image_tile, output_dir / image_tile_filename)
-    save_image(mask_tile, output_dir / mask_tile_filename)
-
-    tile_metadata = {
-        'slide_id': slide_id,
-        'tile_id': get_tile_id(slide_id, tile_location),
-        'image': image_tile_filename,
-        'mask': mask_tile_filename,
-        'tile_x': tile_location[0],
-        'tile_y': tile_location[1],
-        'data_provider': sample['data_provider'],
-        'slide_isup_grade': sample['isup_grade'],
-        'slide_gleason_score': sample['gleason_score'],
+    tile_info = {
+        TileKey.SLIDE_ID: slide_id,
+        TileKey.TILE_ID: get_tile_id(slide_id, tile_location),
+        TileKey.IMAGE: rel_image_path,
+        TileKey.LABEL: sample[SlideKey.LABEL],
+        TileKey.TILE_X: tile_location[0],
+        TileKey.TILE_Y: tile_location[1],
+        TileKey.OCCUPANCY: occupancy,
+        TileKey.SLIDE_METADATA: {TileKey.from_slide_metadata_key(key): value
+                                 for key, value in sample[SlideKey.METADATA].items()}
    }

-    return tile_metadata
+    return tile_info


-def process_slide(sample: dict, level: int, margin: int, tile_size: int, occupancy_threshold: int,
-                  output_dir: Path, tile_progress: bool = False) -> None:
-    slide_id = sample['image_id']
-    slide_dir: Path = output_dir / (slide_id + "/")
+def format_csv_row(tile_info: Dict[TileKey, Any], keys_to_save: Iterable[TileKey],
+                   metadata_keys: Iterable[str]) -> str:
+    """Format tile information dictionary as a row to write to a dataset CSV tile.
+
+    :param tile_info: Tile information dictionary.
+    :param keys_to_save: Which main keys to include in the row, and in which order.
+    :param metadata_keys: Likewise for metadata keys.
+    :return: The formatted CSV row.
+    """
+    tile_slide_metadata = tile_info.pop(TileKey.SLIDE_METADATA)
+    fields = [str(tile_info[key]) for key in keys_to_save]
+    fields.extend(str(tile_slide_metadata[key]) for key in metadata_keys)
+    dataset_row = ','.join(fields)
+    return dataset_row
+
+
+def process_slide(sample: Dict[SlideKey, Any], level: int, margin: int, tile_size: int,
+                  foreground_threshold: Optional[float], occupancy_threshold: float, output_dir: Path,
+                  tile_progress: bool = False) -> None:
+    """Load and process a slide, saving tile images and information to a CSV file.
+
+    :param sample: Slide information dictionary, returned by the input slide dataset.
+    :param level: Magnification level at which to process the slide.
+    :param margin: Margin around the foreground bounding box, in pixels at lowest resolution.
+    :param tile_size: Lateral dimensions of each tile, in pixels.
+    :param foreground_threshold: Luminance threshold (0 to 255) to determine tile occupancy.
+    If `None` (default), an optimal threshold will be estimated automatically.
+    :param occupancy_threshold: Threshold (between 0 and 1) to determine empty tiles to discard.
+    :param output_dir: Root directory for the output dataset; outputs for a single slide will be
+    saved inside `output_dir/slide_id/`.
+    :param tile_progress: Whether to display a progress bar in the terminal.
+    """
+    slide_metadata: Dict[str, Any] = sample[SlideKey.METADATA]
+    keys_to_save = (TileKey.SLIDE_ID, TileKey.TILE_ID, TileKey.IMAGE, TileKey.LABEL,
+                    TileKey.TILE_X, TileKey.TILE_Y, TileKey.OCCUPANCY)
+    metadata_keys = tuple(TileKey.from_slide_metadata_key(key) for key in slide_metadata)
+    csv_columns: Tuple[str, ...] = (*keys_to_save, *metadata_keys)
+
+    slide_id: str = sample[SlideKey.SLIDE_ID]
+    rel_slide_dir = Path(slide_id)
+    slide_dir = output_dir / rel_slide_dir
    logging.info(f">>> Slide dir {slide_dir}")
    if slide_dir.exists():  # already processed slide - skip
        logging.info(f">>> Skipping {slide_dir} - already processed")
@ -119,50 +167,57 @@ def process_slide(sample: dict, level: int, margin: int, tile_size: int, occupan

            dataset_csv_path = slide_dir / "dataset.csv"
            dataset_csv_file = dataset_csv_path.open('w')
-            dataset_csv_file.write(','.join(CSV_COLUMNS) + '\n')  # write CSV header
+            dataset_csv_file.write(','.join(csv_columns) + '\n')  # write CSV header

-            tiles_failure = 0
+            n_failed_tiles = 0
            failed_tiles_csv_path = slide_dir / "failed_tiles.csv"
            failed_tiles_file = failed_tiles_csv_path.open('w')
            failed_tiles_file.write('tile_id' + '\n')

            logging.info(f"Loading slide {slide_id} ...")
-            loader = LoadPandaROId(WSIReader(), level=level, margin=margin)
-            sample = loader(sample)  # load 'image' and 'mask' from disk
+            loader = LoadROId(WSIReader('cuCIM'), level=level, margin=margin,
+                              foreground_threshold=foreground_threshold)
+            sample = loader(sample)  # load 'image' from disk

            logging.info(f"Tiling slide {slide_id} ...")
-            image_tiles, mask_tiles, tile_locations, occupancies, _ = \
-                generate_tiles(sample, tile_size, occupancy_threshold)
+            image_tiles, rel_tile_locations, occupancies, _ = \
+                generate_tiles(sample[SlideKey.IMAGE], tile_size,
+                               sample[SlideKey.FOREGROUND_THRESHOLD],
+                               occupancy_threshold)
+
+            tile_locations = (sample[SlideKey.SCALE] * rel_tile_locations
+                              + sample[SlideKey.ORIGIN]).astype(int)
+
            n_tiles = image_tiles.shape[0]

+            logging.info(f"Saving tiles for slide {slide_id} ...")
            for i in tqdm(range(n_tiles), f"Tiles ({slide_id[:6]}…)", unit="img", disable=not tile_progress):
                try:
-                    tile_metadata = save_tile(sample, image_tiles[i], mask_tiles[i], tile_locations[i],
-                                              slide_dir)
-                    tile_metadata['occupancy'] = occupancies[i]
-                    tile_metadata['image'] = os.path.join(slide_dir.name, tile_metadata['image'])
-                    tile_metadata['mask'] = os.path.join(slide_dir.name, tile_metadata['mask'])
-                    dataset_row = ','.join(str(tile_metadata[column]) for column in CSV_COLUMNS)
+                    tile_info = get_tile_info(sample, occupancies[i], tile_locations[i], rel_slide_dir)
+                    save_image(image_tiles[i], output_dir / tile_info[TileKey.IMAGE])
+                    dataset_row = format_csv_row(tile_info, keys_to_save, metadata_keys)
                    dataset_csv_file.write(dataset_row + '\n')
                except Exception as e:
-                    tiles_failure += 1
-                    descriptor = get_tile_descriptor(tile_locations[i]) + '\n'
-                    failed_tiles_file.write(descriptor)
+                    n_failed_tiles += 1
+                    descriptor = get_tile_descriptor(tile_locations[i])
+                    failed_tiles_file.write(descriptor + '\n')
                    traceback.print_exc()
                    warnings.warn(f"An error occurred while saving tile "
                                  f"{get_tile_id(slide_id, tile_locations[i])}: {e}")

            dataset_csv_file.close()
            failed_tiles_file.close()
-            if tiles_failure > 0:
+            if n_failed_tiles > 0:
                # TODO what we want to do with slides that have some failed tiles?
-                logging.warning(f"{slide_id} is incomplete. {tiles_failure} tiles failed.")
+                logging.warning(f"{slide_id} is incomplete. {n_failed_tiles} tiles failed.")
+            logging.info(f"Finished processing slide {slide_id}")
        except Exception as e:
            traceback.print_exc()
            warnings.warn(f"An error occurred while processing slide {slide_id}: {e}")


 def merge_dataset_csv_files(dataset_dir: Path) -> Path:
+    """Combines all "*/dataset.csv" files into a single "dataset.csv" file in the given directory."""
    full_csv = dataset_dir / "dataset.csv"
    # TODO change how we retrieve these filenames, probably because mounted, the operation is slow
    #  and it seems to find many more files
@ -181,21 +236,40 @@ def merge_dataset_csv_files(dataset_dir: Path) -> Path:
    return full_csv


-def main(panda_dir: Union[str, Path], root_output_dir: Union[str, Path], level: int, tile_size: int,
-         margin: int, occupancy_threshold: float, parallel: bool = False, overwrite: bool = False) -> None:
+def main(slides_dataset: SlidesDataset, root_output_dir: Union[str, Path],
+         level: int, tile_size: int, margin: int, foreground_threshold: Optional[float],
+         occupancy_threshold: float, parallel: bool = False, overwrite: bool = False,
+         n_slides: Optional[int] = None) -> None:
+    """Process a slides dataset to produce a tiles dataset.
+
+    :param slides_dataset: Input tiles dataset object.
+    :param root_output_dir: The root directory of the output tiles dataset.
+    :param level: Magnification level at which to process the slide.
+    :param tile_size: Lateral dimensions of each tile, in pixels.
+    :param margin: Margin around the foreground bounding box, in pixels at lowest resolution.
+    :param foreground_threshold: Luminance threshold (0 to 255) to determine tile occupancy.
+    If `None` (default), an optimal threshold will be estimated automatically.
+    :param occupancy_threshold: Threshold (between 0 and 1) to determine empty tiles to discard.
+    :param parallel: Whether slides should be processed in parallel with multiprocessing.
+    :param overwrite: Whether to overwrite an existing output tiles dataset. If `True`, will delete
+    and recreate `root_output_dir`, otherwise will resume by skipping already processed slides.
+    :param n_slides: If given, limit the total number of slides for debugging.
+    """

    # Ignoring some types here because mypy is getting confused with the MONAI Dataset class
    # to select a subsample use keyword n_slides
-    dataset = Dataset(PandaDataset(panda_dir))  # type: ignore
+    dataset = Dataset(slides_dataset)[:n_slides]  # type: ignore

-    output_dir = Path(root_output_dir) / f"panda_tiles_level{level}_{tile_size}"
-    logging.info(f"Creating dataset of level-{level} {tile_size}x{tile_size} PANDA tiles at: {output_dir}")
+    output_dir = Path(root_output_dir)
+    logging.info(f"Creating dataset of level-{level} {tile_size}x{tile_size} "
+                 f"{slides_dataset.__class__.__name__} tiles at: {output_dir}")

    if overwrite and output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True, exist_ok=not overwrite)

    func = functools.partial(process_slide, level=level, margin=margin, tile_size=tile_size,
+                             foreground_threshold=foreground_threshold,
                             occupancy_threshold=occupancy_threshold, output_dir=output_dir,
                             tile_progress=not parallel)

@ -217,11 +291,16 @@ def main(panda_dir: Union[str, Path], root_output_dir: Union[str, Path], level:


 if __name__ == '__main__':
-    main(panda_dir="/tmp/datasets/PANDA",
-         root_output_dir="/datadrive",
-         level=1,
+    from InnerEye.ML.Histopathology.datasets.tcga_prad_dataset import TcgaPradDataset
+
+    # Example set up for an existing slides dataset:
+    main(slides_dataset=TcgaPradDataset("/tmp/datasets/TCGA-PRAD"),
+         root_output_dir="/datadrive/TCGA-PRAD_tiles",
+         n_slides=5,
+         level=3,
         tile_size=224,
         margin=64,
+         foreground_threshold=None,
         occupancy_threshold=0.05,
-         parallel=True,
-         overwrite=False)
+         parallel=False,
+         overwrite=True)
--- a/InnerEye/ML/Histopathology/preprocessing/loading.py
+++ b/InnerEye/ML/Histopathology/preprocessing/loading.py
@ -0,0 +1,108 @@
+from typing import Dict, Optional, Tuple
+
+import numpy as np
+import skimage.filters
+from cucim import CuImage
+from health_ml.utils import box_utils
+from monai.data.image_reader import WSIReader
+from monai.transforms import MapTransform
+
+from InnerEye.ML.Histopathology.utils.naming import SlideKey
+
+
+def get_luminance(slide: np.ndarray) -> np.ndarray:
+    """Compute a grayscale version of the input slide.
+
+    :param slide: The RGB image array in (*, C, H, W) format.
+    :return: The single-channel luminance array as (*, H, W).
+    """
+    # TODO: Consider more sophisticated luminance calculation if necessary
+    return slide.mean(axis=-3)  # type: ignore
+
+
+def segment_foreground(slide: np.ndarray, threshold: Optional[float] = None) \
+        -> Tuple[np.ndarray, float]:
+    """Segment the given slide by thresholding its luminance.
+
+    :param slide: The RGB image array in (*, C, H, W) format.
+    :param threshold: Pixels with luminance below this value will be considered foreground.
+    If `None` (default), an optimal threshold will be estimated automatically using Otsu's method.
+    :return: A tuple containing the boolean output array in (*, H, W) format and the threshold used.
+    """
+    luminance = get_luminance(slide)
+    if threshold is None:
+        threshold = skimage.filters.threshold_otsu(luminance)
+    return luminance < threshold, threshold
+
+
+def load_slide_at_level(reader: WSIReader, slide_obj: CuImage, level: int) -> np.ndarray:
+    """Load full slide array at the given magnification level.
+
+    This is a manual workaround for a MONAI bug (https://github.com/Project-MONAI/MONAI/issues/3415)
+    fixed in a currently unreleased PR (https://github.com/Project-MONAI/MONAI/pull/3417).
+
+    :param reader: A MONAI `WSIReader` using cuCIM backend.
+    :param slide_obj: The cuCIM image object returned by `reader.read(<image_file>)`.
+    :param level: Index of the desired magnification level as defined in the `slide_obj` headers.
+    :return: The loaded image array in (C, H, W) format.
+    """
+    size = slide_obj.resolutions['level_dimensions'][level][::-1]
+    slide, _ = reader.get_data(slide_obj, size=size, level=level)  # loaded as RGB PIL image
+    return slide
+
+
+class LoadROId(MapTransform):
+    """Transform that loads a pathology slide, cropped to an estimated bounding box (ROI).
+
+    Operates on dictionaries, replacing the file path in `image_key` with the loaded array in
+    (C, H, W) format. Also adds the following entries:
+    - `SlideKey.ORIGIN` (tuple): top-right coordinates of the bounding box
+    - `SlideKey.SCALE` (float): corresponding scale, loaded from the file
+    - `SlideKey.FOREGROUND_THRESHOLD` (float): threshold used to segment the foreground
+    """
+    def __init__(self, reader: WSIReader, image_key: str = SlideKey.IMAGE, level: int = 0,
+                 margin: int = 0, foreground_threshold: Optional[float] = None) -> None:
+        """
+        :param reader: And instance of MONAI's `WSIReader`.
+        :param image_key: Image key in the input and output dictionaries.
+        :param level: Magnification level to load from the raw multi-scale file.
+        :param margin: Amount in pixels by which to enlarge the estimated bounding box for cropping.
+        :param foreground_threshold: Pixels with luminance below this value will be considered foreground.
+        If `None` (default), an optimal threshold will be estimated automatically using Otsu's method.
+        """
+        super().__init__([image_key], allow_missing_keys=False)
+        self.reader = reader
+        self.image_key = image_key
+        self.level = level
+        self.margin = margin
+        self.foreground_threshold = foreground_threshold
+
+    def _get_bounding_box(self, slide_obj: CuImage) -> Tuple[box_utils.Box, float]:
+        # Estimate bounding box at the lowest resolution (i.e. highest level)
+        highest_level = slide_obj.resolutions['level_count'] - 1
+        scale = slide_obj.resolutions['level_downsamples'][highest_level]
+        slide = load_slide_at_level(self.reader, slide_obj, level=highest_level)
+
+        foreground_mask, threshold = segment_foreground(slide, self.foreground_threshold)
+        bbox = scale * box_utils.get_bounding_box(foreground_mask).add_margin(self.margin)
+        return bbox, threshold
+
+    def __call__(self, data: Dict) -> Dict:
+        image_obj: CuImage = self.reader.read(data[self.image_key])
+
+        level0_bbox, threshold = self._get_bounding_box(image_obj)
+
+        # cuCIM/OpenSlide takes absolute location coordinates in the level 0 reference frame,
+        # but relative region size in pixels at the chosen level
+        origin = (level0_bbox.x, level0_bbox.y)
+        scale = image_obj.resolutions['level_downsamples'][self.level]
+        scaled_bbox = level0_bbox / scale
+
+        data[self.image_key], _ = self.reader.get_data(image_obj, location=origin, level=self.level,
+                                                       size=(scaled_bbox.w, scaled_bbox.h))
+        data[SlideKey.ORIGIN] = origin
+        data[SlideKey.SCALE] = scale
+        data[SlideKey.FOREGROUND_THRESHOLD] = threshold
+
+        image_obj.close()
+        return data
--- a/InnerEye/ML/Histopathology/scripts/azure/azure_tiles_creation.py
+++ b/InnerEye/ML/Histopathology/scripts/azure/azure_tiles_creation.py
@ -1,61 +0,0 @@
-#  ------------------------------------------------------------------------------------------
-#  Copyright (c) Microsoft Corporation. All rights reserved.
-#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
-#  ------------------------------------------------------------------------------------------
-
-"""
-This script is an example of how to use the submit_to_azure_if_needed function from the hi-ml package to run the
-main pre-processing function that creates tiles from slides in the PANDA dataset. The advantage of using this script
-is the ability to submit to a cluster on azureml and to have the output files directly saved as a registered dataset.
-
-To run execute, from inside the pre-processing folder,
-python azure_tiles_creation.py --azureml
-
-A json configuration file containing the credentials to the Azure workspace and an environment.yml file are expected
-in input.
-
-This has been tested on hi-mlv0.1.4.
-"""
-
-from pathlib import Path
-import sys
-import time
-
-current_file = Path(__file__)
-radiomics_root = current_file.absolute().parent.parent.parent.parent.parent
-sys.path.append(str(radiomics_root))
-from health_azure.himl import submit_to_azure_if_needed, DatasetConfig  # noqa
-from InnerEye.ML.Histopathology.preprocessing.create_tiles_dataset import main  # noqa
-
-# Pre-built environment file that contains all the requirements (RadiomicsNN + histo)
-# Assuming ENV_NAME is a complete environment, `conda env export -n ENV_NAME -f ENV_NAME.yml` will create the desired file
-ENVIRONMENT_FILE = radiomics_root.joinpath(Path("/envs/innereyeprivatetiles.yml"))
-DATASET_NAME = "PANDA_tiles"
-timestr = time.strftime("%Y%m%d-%H%M%S")
-folder_name = DATASET_NAME + '_' + timestr
-
-if __name__ == '__main__':
-    print(f"Running {str(current_file)}")
-    input_dataset = DatasetConfig(name="PANDA", datastore="innereyedatasets", local_folder=Path("/tmp/datasets/PANDA"), use_mounting=True)
-    output_dataset = DatasetConfig(name=DATASET_NAME, datastore="innereyedatasets", local_folder=Path("/datadrive/"), use_mounting=True)
-    run_info = submit_to_azure_if_needed(entry_script=current_file,
-                                         snapshot_root_directory=radiomics_root,
-                                         workspace_config_file=Path("config.json"),
-                                         compute_cluster_name='training-pr-nc12',  # training-nd24
-                                         default_datastore="innereyedatasets",
-                                         conda_environment_file=Path(ENVIRONMENT_FILE),
-                                         input_datasets=[input_dataset],
-                                         output_datasets=[output_dataset],
-                                         )
-    input_folder = run_info.input_datasets[0]
-    output_folder = Path(run_info.output_datasets[0], folder_name)
-    print(f'This will be the final ouput folder {str(output_folder)}')
-
-    main(panda_dir=str(input_folder),
-         root_output_dir=str(output_folder),
-         level=1,
-         tile_size=224,
-         margin=64,
-         occupancy_threshold=0.05,
-         parallel=True,
-         overwrite=False)
--- a/InnerEye/ML/Histopathology/utils/naming.py
+++ b/InnerEye/ML/Histopathology/utils/naming.py
@ -5,6 +5,41 @@

 from enum import Enum

+
+class SlideKey(str, Enum):
+    SLIDE_ID = 'slide_id'
+    IMAGE = 'image'
+    IMAGE_PATH = 'image_path'
+    MASK = 'mask'
+    MASK_PATH = 'mask_path'
+    LABEL = 'label'
+    SPLIT = 'split'
+    SCALE = 'scale'
+    ORIGIN = 'origin'
+    FOREGROUND_THRESHOLD = 'foreground_threshold'
+    METADATA = 'metadata'
+
+
+class TileKey(str, Enum):
+    TILE_ID = 'tile_id'
+    SLIDE_ID = 'slide_id'
+    IMAGE = 'image'
+    IMAGE_PATH = 'image_path'
+    MASK = 'mask'
+    MASK_PATH = 'mask_path'
+    LABEL = 'label'
+    SPLIT = 'split'
+    TILE_X = 'tile_x'
+    TILE_Y = 'tile_y'
+    OCCUPANCY = 'occupancy'
+    FOREGROUND_THRESHOLD = 'foreground_threshold'
+    SLIDE_METADATA = 'slide_metadata'
+
+    @staticmethod
+    def from_slide_metadata_key(slide_metadata_key: str) -> str:
+        return 'slide_' + slide_metadata_key
+
+
 class ResultsKey(str, Enum):
    SLIDE_ID = 'slide_id'
    TILE_ID = 'tile_id'
--- a/InnerEye/ML/Histopathology/utils/viz_utils.py
+++ b/InnerEye/ML/Histopathology/utils/viz_utils.py
@ -4,29 +4,32 @@
 #  ------------------------------------------------------------------------------------------

 import math
-import matplotlib.pyplot as plt
+from typing import Any, Dict

+import matplotlib.pyplot as plt
+from monai.data.dataset import Dataset
 from monai.data.image_reader import WSIReader
 from torch.utils.data import DataLoader

 from InnerEye.ML.Histopathology.datasets.panda_dataset import PandaDataset, LoadPandaROId
+from InnerEye.ML.Histopathology.utils.naming import SlideKey


-def load_image_dict(sample: dict, level: int, margin: int) -> dict:
+def load_image_dict(sample: dict, level: int, margin: int) -> Dict[SlideKey, Any]:
    """
    Load image from metadata dictionary
-    param sample: dict describing image metadata. Example:
+    :param sample: dict describing image metadata. Example:
        {'image_id': ['1ca999adbbc948e69783686e5b5414e4'],
        'image': ['/tmp/datasets/PANDA/train_images/1ca999adbbc948e69783686e5b5414e4.tiff'],
         'mask': ['/tmp/datasets/PANDA/train_label_masks/1ca999adbbc948e69783686e5b5414e4_mask.tiff'],
         'data_provider': ['karolinska'],
         'isup_grade': tensor([0]),
         'gleason_score': ['0+0']}
-    param level: level of resolution to be loaded
-    param margin: margin to be included
-    return: a dict containing the image data and metadata
+    :param level: level of resolution to be loaded
+    :param margin: margin to be included
+    :return: a dict containing the image data and metadata
    """
-    loader = LoadPandaROId(WSIReader(), level=level, margin=margin)
+    loader = LoadPandaROId(WSIReader('cuCIM'), level=level, margin=margin)
    img = loader(sample)
    return img

@ -34,25 +37,25 @@ def load_image_dict(sample: dict, level: int, margin: int) -> dict:
 def plot_panda_data_sample(panda_dir: str, nsamples: int, ncols: int, level: int, margin: int,
                           title_key: str = 'data_provider') -> None:
    """
-    param panda_dir: path to the dataset, it's expected a file called "train.csv" exists at the path.
+    :param panda_dir: path to the dataset, it's expected a file called "train.csv" exists at the path.
        Look at the PandaDataset for more detail
-    param nsamples: number of random samples to be visualized
-    param ncols: number of columns in the figure grid. Nrows is automatically inferred
-    param level: level of resolution to be loaded
-    param margin: margin to be included
-    param title_key: key in image_dict used to label each subplot
+    :param nsamples: number of random samples to be visualized
+    :param ncols: number of columns in the figure grid. Nrows is automatically inferred
+    :param level: level of resolution to be loaded
+    :param margin: margin to be included
+    :param title_key: metadata key in image_dict used to label each subplot
    """
-    panda_dataset = PandaDataset(root_dir=panda_dir, n_slides=nsamples)
+    panda_dataset = Dataset(PandaDataset(root=panda_dir))[:nsamples]  # type: ignore
    loader = DataLoader(panda_dataset, batch_size=1)

    nrows = math.ceil(nsamples/ncols)
    fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(9, 9))

    for dict_images, ax in zip(loader, axes.flat):
-        slide_id = dict_images['image_id']
-        title = dict_images[title_key]
+        slide_id = dict_images[SlideKey.SLIDE_ID]
+        title = dict_images[SlideKey.METADATA][title_key]
        print(f">>> Slide {slide_id}")
        img = load_image_dict(dict_images, level=level, margin=margin)
-        ax.imshow(img['image'].transpose(1, 2, 0))
+        ax.imshow(img[SlideKey.IMAGE].transpose(1, 2, 0))
        ax.set_title(title)
    fig.tight_layout()
--- a/Tests/ML/datasets/test_dataset.py
+++ b/Tests/ML/datasets/test_dataset.py
@ -8,7 +8,7 @@ import numpy as np
 import pandas as pd
 import pytest
 import torch
-from pytorch_lightning.core.step_result import Result
+from pytorch_lightning.utilities.data import extract_batch_size

 from InnerEye.Common import common_util
 from InnerEye.ML.config import PaddingMode, SegmentationModelBase
@ -502,7 +502,7 @@ def test_sample_metadata_field() -> None:
    assert SAMPLE_METADATA_FIELD in fields
    # Lightning attempts to determine the batch size by trying to find a tensor field in the sample.
    # This only works if any field other than Metadata is first.
-    assert Result.unpack_batch_size(fields) == batch_size
+    assert extract_batch_size(fields) == batch_size


 def test_custom_collate() -> None:
--- a/Tests/ML/histopathology/datasets/test_slides_dataset.py
+++ b/Tests/ML/histopathology/datasets/test_slides_dataset.py
@ -0,0 +1,40 @@
+import os
+
+import pandas as pd
+
+from InnerEye.Common.fixed_paths_for_tests import tests_root_directory
+from InnerEye.ML.Histopathology.datasets.base_dataset import SlidesDataset
+from InnerEye.ML.Histopathology.utils.naming import SlideKey
+
+HISTO_TEST_DATA_DIR = str(tests_root_directory("ML/histopathology/test_data"))
+
+
+class MockSlidesDataset(SlidesDataset):
+    DEFAULT_CSV_FILENAME = "test_slides_dataset.csv"
+    METADATA_COLUMNS = ('meta1', 'meta2')
+
+    def __init__(self) -> None:
+        super().__init__(root=HISTO_TEST_DATA_DIR)
+
+
+def test_slides_dataset() -> None:
+    dataset = MockSlidesDataset()
+    assert isinstance(dataset.dataset_df, pd.DataFrame)
+    assert dataset.dataset_df.index.name == dataset.SLIDE_ID_COLUMN
+    assert len(dataset) == len(dataset.dataset_df)
+
+    sample = dataset[0]
+    assert isinstance(sample, dict)
+    assert all(isinstance(key, SlideKey) for key in sample)
+
+    expected_keys = [SlideKey.SLIDE_ID, SlideKey.IMAGE, SlideKey.IMAGE_PATH, SlideKey.LABEL,
+                     SlideKey.METADATA]
+    assert all(key in sample for key in expected_keys)
+
+    image_path = sample[SlideKey.IMAGE_PATH]
+    assert isinstance(image_path, str)
+    assert os.path.isfile(image_path)
+
+    metadata = sample[SlideKey.METADATA]
+    assert isinstance(metadata, dict)
+    assert all(meta_col in metadata for meta_col in type(dataset).METADATA_COLUMNS)
--- a/Tests/ML/histopathology/datasets/test_tcga_prad_dataset.py
+++ b/Tests/ML/histopathology/datasets/test_tcga_prad_dataset.py
@ -1,34 +0,0 @@
-#  ------------------------------------------------------------------------------------------
-#  Copyright (c) Microsoft Corporation. All rights reserved.
-#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
-#  ------------------------------------------------------------------------------------------
-
-import os
-
-import pytest
-
-from InnerEye.ML.Histopathology.datasets.default_paths import TCGA_PRAD_DATASET_DIR
-from InnerEye.ML.Histopathology.datasets.tcga_prad_dataset import TcgaPradDataset
-
-
-@pytest.mark.skipif(not os.path.isdir(TCGA_PRAD_DATASET_DIR),
-                    reason="TCGA-PRAD dataset is unavailable")
-def test_dataset() -> None:
-    dataset = TcgaPradDataset(TCGA_PRAD_DATASET_DIR)
-
-    expected_length = 449
-    assert len(dataset) == expected_length
-
-    expected_num_positives = 10
-    assert dataset.dataset_df[dataset.LABEL_COLUMN].sum() == expected_num_positives
-
-    sample = dataset[0]
-    assert isinstance(sample, dict)
-
-    expected_keys = [dataset.SLIDE_ID_COLUMN, dataset.CASE_ID_COLUMN,
-                     dataset.IMAGE_COLUMN, dataset.LABEL_COLUMN]
-    assert all(key in sample for key in expected_keys)
-
-    image_path = sample[dataset.IMAGE_COLUMN]
-    assert isinstance(image_path, str)
-    assert os.path.isfile(image_path)
--- a/Tests/ML/histopathology/preprocessing/test_slide_loading.py
+++ b/Tests/ML/histopathology/preprocessing/test_slide_loading.py
@ -0,0 +1,161 @@
+from typing import Optional
+
+import numpy as np
+import pytest
+from cucim import CuImage
+from monai.data.image_reader import WSIReader
+
+from InnerEye.Common.fixed_paths_for_tests import tests_root_directory
+from InnerEye.ML.Histopathology.preprocessing.tiling import tile_array_2d
+from InnerEye.ML.Histopathology.preprocessing.loading import LoadROId, get_luminance, load_slide_at_level, segment_foreground
+from InnerEye.ML.Histopathology.utils.naming import SlideKey
+from Tests.ML.histopathology.datasets.test_slides_dataset import MockSlidesDataset
+
+TEST_IMAGE_PATH = str(tests_root_directory("ML/histopathology/test_data/panda_wsi_example.tiff"))
+
+
+def test_load_slide() -> None:
+    level = 2
+    reader = WSIReader('cuCIM')
+    slide_obj: CuImage = reader.read(TEST_IMAGE_PATH)
+    dims = slide_obj.resolutions['level_dimensions'][level][::-1]
+
+    slide = load_slide_at_level(reader, slide_obj, level)
+    assert isinstance(slide, np.ndarray)
+    expected_shape = (3, *dims)
+    assert slide.shape == expected_shape
+    frac_empty = (slide == 0).mean()
+    assert frac_empty == 0.0
+
+    larger_dims = (2 * dims[0], 2 * dims[1])
+    larger_slide, _ = reader.get_data(slide_obj, size=larger_dims, level=level)
+    assert isinstance(larger_slide, np.ndarray)
+    assert larger_slide.shape == (3, *larger_dims)
+    # Overlapping parts match exactly
+    assert np.array_equal(larger_slide[:, :dims[0], :dims[1]], slide)
+    # Non-overlapping parts are all empty
+    empty_fill_value = 0  # fill value seems to depend on the image
+    assert np.array_equiv(larger_slide[:, dims[0]:, :], empty_fill_value)
+    assert np.array_equiv(larger_slide[:, :, dims[1]:], empty_fill_value)
+
+
+def test_get_luminance() -> None:
+    level = 2  # here we only need to test at a single resolution
+    reader = WSIReader('cuCIM')
+    slide_obj: CuImage = reader.read(TEST_IMAGE_PATH)
+
+    slide = load_slide_at_level(reader, slide_obj, level)
+    slide_luminance = get_luminance(slide)
+    assert isinstance(slide_luminance, np.ndarray)
+    assert slide_luminance.shape == slide.shape[1:]
+    assert (slide_luminance <= 255).all() and (slide_luminance >= 0).all()
+
+    tiles, _ = tile_array_2d(slide, tile_size=224, constant_values=255)
+    tiles_luminance = get_luminance(tiles)
+    assert isinstance(tiles_luminance, np.ndarray)
+    assert tiles_luminance.shape == (tiles.shape[0], *tiles.shape[2:])
+    assert (tiles_luminance <= 255).all() and (tiles_luminance >= 0).all()
+
+    slide_luminance_tiles, _ = tile_array_2d(np.expand_dims(slide_luminance, axis=0),
+                                             tile_size=224, constant_values=255)
+    assert np.array_equal(slide_luminance_tiles.squeeze(1), tiles_luminance)
+
+
+def test_segment_foreground() -> None:
+    level = 2  # here we only need to test at a single resolution
+    reader = WSIReader('cuCIM')
+    slide_obj: CuImage = reader.read(TEST_IMAGE_PATH)
+    slide = load_slide_at_level(reader, slide_obj, level)
+
+    auto_mask, auto_threshold = segment_foreground(slide, threshold=None)
+    assert isinstance(auto_mask, np.ndarray)
+    assert auto_mask.dtype == bool
+    assert auto_mask.shape == slide.shape[1:]
+    assert 0 < auto_mask.sum() < auto_mask.size  # auto-seg should not produce trivial mask
+    luminance = get_luminance(slide)
+    assert luminance.min() < auto_threshold < luminance.max()
+
+    mask, returned_threshold = segment_foreground(slide, threshold=auto_threshold)
+    assert isinstance(mask, np.ndarray)
+    assert mask.dtype == bool
+    assert mask.shape == slide.shape[1:]
+    assert np.array_equal(mask, auto_mask)
+    assert returned_threshold == auto_threshold
+
+    tiles, _ = tile_array_2d(slide, tile_size=224, constant_values=255)
+    tiles_mask, _ = segment_foreground(tiles, threshold=auto_threshold)
+    assert isinstance(tiles_mask, np.ndarray)
+    assert tiles_mask.dtype == bool
+    assert tiles_mask.shape == (tiles.shape[0], *tiles.shape[2:])
+
+    slide_mask_tiles, _ = tile_array_2d(np.expand_dims(mask, axis=0),
+                                        tile_size=224, constant_values=False)
+    assert np.array_equal(slide_mask_tiles.squeeze(1), tiles_mask)
+
+
+@pytest.mark.parametrize('level', [1, 2])
+@pytest.mark.parametrize('foreground_threshold', [None, 215])
+def test_get_bounding_box(level: int, foreground_threshold: Optional[float]) -> None:
+    margin = 0
+    reader = WSIReader('cuCIM')
+    loader = LoadROId(reader, image_key=SlideKey.IMAGE, level=level, margin=margin,
+                      foreground_threshold=foreground_threshold)
+    slide_obj: CuImage = reader.read(TEST_IMAGE_PATH)
+    level0_bbox, _ = loader._get_bounding_box(slide_obj)
+
+    highest_level = slide_obj.resolutions['level_count'] - 1
+    # level = highest_level
+    slide = load_slide_at_level(reader, slide_obj, level=level)
+    scale = slide_obj.resolutions['level_downsamples'][level]
+    bbox = level0_bbox / scale
+    assert bbox.x >= 0 and bbox.y >= 0
+    assert bbox.x + bbox.w <= slide.shape[1]
+    assert bbox.y + bbox.h <= slide.shape[2]
+
+    # Now with nonzero margin
+    margin = 42
+    loader_margin = LoadROId(reader, image_key=SlideKey.IMAGE, level=level, margin=margin,
+                             foreground_threshold=foreground_threshold)
+    level0_bbox_margin, _ = loader_margin._get_bounding_box(slide_obj)
+    # Here we test the box differences at the highest resolution, because margin is
+    # specified in low-res pixels. Otherwise could fail due to rounding error.
+    level0_scale: float = slide_obj.resolutions['level_downsamples'][highest_level]
+    level0_margin = int(level0_scale * margin)
+    assert level0_bbox_margin.x == level0_bbox.x - level0_margin
+    assert level0_bbox_margin.y == level0_bbox.y - level0_margin
+    assert level0_bbox_margin.w == level0_bbox.w + 2 * level0_margin
+    assert level0_bbox_margin.h == level0_bbox.h + 2 * level0_margin
+
+
+@pytest.mark.parametrize('level', [1, 2])
+@pytest.mark.parametrize('margin', [0, 42])
+@pytest.mark.parametrize('foreground_threshold', [None, 215])
+def test_load_roi(level: int, margin: int, foreground_threshold: Optional[float]) -> None:
+    dataset = MockSlidesDataset()
+    sample = dataset[0]
+    reader = WSIReader('cuCIM')
+    loader = LoadROId(reader, image_key=SlideKey.IMAGE, level=level, margin=margin,
+                      foreground_threshold=foreground_threshold)
+    loaded_sample = loader(sample)
+    assert isinstance(loaded_sample, dict)
+    # Check that none of the input keys were removed
+    assert all(key in loaded_sample for key in sample)
+
+    # Check that the expected new keys were inserted
+    additional_keys = [SlideKey.ORIGIN, SlideKey.SCALE, SlideKey.FOREGROUND_THRESHOLD]
+    assert all(key in loaded_sample for key in additional_keys)
+
+    assert isinstance(loaded_sample[SlideKey.IMAGE], np.ndarray)
+    image_shape = loaded_sample[SlideKey.IMAGE].shape
+    assert len(image_shape)
+    assert image_shape[0] == 3
+
+    origin = loaded_sample[SlideKey.ORIGIN]
+    assert isinstance(origin, tuple)
+    assert len(origin) == 2
+    assert all(isinstance(coord, int) for coord in origin)
+
+    assert isinstance(loaded_sample[SlideKey.SCALE], (int, float))
+    assert loaded_sample[SlideKey.SCALE] >= 1.0
+
+    assert isinstance(loaded_sample[SlideKey.FOREGROUND_THRESHOLD], (int, float))
--- a/Tests/ML/histopathology/test_data/panda_wsi_example.tiff
+++ b/Tests/ML/histopathology/test_data/panda_wsi_example.tiff
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06eb0acaa2883181e9b6ab976863f71cc843a75ed9175fae8fe9b879635af1b0
+size 816563
--- a/Tests/ML/histopathology/test_data/test_slides_dataset.csv
+++ b/Tests/ML/histopathology/test_data/test_slides_dataset.csv
@ -0,0 +1,2 @@
+slide_id,image,label,meta1,meta2
+foo,panda_wsi_example.tiff,0,bar,baz
--- a/environment.yml
+++ b/environment.yml
@ -20,6 +20,7 @@ dependencies:
      - azureml-tensorboard==1.36.0
      - conda-merge==0.1.5
      - cryptography==3.3.2
+      - cucim==21.10.1; platform_system=="Linux"
      - dataclasses-json==0.5.2
      - docker==4.3.1
      - flake8==3.8.3
--- a/pytest.ini
+++ b/pytest.ini
@ -1,6 +1,6 @@
 [pytest]
 testpaths=Tests TestsOutsidePackage TestSubmodule
-norecursedirs=azure-pipelines docs datasets sphinx-docs InnerEye logs outputs test_data
+norecursedirs=azure-pipelines docs sphinx-docs InnerEye logs outputs test_data Tests/ML/datasets
 addopts=--strict-markers
 markers=
    gpu: Test needs a GPU to run