ENH: Add code to create montages from WSI (#787)

This commit is contained in:
Anton Schwaighofer 2023-02-09 19:50:07 +00:00 committed by GitHub
Parent 7b9a0d2453
Commit a055dfae9f
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
24 changed files with 1318 additions and 2 deletions

.github/workflows/cpath-pr.yml vendored

@@ -303,6 +303,22 @@ jobs:
cd ${{ env.folder }}
make smoke_test_tiles_panda_no_ddp_sampler_aml
smoke_test_montage_creation:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
lfs: true
- name: Prepare Conda environment
uses: ./.github/actions/prepare_cpath_environment
- name: smoke test
run: |
cd ${{ env.folder }}
make smoke_test_montage_creation
cpath-publish-pypi-package:
runs-on: ubuntu-20.04
needs: [
@@ -321,6 +337,7 @@ jobs:
smoke_test_slides_panda_loss_analysis,
smoke_test_slides_panda_no_ddp_sampler,
smoke_test_tiles_panda_no_ddp_sampler,
smoke_test_montage_creation,
]
steps:
- uses: actions/checkout@v3


@@ -3,6 +3,7 @@ myst-parser==0.15.2
sphinx==4.1.2
sphinx-autodoc-typehints==1.12.0
sphinx-automodapi==0.13
sphinx-argparse==0.3.1
sphinx-rtd-theme==1.0.0
twine==3.3.0
wheel==0.38.1


@@ -43,6 +43,7 @@ extensions = [
'sphinx_automodapi.automodapi',
'sphinx_autodoc_typehints',
'sphinx.ext.viewcode',
"sphinxarg.ext",
]
numpydoc_show_class_members = False


@@ -0,0 +1,9 @@
Histopathology Scripts
======================

Please see the links below for details on the available arguments for each of the histopathology scripts.

.. toctree::
   :maxdepth: 1

   create_montage


@@ -0,0 +1,8 @@
create_montage
==============

This script can create a high-resolution montage of all images in a folder or a dataset.

.. argparse::
   :ref: health_cpath.utils.montage_config.create_montage_argparser
   :prog: runner.py

Binary data: docs/source/images/montage_from_dataset.png (new file, 19 KiB, binary file not shown)

Binary data: docs/source/images/montage_from_folder.png (new file, 11 KiB, binary file not shown)


@@ -54,6 +54,8 @@ The `hi-ml` toolbox provides
tcga_model.md
ssl_on_tile_dataset.md
dsa.md
montage_creation.md
cpath_scripts
.. toctree::
:maxdepth: 1


@@ -0,0 +1,166 @@
# Creating montages from whole slide images (WSIs)
When working with large amounts of histology data, it is often useful to create montages: collections of images
that are stitched together to form a single image. Montages make it possible to visualize a large amount of data at
once, in a single image that can then also be used for analysis.
The `hi-ml-cpath` toolbox contains scripts that help with the creation of montages from whole slide images (WSIs).
Creating montages can be very time-consuming, so it can be helpful to run the process in the cloud. The montage
creation code provided here can be run in AzureML very easily.
## Types of data for montage creation
1. Montages can be created from a folder of images, by specifying the name of the folder and a glob pattern, like
`**/foo_*.tiff`.
1. Montages can be created by first reading a file called `dataset.csv` located in a folder. `dataset.csv` is
effectively a Pandas DataFrame stored in CSV format, with each row corresponding to a single image.
More details on the format of `dataset.csv` can be found below.
Montage creation works for all WSI image formats that are supported by either of the two possible backends:
- [`openslide`](https://openslide.org/api/python/) supports `.tif(f)`, `.ndpi`, `.scn` and others
- [`cucim`](https://pypi.org/project/cucim/) supports `.svs`, `.tiff` and others
## Setup
- Check out the `hi-ml` repository via `git clone https://github.com/microsoft/hi-ml`
- Run the following commands:
```shell
cd hi-ml-cpath
make env
conda activate HimlHisto
make pip_local
```
All the commands listed below assume that
- you have activated the Conda environment `HimlHisto`
- your current working directory is `<repo root>/hi-ml-cpath`
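To verify the setup, you can print the full list of available commandline options (the same options are also
rendered in the documentation via `sphinx-argparse`):
```shell
python src/health_cpath/scripts/create_montage.py --help
```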
## Creating montages from a folder with files
The following command will create a montage from all files in the folder `/data` that match the pattern
`**/*.tiff`.
```shell
python src/health_cpath/scripts/create_montage.py --dataset /data --image_glob_pattern '**/*.tiff' --level 2 --width 1000 --output_path montage1
```
This will create a montage from all TIFF files in folder `/data`. Each TIFF file is read as a multi-level image, and
level 2 is read for creating the montage.
The `--width` argument determines the width in pixels of the output image. The height of the output image is determined
automatically. Note that the width given here, 1000, is suitable only for montages from a very small number of files
(say, 10). See below for more details on this commandline option.
This will create two images in the folder given by the `--output_path montage1` argument: the files `montage1/montage.jpg` and `montage1/montage.png`.
Here's an example of how this could look for a folder with 6 images, `0.tiff` through `5.tiff`:
![image](images/montage_from_folder.png)
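The same folder-based montage can also be created programmatically. Here is a minimal sketch, assuming the package has
been installed as described in the Setup section, and assuming (hypothetically) that your images live in `/data`:
```python
from pathlib import Path

from health_cpath.utils.montage import MontageCreation

config = MontageCreation()
config.image_glob_pattern = "**/*.tiff"  # same pattern as in the commandline call above
config.level = 2
config.width = 1000
config.output_path = Path("montage1")
# Reads all matching files in /data, then writes montage1/montage.png and montage1/montage.jpg
config.create_montage(input_folder=Path("/data"))
```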
## Creating montages from a `dataset.csv` file
If the montage creation script is pointed only to a folder, without a glob pattern being provided,
it assumes that a file `dataset.csv` is present. A montage will be created from only the images
listed in `dataset.csv`. In addition, the value of an optional `label` column will be added to the text that is
overlaid onto each image.
The dataset file should be a CSV file, with each row corresponding to a single image.
When working with a `dataset.csv` file, the following columns are handled:
| Column name | Contents | Required? |
| ----------- | ---------------------------------------------------------------------------------------------- | --------- |
| `image` | The path of the image that should be loaded | Required |
| `slide_id` | A unique identifier for the slide | Required |
| `label`     | An additional string that will be placed on the montage. This could be `0`, `1`, `tumour`, ...  | Optional  |
| `mask`      | The path of an additional image that will be rendered next to the image given in `image`        | Optional  |
Consider this example dataset file:
```text
image,slide_id,label
2.tiff,ID 2,Label 2
3.tiff,ID 3,Label 3
4.tiff,ID 4,Label 4
5.tiff,ID 5,Label 5
```
Run montage creation with the following command:
```shell
python src/health_cpath/scripts/create_montage.py --dataset /data --level 2 --width 1000 --output_path montage1
```
This would produce (assuming that the images `2.tiff`, `3.tiff`, `4.tiff`, and `5.tiff` are present in the folder
`/data`) a montage similar to this one:
![image](images/montage_from_dataset.png)
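To generate such a `dataset.csv` yourself, a plain Pandas DataFrame with the required columns is sufficient. A minimal
sketch, using the same hypothetical file names and folder as above:
```python
import pandas as pd

# Columns must match the table above: "image" and "slide_id" are required, "label" is optional.
df = pd.DataFrame({
    "image": [f"{i}.tiff" for i in range(2, 6)],
    "slide_id": [f"ID {i}" for i in range(2, 6)],
    "label": [f"Label {i}" for i in range(2, 6)],
})
# The montage script expects the file to be called dataset.csv, at the root of the dataset folder.
df.to_csv("/data/dataset.csv", index=False)
```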
### Using inclusion or exclusion lists
When creating montages from a `dataset.csv` file, it is possible to create montages from only a specific subset
of rows, or from all rows apart from those in a given list.
- Use the `--exclude_by_slide_id exclude.txt` argument to point to a file with a list of slide IDs that should be
excluded from the montage.
- Use the `--include_by_slide_id include.txt` argument to point to a file with a list of slide IDs for which
the montage should be created.
The files `exclude.txt` and `include.txt` should contain one slide ID per line. Note that these files are parsed as CSV, so the first line is treated as a column header, and the slide IDs are read from the first column.
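For example, an `exclude.txt` that excludes slides `ID 2` and `ID 3` from the example dataset above could look like
this (the header name in the first line is arbitrary):
```text
slide_id
ID 2
ID 3
```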
## Other commandline options
- Use `--width=20000` to set the width of the output montage image. The height of the output image is determined
automatically. Note that the default value is 60,000, which is suitable for several hundred input images. If you want to try
out montage creation on a small set of files (say, 10), ensure that you set the width to a reasonably small value,
like `--width=1000`.
- Use `--parallel=2` to specify the number of parallel processes that should be used for creating image thumbnails.
Thumbnails are created in a first step, using multiple processes, and then the thumbnails are stitched into the final
montage in the main process.
- Use `--backend=cucim` to switch the image reader backend to `CuCIM`. The default backend is `openslide`.
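As an illustration, the following call (with hypothetical paths) combines these options to build a montage with 4
worker processes and the `CuCIM` backend:
```shell
python src/health_cpath/scripts/create_montage.py --dataset /data --image_glob_pattern '**/*.tiff' --level 2 --width 1000 --parallel 4 --backend cucim --output_path montage2
```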
## Running in Azure
The `create_montage.py` script can be run in AzureML by adding 3 commandline arguments.
To set up Azure and AzureML:
- Follow the steps in the [AzureML onboarding](azure_setup.md).
- At the end of the onboarding you will download a file `config.json` from your AzureML workspace to your repository
root folder.
- To understand datasets, please read through the [AzureML datasets](datasets.md) documentation. Then create an AzureML
datastore that points to your Azure Blob Storage account.
- Upload your WSIs to a folder in Azure Blob Storage. This can be done most efficiently via
[azcopy](http://aka.ms/azcopy). `azcopy` can also copy directly across cloud providers, for example from AWS to Azure.
The following command will upload all files in the folder `my_test_slides` to a container `datasets` in your Azure Blob
Storage account called `mystorage`, creating a folder `my_test_slides` in the storage account in the process:
```shell
azcopy copy my_test_slides https://mystorage.blob.core.windows.net/datasets/ --recursive
```
The following command will then create a run in AzureML that executes montage creation from that folder:
```shell
python src/health_cpath/scripts/create_montage.py --dataset my_test_slides --level 2 --width 1000 --cluster <clustername> --conda_env environment.yml --datastore <datastorename>
```
In this command, replace the following:
- Replace `my_test_slides` with the name of the folder in blob storage where you uploaded your WSIs.
- `clustername` is the name of a [compute
cluster](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources#create-compute-clusters)
where your job will execute.
- `datastorename` is the name of an AzureML datastore, essentially a pointer to your blob storage account plus the
credentials that are necessary to access it. For the above example, the datastore needs to point to storage account
`mystorage` and container `datasets`.
The command above will only run for a minute or less: it mostly creates a snapshot of the code and sends that off to
the cloud for execution. At the end you will see a link printed out that takes you to the AzureML portal, where you can
monitor the progress of the run.
Once the run is completed, you will find the two files `montage.jpg` and `montage.png` in the "Outputs" tab of the run, and
an option to download them to your machine.

hi-ml-cpath/.vscode/launch.json vendored

@@ -85,5 +85,20 @@
"console": "integratedTerminal",
"justMyCode": false,
},
{
"name": "Montage creation in AzureML",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/src/health_cpath/scripts/create_montage.py",
"args": [
"--dataset=test_montage",
"--width=1000",
"--cluster=lite-testing-ds2",
"--datastore=himldatasets",
"--conda_env=environment.yml"
],
"console": "integratedTerminal",
"justMyCode": true
},
]
}


@@ -301,6 +301,9 @@ smoke_test_tiles_panda_no_ddp_sampler_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tiles_panda_no_ddp_sampler_aml;}
smoke_test_montage_creation:
python src/health_cpath/scripts/create_montage.py --dataset test_montage --width 1000 --cluster lite-testing-ds2 --datastore himldatasets --conda_env environment.yml --wait_for_completion
smoke tests local: smoke_test_cucim_slidespandaimagenetmil_local smoke_test_openslide_slidespandaimagenetmil_local smoke_test_tilespandaimagenetmil_local smoke_test_tcgacrcksslmil_local smoke_test_crck_simclr_local smoke_test_crck_flexible_finetuning_local smoke_test_tcgacrckimagenetmil_local smoke_test_crck_loss_analysis_local smoke_test_slides_panda_loss_analysis_local smoke_test_slides_panda_no_ddp_sampler_local smoke_test_tiles_panda_no_ddp_sampler_local
smoke tests AML: smoke_test_cucim_slidespandaimagenetmil_aml smoke_test_openslide_slidespandaimagenetmil_aml smoke_test_tilespandaimagenetmil_aml smoke_test_tcgacrcksslmil_aml smoke_test_crck_simclr_aml smoke_test_crck_flexible_finetuning_aml smoke_test_tcgacrckimagenetmil_aml smoke_test_crck_loss_analysis_aml smoke_test_slides_panda_loss_analysis_aml smoke_test_slides_panda_no_ddp_sampler_aml smoke_test_tiles_panda_no_ddp_sampler_aml


@@ -0,0 +1,62 @@
# -------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# -------------------------------------------------------------------------------------------
"""
This script can be used to create a montage of slides, given a slides dataset or a folder with images.
For full documentation of the parameters, run `python create_montage.py --help`.
"""
import logging
from pathlib import Path
import sys
from typing import Optional
current_file = Path(__file__).absolute()
repository_root = current_file.parent.parent.parent.parent.parent
folders_to_add = [repository_root / "hi-ml" / "src",
repository_root / "hi-ml-azure" / "src",
repository_root / "hi-ml-cpath" / "src"]
for folder in folders_to_add:
assert folder.is_dir()
sys.path.insert(0, str(folder))
from health_azure.himl import submit_to_azure_if_needed, DatasetConfig # noqa
from health_azure.logging import logging_to_stdout # noqa
from health_cpath.utils.montage import create_config_from_args # noqa
def main() -> None:
config = create_config_from_args()
logging_to_stdout()
submit_to_azureml = config.cluster != ""
if config.dataset.strip() == "":
raise ValueError("Please provide a dataset name via --dataset")
elif config.dataset.startswith("/"):
if submit_to_azureml:
raise ValueError("Cannot submit to AzureML if dataset is a local folder")
input_folder: Optional[Path] = Path(config.dataset)
else:
logging.info(f"In AzureML use mounted dataset '{config.dataset}' in datastore {config.datastore}")
input_dataset = DatasetConfig(name=config.dataset, datastore=config.datastore, use_mounting=True)
logging.info(f"Submitting to AzureML, running on cluster {config.cluster}")
run_info = submit_to_azure_if_needed(
entry_script=current_file,
snapshot_root_directory=repository_root,
compute_cluster_name=config.cluster,
conda_environment_file=config.conda_env,
submit_to_azureml=submit_to_azureml,
input_datasets=[input_dataset],
strictly_aml_v1=True,
docker_shm_size="100g",
wait_for_completion=config.wait_for_completion,
)
input_folder = run_info.input_datasets[0]
assert input_folder is not None
config.create_montage(input_folder)
if __name__ == "__main__":
main()


@@ -0,0 +1,521 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import functools
import logging
import multiprocessing
import shutil
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import torchvision
import torch
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
from monai.data.image_reader import WSIReader
from health_azure.utils import apply_overrides, parse_arguments
from health_cpath.preprocessing.loading import WSIBackend
from health_cpath.utils.montage_config import MontageConfig, create_montage_argparser
from health_cpath.utils.naming import SlideKey
from health_cpath.datasets.base_dataset import SlidesDataset
from health_ml.utils.type_annotations import TupleInt3
MONTAGE_FILE = "montage.png"
DatasetOrDataframe = Union[SlidesDataset, pd.DataFrame]
DatasetRecord = Dict[SlideKey, Any]
logger = logging.getLogger(__name__)
def add_text(image: Image, text: str, y: float = 0.9, color: TupleInt3 = (27, 77, 40), fontsize_step: int = 2) -> None:
"""Add text to a PIL image.
:param image: Image object to which text needs to be added.
:param text: The text that needs to be added.
:param y: Float between 0-1 specifying the vertical position of the text (default=0.9).
:param color: A 3-tuple indicating the fill color of the text (default = (27, 77, 40)).
:param fontsize_step: Steps of font size to reduce if the text size is more than image size (default=2).
"""
# This font is usually found in a path like /usr/share/fonts/truetype/dejavu
font_path = Path('DejaVuSans.ttf')
fontsize = 48
draw = ImageDraw.Draw(image)
image_size_x, image_size_y = image.size
font = ImageFont.truetype(str(font_path), fontsize)
text_size_x, text_size_y = draw.textsize(text, font=font)
while text_size_x >= image_size_x:
fontsize -= fontsize_step
font = ImageFont.truetype(str(font_path), fontsize)
text_size_x, text_size_y = draw.textsize(text, font=font)
start_x = image_size_x // 2 - text_size_x // 2
start_y = image_size_y * y - text_size_y // 2
xy = start_x, start_y
draw.text(xy, text, fill=color, font=font, align='center')
def load_slide_as_pil(reader: WSIReader, slide_file: Path, level: int = 0) -> Image:
"""Load a WSI as a PIL image.
:param reader: The WSI reader for loading the slide.
:param slide_file: The file to read from.
:param level: Resolution downsampling level (default=0).
:return: PIL image object corresponding to the WSI image.
"""
image = reader.read(slide_file)
try:
image_array, _ = reader.get_data(image, level=level)
except ValueError:
logger.warning(f"Level {level} not available for {slide_file}, using level 0 instead.")
image_array, _ = reader.get_data(image, level=0)
array = image_array.numpy().transpose(1, 2, 0)
to_pil = torchvision.transforms.ToPILImage()
array_pil = to_pil(array)
return array_pil
def _make_thumbnail(sample: DatasetRecord, reader: WSIReader, level: int, slide_size: Tuple[int, int], images_dir: Path,
masks_dir: Optional[Path] = None, image_suffix: str = '.png',
default_mask_color: TupleInt3 = (119, 161, 120)) -> None:
"""Make thumbnails of the slides in slides dataset.
:param sample: The slide dataset object dictionary for which thumbnail needs to be created.
:param reader: The WSI reader for loading the slide.
:param slide_size: The tuple of slide size (width, height).
:param images_dir: The path to the `images` directory where WSI thumbnails will be stored.
:param level: Resolution downsampling level.
:param masks_dir: Optional path to `masks` directory where mask thumbnails will be stored.
If `None` (default), masks thumbnails will not be created.
:param image_suffix: Suffix of image thumbnails (default=`.png`).
:param default_mask_color: Color of the masks (default = (119, 161, 120)).
"""
try:
image_pil = load_slide_as_pil(reader, sample[SlideKey.IMAGE], level)
image_pil = image_pil.resize(slide_size)
slide_id = sample.get(SlideKey.SLIDE_ID, "")
# Slide IDs can be purely numeric, in those cases need to convert to str
text_to_add = str(slide_id)
if SlideKey.LABEL in sample:
label = str(sample[SlideKey.LABEL])
text_to_add += ": " + str(label)
if text_to_add:
add_text(image_pil, text_to_add)
if masks_dir is not None and SlideKey.MASK in sample:
masks_dir.mkdir(exist_ok=True)
try:
mask_pil = load_slide_as_pil(reader, sample[SlideKey.MASK], level=level)
mask_pil = mask_pil.resize(slide_size, Image.NEAREST)
mask_pil = Image.fromarray(np.asarray(mask_pil) * 255) # for visualization
except ValueError:
mask_pil = Image.new("RGB", slide_size, default_mask_color)
finally:
mask_path = masks_dir / f"{slide_id}.png"
mask_pil.save(mask_path)
image_path = images_dir / f"{slide_id}{image_suffix}"
image_pil.save(image_path)
except Exception as ex:
slide_id = sample.get(SlideKey.SLIDE_ID, "(no slide ID found)")
logging.warning(f"Unable to process slide with ID '{slide_id}': {ex}")
def make_thumbnails(records: List[DatasetRecord],
slide_size: Tuple[int, int],
images_dir: Path,
level: int,
masks_dir: Optional[Path] = None,
num_parallel: int = 0,
image_suffix: str = '.png',
backend: str = WSIBackend.CUCIM) -> None:
"""Make thumbnails of the slides in slides dataset.
:param records: A list of dataframe records. The records must contain at least the columns `slide_id` and `image`.
:param slide_size: The tuple of slide size (width, height).
:param images_dir: The path to the `images` directory where WSI thumbnails will be stored.
:param level: Resolution downsampling level.
:param masks_dir: Optional path to `masks` directory where mask thumbnails will be stored (default=None).
:param num_parallel: Number of parallel processes for thumbnail creation. Use 0 to disable parallel.
:param image_suffix: Suffix of image thumbnails (default=`.png`).
:param backend: The backend to use for reading the WSI (default=`cucim`).
"""
images_dir.mkdir(exist_ok=True, parents=True)
reader = WSIReader(backend=backend)
func = functools.partial(
_make_thumbnail,
reader=reader,
level=level,
slide_size=slide_size,
images_dir=images_dir,
masks_dir=masks_dir,
image_suffix=image_suffix,
)
if num_parallel > 0:
pool = multiprocessing.Pool(num_parallel)
map_func = pool.imap_unordered # type: ignore
else:
map_func = map # type: ignore
progress = tqdm(map_func(func, records), total=len(records))
list(progress) # type: ignore
if num_parallel > 0:
pool.close()
def make_montage_from_dir(images_dir: Path, num_cols: int, masks_dir: Optional[Path] = None,
image_suffix: str = '.png') -> Image:
"""Create the montage image from the thumbnails.
:param images_dir: The path to the `images` directory where WSI thumbnails will be stored.
:param num_cols: Number of columns in the montage.
:param masks_dir: Optional path to `masks` directory where mask thumbnails will be stored (default=None).
:param image_suffix: Suffix of image thumbnails (default=`.png`).
:return: PIL image of the montage.
"""
image_paths = sorted(images_dir.glob(f'*{image_suffix}'))
if len(image_paths) == 0:
raise ValueError(f"No thumbnail images found in {images_dir}")
images_arrays = []
for image_path in tqdm(image_paths):
image_pil = Image.open(image_path)
images_arrays.append(np.asarray(image_pil))
images_array = np.asarray(images_arrays)
if masks_dir is not None:
mask_paths = sorted(masks_dir.glob('*.png'))
# Don't process masks if there are no files present, even if the mask directory has been passed as an argument.
if len(mask_paths) > 0:
if len(mask_paths) != len(image_paths):
raise ValueError("Number of masks is different from number of images.")
masks_arrays = []
for mask_path in tqdm(mask_paths):
mask_pil = Image.open(mask_path)
masks_arrays.append(np.asarray(mask_pil))
masks_array = np.asarray(masks_arrays)
images_array = np.concatenate((images_array, masks_array), axis=-2)
images_tensor = torch.from_numpy(images_array).permute(0, 3, 1, 2)
grid_tensor = torchvision.utils.make_grid(images_tensor, nrow=num_cols)
grid_pil = torchvision.transforms.ToPILImage()(grid_tensor)
return grid_pil
def dataset_from_folder(root_folder: Path, glob_pattern: str = "**/*") -> pd.DataFrame:
"""Create slides dataset all files in a folder. The function searches for all files in the `root_folder` and its
subfolders, and creates a dataframe with the following columns: `image`: The absolute path of the file,
column `slide_id`: Either the file name only if that is unique, or otherwise the path of the file relative to
the `root_folder`.
:param root_folder: A directory with (image) files.
:param glob_pattern: The glob pattern to match the image files (default=`**/*`, using all files recursively in all
subfolders).
:return: Slides dataset.
"""
if not root_folder.is_dir():
raise ValueError(f"Root folder '{root_folder}' does not exist or is not a directory.")
# Find all image files in the folder, exclude folders in the result
image_paths = list(sorted(f for f in root_folder.glob(glob_pattern) if f.is_file()))
file_names_only = [path.name for path in image_paths]
# Check if file names alone are enough to make the dataset unique.
if len(file_names_only) != len(set(file_names_only)):
# There are duplicates when going only by file names. Hence, use full paths relative to the root folder.
image_ids = [str(path.relative_to(root_folder)) for path in image_paths]
else:
# File names are unique. Hence, use them as slide IDs.
image_ids = file_names_only
# Mounted datasets can show up with '&' appearing escaped as '%26'. Clean up the image IDs if so.
# We expect that the exclusion list shows '&' for the offending slides.
escaped_amp = "%26"
if any(escaped_amp in image_id for image_id in image_ids):
logging.info(f"Some image IDs contain '{escaped_amp}', replacing that with '&'.")
image_ids = [id.replace(escaped_amp, "&") for id in image_ids]
return pd.DataFrame({SlideKey.SLIDE_ID: image_ids, SlideKey.IMAGE: map(str, image_paths)})
def restrict_dataset(dataset: pd.DataFrame,
column: str,
items: List[str],
include: bool) -> pd.DataFrame:
"""Exclude or include slides from a dataset, based on values in a column.
For example, to exclude slides with label `0` from the dataset, use:
restrict_dataset(dataset, column='label', items=['0'], include=False).
The items are matched with the column values using the `isin` operator. The code also handles
the case where the column in question is the dataset index.
If the items in question are not present in the column, the result is an empty dataset (if include=True)
or the original dataset (if include=False).
:param dataset: Slides dataset.
:param column: The name of the column on which the inclusion/exclusion is based.
:param items: The values that the column should match.
:param include: If True, modify the dataset to only include the rows where the column matches a value in the
list `items`. If False, modify the dataset to exclude those rows.
:return: Filtered dataset.
"""
if column in dataset:
matching_rows = dataset[column].isin(items)
if not include:
matching_rows = ~matching_rows
return dataset[matching_rows]
elif dataset.index.name == column:
# Drop or loc operations on an index column when the values do not exist raise an error. Hence, restrict
# to existing values first.
items = list(set(items).intersection(set(dataset.index)))
if include:
return dataset.loc[items]
else:
return dataset.drop(items)
else:
raise ValueError(f"Column {column} not found in dataset.")
def dataset_to_records(dataset: DatasetOrDataframe) -> List[DatasetRecord]:
"""Converts a SlidesDataset or a plain dataframe to a list of dictionaries.
:param dataset: Slides dataset or a plain dataframe.
"""
if isinstance(dataset, pd.DataFrame):
return dataset.to_dict(orient='records')
elif isinstance(dataset, SlidesDataset):
# SlidesDataset overrides __getitem__, use that to convert to records
return [dataset[i] for i in range(len(dataset))]
else:
raise ValueError(f"Can't convert {type(dataset)} to a list of records.")
def make_montage(records: List[DatasetRecord],
out_path: Path,
width: int = 60_000,
level: int = 2,
image_suffix: str = '.png',
masks: bool = True,
temp_dir: Optional[Union[Path, str]] = None,
cleanup: bool = False,
num_parallel: int = 0,
backend: str = "cucim") -> None:
"""Make the montage of WSI thumbnails from a slides dataset.
:param records: A list of dataframe records. The records must contain at least the columns `slide_id` and `image`.
:param out_path: The output path where the montage image will be stored.
:param width: The width of the montage (default=60000).
:param level: Resolution downsampling level at which the WSI will be read (default=2).
:param image_suffix: Suffix of image thumbnails (default=`.png`).
:param masks: Flag to denote if masks need to be included (default=True).
:param temp_dir: Optional path to temporary directory that stores the slide thumbnails.
If `None`(default), a temporary directory will be created in `tmp` folder.
:param cleanup: Flag to determine whether to clean the temporary directory containing thumbnails
after the montage is created (default=False).
:param num_parallel: Number of parallel processes for thumbnail creation. Use 0 or 1 to disable parallel.
:param backend: The backend to use for reading the WSI (default=`cucim`).
"""
# There might be some blanks at the bottom right
# rows * cols <= N
# We are going to stack the slides and their masks side by side, so we need 2 * cols
# 2 * cols / rows = 16 / 9; rows = 2 * cols * 9 / 16
# cols * 2 * cols * 9 / 16 <= N; cols <= (N / 2 * 16 / 9)**(1 / 2)
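# Example: 6 slides without masks gives num_cols = int(sqrt(6 * 16 / 9)) = 3;
# with masks enabled, num_cols = int(sqrt(6 / 2 * 16 / 9)) = 2.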
num_slides = len(records)
multiplier = 2 if masks else 1
num_cols = int(np.sqrt(num_slides / multiplier * 16 / 9))
logging.info(f"Creating montage from {num_slides} slides with {num_cols} columns.")
slide_width = (width // num_cols) // multiplier
slide_size = slide_width, slide_width // 2
temp_dir = tempfile.mkdtemp() if temp_dir is None else temp_dir
temp_dir = Path(temp_dir)
image_thumbnail_dir = temp_dir / "images"
mask_thumbnail_dir = temp_dir / "masks" if masks else None
if image_thumbnail_dir.is_dir():
logging.info(f"Skipping thumbnail creation because folder already exists: {image_thumbnail_dir}")
else:
logging.info(f"Starting thumbnail creation with thumbnail size {slide_size}")
make_thumbnails(records=records, slide_size=slide_size, images_dir=image_thumbnail_dir, level=level,
masks_dir=mask_thumbnail_dir, image_suffix=image_suffix, num_parallel=num_parallel,
backend=backend)
try:
logging.info("Starting montage creation")
montage_pil = make_montage_from_dir(image_thumbnail_dir,
num_cols,
masks_dir=mask_thumbnail_dir,
image_suffix=image_suffix)
except Exception as ex:
raise ValueError(f"Failed to create montage from {image_thumbnail_dir}: {ex}")
logger.info(f"Saving montage to {out_path}")
montage_pil.save(out_path)
if out_path.suffix != '.jpg':
jpeg_out = out_path.with_suffix('.jpg')
montage_pil.save(jpeg_out, format='JPEG', quality=90)
if cleanup:
shutil.rmtree(temp_dir)
class MontageCreation(MontageConfig):
def read_list(self, csv_file_path: Optional[Path]) -> List[str]:
"""Reads a list of slide IDs from a file."""
if csv_file_path:
df = pd.read_csv(csv_file_path)
column_to_read = df.columns[0]
if len(df.columns) > 1:
logger.warning(f"More than one column in file, using first column: {column_to_read}")
return df[column_to_read].tolist()
else:
return []
def read_exclusion_list(self) -> List[str]:
"""Read the list of slide IDs that should be excluded from the montage."""
if self.exclude_by_slide_id:
slides_to_exclude = self.read_list(self.exclude_by_slide_id)
logger.info(f"Excluding {len(slides_to_exclude)} slides from montage. First 3: {slides_to_exclude[:3]}")
logger.info("Exclusion list will be matched against the Slide ID column (for predefined datasets) or the "
"filename.")
return slides_to_exclude
else:
return []
def read_inclusion_list(self) -> List[str]:
"""Read the list of slide IDs that should be included in the montage."""
if self.include_by_slide_id:
slides_to_include = self.read_list(self.include_by_slide_id)
logger.info(f"Restricting montage to {len(slides_to_include)} slides. First 3: {slides_to_include[:3]}")
logger.info("Inclusion list will be matched against the Slide ID column (for predefined datasets) or the "
"filename.")
return slides_to_include
else:
return []
def read_dataset(self, input_folder: Path) -> DatasetOrDataframe:
"""Read the dataset that should be used for creating the montage. If a glob pattern has been provided, then
all the image files specified by that pattern will be used for the montage. Otherwise, a file `dataset.csv`
is expected in the input folder. The `dataset.csv` will be used to create an instance of `SlidesDataset`.
:param input_folder: The folder where the dataset is located.
:return: A SlidesDataset or dataframe object that contains the dataset."""
if self.image_glob_pattern:
logger.info(f"Trying to create a dataset from files that match: {self.image_glob_pattern}")
try:
dataset = dataset_from_folder(input_folder, glob_pattern=self.image_glob_pattern)
except Exception as ex:
raise ValueError(f"Unable to create dataset from files in folder {input_folder}: {ex}")
if len(dataset) == 0:
raise ValueError(f"No images found in folder {input_folder} with pattern {self.image_glob_pattern}")
return dataset
else:
logger.info(f"Trying to load the dataset as a SlidesDataset from folder {input_folder}")
try:
dataset = SlidesDataset(root=input_folder)
except Exception as ex:
logging.error("Unable to load dataset.")
file = input_folder / SlidesDataset.DEFAULT_CSV_FILENAME
# Print the whole directory tree to check where the problem is.
while str(file) != str(file.root):
logging.debug(f"File: {file}, exists: {file.exists()}")
file = file.parent
raise ValueError(f"Unable to load dataset. Check if the file {SlidesDataset.DEFAULT_CSV_FILENAME} "
f"exists, or provide a file name pattern via --image_glob_pattern. Error: {ex}")
return dataset
def create_montage(self, input_folder: Path) -> None:
"""Creates a montage from the dataset in the input folder. The method reads the dataset, creates an output
folder, handles the inclusion and exclusion lists, and then calls the method that creates the montage.
:param input_folder: The folder where the dataset is located.
:raises ValueError: If both an inclusion and exclusion list have been provided.
"""
dataset = self.read_dataset(input_folder)
self.output_path.mkdir(parents=True, exist_ok=True)
if self.include_by_slide_id and self.exclude_by_slide_id:
raise ValueError("You cannot provide both an inclusion and exclusion list.")
if self.include_by_slide_id:
items = self.read_inclusion_list()
exclude_items = False
elif self.exclude_by_slide_id:
items = self.read_exclusion_list()
exclude_items = True
else:
items = []
exclude_items = True
self.montage_from_included_and_excluded_slides(
dataset=dataset,
items=items,
exclude_items=exclude_items,
)
def montage_from_included_and_excluded_slides(self,
dataset: DatasetOrDataframe,
items: Optional[List[str]] = None,
exclude_items: bool = True,
restrict_by_column: str = "") -> Optional[Path]:
"""Creates a montage of included and excluded slides from the dataset.
:param dataset: Slides dataset or a plain dataframe.
:param items: A list of slide ID values that should be included in or excluded from the montage.
:param exclude_items: If True, exclude the list in `items` from the montage. If False, include
only those in the montage.
:param restrict_by_column: The column name that should be used for inclusion/exclusion lists
(default=dataset.SLIDE_ID_COLUMN).
:return: A path to the created montage, or None if no images were available for creating the montage.
"""
if isinstance(dataset, pd.DataFrame):
df_original = dataset
else:
df_original = dataset.dataset_df
logging.info(f"Input dataset contains {len(df_original)} records.")
if restrict_by_column == "":
if isinstance(dataset, pd.DataFrame):
restrict_by_column = SlideKey.SLIDE_ID.value
else:
restrict_by_column = dataset.SLIDE_ID_COLUMN
if items:
if exclude_items:
logging.info(f"Using dataset column '{restrict_by_column}' to exclude slides")
include = False
else:
logging.info(f"Using dataset column '{restrict_by_column}' to restrict the set of slides")
include = True
df_restricted = restrict_dataset(
df_original,
column=restrict_by_column,
items=items,
include=include)
logging.info(f"Updated dataset contains {len(df_restricted)} records")
else:
df_restricted = df_original
montage_result = self.output_path / MONTAGE_FILE
logging.info(f"Creating montage in {montage_result}")
if isinstance(dataset, pd.DataFrame):
records_restricted = dataset_to_records(df_restricted)
else:
dataset.dataset_df = df_restricted
records_restricted = dataset_to_records(dataset)
# We had to modify the dataset in place, hence restore the original dataset to avoid odd side-effects.
dataset.dataset_df = df_original
if len(records_restricted) > 0:
make_montage(
records=records_restricted,
out_path=montage_result,
width=self.width,
level=self.level,
masks=False,
cleanup=True,
num_parallel=self.parallel,
backend=self.backend)
return montage_result
else:
logging.info("No slides to include in montage, skipping.")
return None
def create_config_from_args() -> MontageConfig:
"""Creates a configuration object for montage creation from the commandline arguments.
:return: An object that describes all options for the montage creation.
"""
parser = create_montage_argparser()
config = MontageCreation()
parser_results = parse_arguments(parser, args=sys.argv[1:], fail_on_unknown_args=True)
_ = apply_overrides(config, parser_results.args)
return config


@@ -0,0 +1,75 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from argparse import ArgumentParser
from pathlib import Path
from typing import Optional
import param
from health_azure.utils import create_argparser
class AzureRunConfig(param.Parameterized):
cluster: str = \
param.String(default="", allow_None=False,
doc="The name of the GPU or CPU cluster inside the AzureML workspace"
"that should execute the job. To run on your local machine, omit this argument.")
datastore = \
param.String(default="",
doc="The name of the AzureML datastore where the dataset is defined.")
dataset = \
param.String(default="",
doc="The name of the AzureML dataset to use for creating the montage. The dataset will be "
"mounted automatically. Use an absolute path to a folder on the local machine to bypass "
"mounting.")
conda_env: Optional[Path] = \
param.ClassSelector(class_=Path, default=Path("hi-ml/hi-ml-cpath/environment.yml"), allow_None=True,
doc="The Conda environment file that should be used when submitting the present run to "
"AzureML. If not specified, the hi-ml-cpath environment file will be used.")
wait_for_completion: bool = param.Boolean(default=False,
doc="If True, wait for AML Run to complete before proceeding. "
"If False, submit the run to AML and exit")
class MontageConfig(AzureRunConfig):
level: int = \
param.Integer(default=1,
doc="Resolution downsample level, e.g. if lowest resolution is 40x and the available "
"downsample levels are [1.0, 4.0, 16.0] then level = 1 corresponds to 10x magnification")
exclude_by_slide_id: Optional[Path] = \
param.ClassSelector(class_=Path, default=None, allow_None=True,
doc="Provide a file that contains slide IDs that should be excluded. File format is "
"CSV, the first column is used as the slide ID. If the file is empty, no slides "
"will be excluded.")
include_by_slide_id: Optional[Path] = \
param.ClassSelector(class_=Path, default=None, allow_None=True,
doc="Provide a file that contains slide IDs that should be included. File format is "
"CSV, the first column is used as the slide ID. If the file is empty, no montage "
"will be produced.")
image_glob_pattern: str = \
param.String(default="",
doc="When provided, use this pattern in rglob to find the files that should be included in the "
"montage. Example: '**/*.tiff' to find all TIFF files recursive. You may have to escape "
"the pattern in your shell.")
width: int = \
param.Integer(default=60_000,
doc="The width of the montage in pixels")
output_path: Path = \
param.ClassSelector(class_=Path,
default=Path("outputs"),
doc="The folder where the montage will be saved")
parallel: int = \
param.Integer(default=8,
doc="The number of parallel processes to use when creating the montage.")
backend: str = \
param.String(default="openslide",
doc="The backend to use for reading the slides. Can be 'openslide' or 'cucim'")
def create_montage_argparser() -> ArgumentParser:
return create_argparser(
MontageConfig(),
usage="python create_montage.py --dataset <dataset_folder> --image_glob_pattern '**/*.tiff' --width 1000",
description="Create an overview image with thumbnails of all slides in a dataset.")

Binary data: hi-ml-cpath/testhisto/test_data/montages/montage_excluded.png (new file, 19 KiB, binary file not shown)

Binary data: hi-ml-cpath/testhisto/test_data/montages/montage_from_folder.png (new file, 11 KiB, binary file not shown)

Binary file not shown (5.0 KiB)

Binary data: hi-ml-cpath/testhisto/test_data/montages/montage_included.png (new file, 18 KiB, binary file not shown)

Binary data: hi-ml-cpath/testhisto/test_data/montages/montage_via_args.png (new file, 3.7 KiB, binary file not shown)

Binary data: hi-ml-cpath/testhisto/test_data/montages/montage_with_masks.png (new file, 13 KiB, binary file not shown)

Binary data: hi-ml-cpath/testhisto/test_data/montages/montage_without_masks.png (new file, 15 KiB, binary file not shown)


@@ -53,6 +53,7 @@ class MockPandaSlidesGenerator(MockHistoDataGenerator):
:param n_tiles_list: A list to use different n_tiles per slide for randomly positioned tiles.
:param kwargs: Same params passed to MockHistoDataGenerator.
"""
self.generated_files: List[str] = []
super().__init__(**kwargs)
self.n_levels = n_levels
@@ -120,13 +121,14 @@ class MockPandaSlidesGenerator(MockHistoDataGenerator):
else tiles[i % self.n_tiles].numpy()
)
# fill the square diagonal with tile repeated n_repeat_tile times along X and Y axis.
fill_square = np.tile(tile, (self.n_repeat_tile, self.n_repeat_tile))
fill_square: Union[np.ndarray, float] = np.tile(tile, (self.n_repeat_tile, self.n_repeat_tile))
dump_tiles.append(tile)
elif self.mock_type == MockHistoDataType.FAKE:
if i == 0 or self.n_tiles > 1:
# pick a random fake value to fill in the square diagonal.
fill_square = np.random.uniform(0, self.background_val / (self.n_repeat_diag + 1) * (i + 1))
upper = self.background_val / (self.n_repeat_diag + 1) * (i + 1)
fill_square = np.random.uniform(0, upper)
dump_tiles.append(
np.full(
shape=(self.n_channels, self.tile_size, self.tile_size),
@@ -222,6 +224,7 @@ class MockPandaSlidesGenerator(MockHistoDataGenerator):
slide_tiff_filename = self.dest_data_path / "train_images" / f"_{slide_counter}.tiff"
self._save_mock_wsi_as_tiff_file(slide_tiff_filename, wsi_levels)
self.generated_files.append(str(slide_tiff_filename))
if dump_tiles is not None:
dump_tiles_filename = self.dest_data_path / "dump_tiles" / f"_{slide_counter}.npy"


@@ -0,0 +1,416 @@
import shutil
from pathlib import Path
from unittest import mock
import numpy as np
import pandas as pd
import pytest
from PIL import Image
from pandas.testing import assert_frame_equal
from typing import Generator, List
from health_cpath.utils.montage import (
MONTAGE_FILE,
MontageCreation,
dataset_from_folder,
dataset_to_records,
make_montage,
make_montage_from_dir,
restrict_dataset,
)
from health_cpath.datasets.base_dataset import SlidesDataset
from health_cpath.datasets.panda_dataset import PandaDataset
from health_cpath.scripts.create_montage import main as script_main
from health_cpath.utils.naming import SlideKey
from testhisto.mocks.base_data_generator import MockHistoDataType
from testhisto.mocks.slides_generator import MockPandaSlidesGenerator
from testhisto.utils.utils_testhisto import assert_binary_files_match, full_ml_test_data_path, wait_until_file_exists
# Set this to True to update all stored images in the test_data folder.
UPDATE_STORED_RESULTS = False
NUM_SLIDES = 6
def expected_results_folder() -> Path:
"""Gets the path to the folder where the expected montage results are stored.
:return: The path to the folder where the expected results are stored.
"""
return full_ml_test_data_path("montages")
def _create_slides_images(tmp_path: Path) -> MockPandaSlidesGenerator:
print(f"Result folder: {tmp_path}")
wsi_generator = MockPandaSlidesGenerator(
dest_data_path=tmp_path,
mock_type=MockHistoDataType.FAKE,
n_tiles=4,
n_slides=NUM_SLIDES,
n_channels=3,
n_levels=3,
tile_size=28,
background_val=255,
)
wsi_generator.generate_mock_histo_data()
print(f"Generated images in {tmp_path}")
return wsi_generator
@pytest.fixture(scope="module")
def temp_panda_dataset(tmp_path_factory: pytest.TempPathFactory) -> Generator:
"""A fixture that creates a PandaDataset object with randomly created slides.
"""
tmp_path = tmp_path_factory.mktemp("mock_panda")
_create_slides_images(tmp_path)
usecols = [PandaDataset.SLIDE_ID_COLUMN, PandaDataset.MASK_COLUMN]
yield PandaDataset(root=tmp_path, dataframe_kwargs={"usecols": usecols + list(PandaDataset.METADATA_COLUMNS)})
@pytest.fixture(scope="module")
def temp_slides(tmp_path_factory: pytest.TempPathFactory) -> Generator:
"""A fixture that creates a folder (Path object) with randomly created slides.
"""
tmp_path = tmp_path_factory.mktemp("mock_wsi")
_create_slides_images(tmp_path)
yield tmp_path
@pytest.fixture(scope="module")
def temp_slides_dataset(tmp_path_factory: pytest.TempPathFactory) -> Generator:
"""A fixture that creates a SlidesDataset object with randomly created slides.
"""
tmp_path = tmp_path_factory.mktemp("mock_slides")
wsi_generator = _create_slides_images(tmp_path)
# Create a CSV file with the 3 required columns for montage creation. Mask is optional.
metadata = {
SlideKey.SLIDE_ID: [f"ID {i}" for i in range(NUM_SLIDES)],
SlideKey.IMAGE: wsi_generator.generated_files,
SlideKey.LABEL: [f"Label {i}" for i in range(NUM_SLIDES)],
}
df = pd.DataFrame(data=metadata)
csv_filename = tmp_path / SlidesDataset.DEFAULT_CSV_FILENAME
df.to_csv(csv_filename, index=False)
# Tests fail non-deterministically, saying that the dataset file does not exist (yet). Hence, wait.
wait_until_file_exists(csv_filename)
yield SlidesDataset(root=tmp_path)
def _create_folder_with_images(tmp_path: Path, num_images: int = 4, image_size: int = 20) -> None:
"""Creates a folder with images.
:param tmp_path: The path to the folder where the images should be stored.
:param num_images: The number of images that should be created.
:param image_size: The width and height of each image, in pixels.
"""
np.random.seed(42)
tmp_path.mkdir(parents=True, exist_ok=True)
for i in range(num_images):
image_path = tmp_path / f"image_{i}.png"
image_np = np.random.uniform(0, 255, size=(image_size, image_size, 3)).astype(np.uint8)
image = Image.fromarray(image_np)
image.save(image_path)
def test_montage_from_dir(tmp_path: Path) -> None:
"""Test montage creation from a directory of images."""
print(f"Result folder: {tmp_path}")
np.random.seed(42)
# Create a directory of images
image_dir = tmp_path / "images"
thumb_size = 20
_create_folder_with_images(image_dir, num_images=4, image_size=thumb_size)
# Create a montage from the directory
file_name = "montage_from_random_thumbs.png"
montage_path = tmp_path / file_name
montage_image = make_montage_from_dir(image_dir, num_cols=2)
# We have 2 columns, so the montage should be 2x the size of the thumbnail, plus a 2px border and space in between
pad = 2
expected_size = 2 * thumb_size + 3 * pad
assert montage_image.size == (expected_size, expected_size)
montage_image.save(montage_path)
assert montage_path.is_file()
expected_file = expected_results_folder() / file_name
if UPDATE_STORED_RESULTS:
shutil.copyfile(montage_path, expected_file)
assert_binary_files_match(montage_path, expected_file)
@pytest.mark.parametrize("use_masks", [True, False])
def test_montage_from_dataset(tmp_path: Path, temp_panda_dataset: PandaDataset, use_masks: bool) -> None:
"""Test if a montage can be generated from a slides dataset that uses masks."""
dataset = dataset_to_records(temp_panda_dataset)
montage = tmp_path / "montage.png"
make_montage(dataset, out_path=montage, width=1000, num_parallel=1, masks=use_masks)
assert montage.is_file()
expected_file = expected_results_folder() / ("montage_with_masks.png" if use_masks else "montage_without_masks.png")
if UPDATE_STORED_RESULTS:
shutil.copyfile(montage, expected_file)
assert_binary_files_match(montage, expected_file)
def test_restrict_dataset() -> None:
column = "image_id"
dataset = pd.DataFrame({column: ["a", "b", "c"]})
included = restrict_dataset(dataset, column, ["a"], include=True)
assert len(included) == 1
assert included.iloc[0][column] == "a"
excluded = restrict_dataset(dataset, column, ["a"], include=False)
assert len(excluded) == 2
assert excluded.iloc[0][column] == "b"
assert excluded.iloc[1][column] == "c"
# Check the case when the requested value is not in the dataset
included2 = restrict_dataset(dataset, column, ["nope"], include=True)
assert len(included2) == 0
excluded2 = restrict_dataset(dataset, column, ["nope"], include=False)
assert len(excluded2) == 3
def test_restrict_dataset_with_index() -> None:
column = "image_id"
index_column = "index"
dataset = pd.DataFrame({column: ["a", "b", "c"], index_column: ["0", "1", "2"]})
dataset = dataset.set_index(index_column)
included = restrict_dataset(dataset, index_column, ["1"], include=True)
assert len(included) == 1
assert included.iloc[0][column] == "b"
excluded = restrict_dataset(dataset, index_column, ["1"], include=False)
assert len(excluded) == 2
assert excluded.iloc[0][column] == "a"
assert excluded.iloc[1][column] == "c"
# Check the case when the requested value is not in the dataset
included2 = restrict_dataset(dataset, column, ["nope"], include=True)
assert len(included2) == 0
excluded2 = restrict_dataset(dataset, column, ["nope"], include=False)
assert len(excluded2) == 3
@pytest.mark.parametrize("exclude_items", [True, False])
def test_montage_included_and_excluded1(
tmp_path: Path,
temp_slides_dataset: SlidesDataset,
exclude_items: bool) -> None:
"""Check that a montage with exclusion list is handled correctly."""
config = MontageCreation()
out_path = tmp_path / "montage"
out_path.mkdir(exist_ok=True)
config.output_path = out_path
config.width = 1000
config.parallel = 1
config.montage_from_included_and_excluded_slides(
temp_slides_dataset,
items=["ID 0", "ID 1"],
exclude_items=exclude_items,
)
expected_file = expected_results_folder() / ("montage_excluded.png" if exclude_items else "montage_included.png")
montage_file = out_path / MONTAGE_FILE
assert montage_file.is_file()
if UPDATE_STORED_RESULTS:
shutil.copyfile(montage_file, expected_file)
assert_binary_files_match(montage_file, expected_file)
def test_montage_included_and_excluded2(tmp_path: Path, temp_slides_dataset: SlidesDataset) -> None:
"""Check that a montage with exclusion list supplies the correct set of images."""
out_path = tmp_path / "montage"
out_path.mkdir(exist_ok=True)
config = MontageCreation()
config.output_path = out_path
config.parallel = 1
config.width = 1000
for exclude_items in [True, False]:
with mock.patch("health_cpath.utils.montage.make_montage") as mock_montage:
montage_file = config.montage_from_included_and_excluded_slides(
temp_slides_dataset,
items=["ID 0", "ID 1"],
exclude_items=exclude_items,
)
assert montage_file is not None
assert mock_montage.call_count == 1
records = mock_montage.call_args_list[0][1]["records"]
assert isinstance(records, List)
slide_ids = sorted([d[SlideKey.SLIDE_ID] for d in records])
if exclude_items:
assert slide_ids == ["ID 2", "ID 3", "ID 4", "ID 5"]
else:
assert slide_ids == ["ID 0", "ID 1"]
def test_dataset_from_folder_unique(tmp_path: Path) -> None:
"""Test if a plain dataframe can be created from files in a folder.
This tests the case where the file names alone are unique."""
file_names = ["file1.txt", "file2.txt"]
full_files = [tmp_path / file_name for file_name in file_names]
for f in full_files:
f.touch()
df = dataset_from_folder(tmp_path)
expected_df = pd.DataFrame({SlideKey.SLIDE_ID: file_names, SlideKey.IMAGE: map(str, full_files)})
assert_frame_equal(df, expected_df)
def test_dataset_from_folder_duplicate_files(tmp_path: Path) -> None:
"""Test if a plain dataframe can be created from files in a folder.
This tests the case where the file names alone are not unique."""
# Place files of the same name in different folders. The dataset should still be created, with the full path
# as the slide ID.
file_names = ["folder1/file.txt", "folder2/file.txt"]
full_files = [tmp_path / file_name for file_name in file_names]
for f in full_files:
f.parent.mkdir(exist_ok=True)
f.touch()
df = dataset_from_folder(tmp_path)
expected_df = pd.DataFrame({SlideKey.SLIDE_ID: file_names, SlideKey.IMAGE: map(str, full_files)})
assert_frame_equal(df, expected_df)
def test_dataset_from_folder_fails(tmp_path: Path) -> None:
"""Test if dataframe creation fails if the argument is not a folder."""
with pytest.raises(ValueError, match="does not exist or is not a directory"):
dataset_from_folder(tmp_path / "file.txt")
def test_montage_from_folder(tmp_path: Path, temp_slides: Path) -> None:
"""Test if a montage can be created from files in a folder."""
dataset = dataset_from_folder(temp_slides, glob_pattern="**/*.tiff")
assert len(dataset) == NUM_SLIDES
config = MontageCreation()
config.output_path = tmp_path
config.width = 1000
config.parallel = 2
result_file = config.montage_from_included_and_excluded_slides(dataset)
assert result_file is not None
assert result_file.is_file()
expected_file = expected_results_folder() / "montage_from_folder.png"
if UPDATE_STORED_RESULTS:
shutil.copyfile(result_file, expected_file)
assert_binary_files_match(result_file, expected_file)
def test_montage_from_folder_full(tmp_path: Path, temp_slides: Path) -> None:
"""Test if a montage can be created from files in a folder, using the commandline entrypoint."""
config = MontageCreation()
config.image_glob_pattern = "**/*.tiff"
config.width = 1000
config.parallel = 1
config.output_path = tmp_path / "outputs"
# Cucim is the only backend that supports TIFF files as created in the test images, openslide fails.
config.backend = "cucim"
config.create_montage(input_folder=temp_slides)
assert (config.output_path / "montage.png").is_file()
def test_montage_fails(tmp_path: Path) -> None:
"""Test if montage creation exits gracefully if files can't be read."""
# Create a single invalid TIFF file. The code will fail when trying to read the images (during thumbnail
# creation), but it should still reach the point where it creates the montage. There, it will fail because
# there are no thumbnails present.
image_file = tmp_path / "image.tiff"
image_file.touch()
config = MontageCreation()
config.image_glob_pattern = "**/*.tiff"
config.width = 1000
config.parallel = 1
config.input_folder = tmp_path
with pytest.raises(ValueError, match="Failed to create montage"):
config.create_montage(input_folder=tmp_path)
def test_montage_no_images(tmp_path: Path) -> None:
"""Test if montage creation fails if no files are present"""
config = MontageCreation()
config.input_folder = tmp_path
config.image_glob_pattern = "**/*.tiff"
with pytest.raises(ValueError, match="No images found"):
config.create_montage(input_folder=tmp_path)
def test_exclusion_list(tmp_path: Path) -> None:
"""Test if exclusion lists are read correctly from a CSV file and passed to the montage creation function."""
config = MontageCreation()
assert config.read_exclusion_list() == []
ids = ["id1"]
exclusion_csv = tmp_path / "exclusion.csv"
exclusion_df = pd.DataFrame({"col1": ids, "col2": ["something else"]})
exclusion_df.to_csv(exclusion_csv, index=False)
config.exclude_by_slide_id = exclusion_csv
assert config.read_exclusion_list() == ids
config.image_glob_pattern = "*.png"
(tmp_path / "image.png").touch()
with mock.patch.object(config, "montage_from_included_and_excluded_slides") as mock_mont:
config.create_montage(input_folder=tmp_path)
assert mock_mont.call_count == 1
assert mock_mont.call_args[1]["items"] == ids
assert mock_mont.call_args[1]["exclude_items"]
def test_raises_if_no_glob(tmp_path: Path) -> None:
"""Test for exception if no file pattern specified."""
config = MontageCreation()
with pytest.raises(ValueError, match="Unable to load dataset"):
config.create_montage(input_folder=tmp_path)
def test_raises_if_no_images(tmp_path: Path) -> None:
"""Test for exception if no file pattern specified."""
config = MontageCreation()
config.image_glob_pattern = "*.png"
with pytest.raises(ValueError, match="No images found in folder"):
config.create_montage(input_folder=tmp_path)
def test_read_dataset_if_csv_present(temp_slides_dataset: SlidesDataset) -> None:
"""Test if a SlidesDataset can be read from a folder that contains a dataset.csv file."""
dataset_path = temp_slides_dataset.root_dir
config = MontageCreation()
dataset = config.read_dataset(dataset_path)
assert isinstance(dataset, SlidesDataset)
def test_read_dataset_fails_if_no_csv_present(tmp_path: Path) -> None:
"""Test behaviour for reading SlidesDataset if not dataset.csv file is present."""
config = MontageCreation()
with pytest.raises(ValueError, match="Unable to load dataset"):
config.read_dataset(tmp_path)
def test_montage_from_slides_dataset(tmp_path: Path, temp_slides_dataset: SlidesDataset) -> None:
"""Test if a montage can be created via SlidesDataset, when the folder contains a dataset.csv file."""
dataset_path = temp_slides_dataset.root_dir
config = MontageCreation()
config.width = 200
config.parallel = 1
outputs = tmp_path / "outputs"
config.output_path = outputs
config.create_montage(input_folder=dataset_path)
montage = outputs / MONTAGE_FILE
assert montage.is_file()
def test_montage_via_args(tmp_path: Path, temp_slides: Path) -> None:
"""Test if montage creation can be invoked correctly via commandline args."""
outputs = tmp_path / "outputs"
with mock.patch("sys.argv",
[
"",
"--dataset", str(temp_slides),
"--image_glob_pattern", "**/*.tiff",
"--output_path", str(outputs),
"--width", "200"
]):
script_main()
montage = outputs / MONTAGE_FILE
assert montage.is_file()
expected_file = expected_results_folder() / "montage_via_args.png"
if UPDATE_STORED_RESULTS:
shutil.copyfile(montage, expected_file)
assert_binary_files_match(montage, expected_file)


@@ -2,7 +2,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
import os
import time
from pathlib import Path
from typing import Any, Callable, Collection, Mapping, Sequence
@@ -118,3 +120,18 @@ def run_distributed(fn: Callable[..., None], args: Sequence[Any] = (), world_siz
:param world_size: Total number of distributed subprocesses to spawn.
"""
torch.multiprocessing.spawn(_run_distributed_process, args=(world_size, fn, args), nprocs=world_size)
def wait_until_file_exists(filename: Path, timeout_sec: float = 10.0, sleep_sec: float = 0.1) -> None:
"""Wait until the given file exists. If the file does not exist after the given timeout, an exception is raised.
:param filename: The file to wait for.
:param timeout_sec: The maximum time to wait until the file exists.
:param sleep_sec: The time to sleep between repeated checks if the file exists already.
:raises TimeoutError: If the file does not exist after the given timeout."""
current_time = time.time()
while not filename.exists():
logging.info(f"Waiting for file {filename}. Total wait time so far: {time.time() - current_time} seconds.")
time.sleep(sleep_sec)
if time.time() - current_time > timeout_sec:
raise TimeoutError(f"File {filename} still does not exist after waiting for {timeout_sec} seconds")