Add Tensorboard ability to read from different folders (#112)

* Update run_tensorboard to work with logs in different folders * fix flake8 complaints * fix test and changelog * move types-requests to test_requirements for running mypy * update test * move tensorboard example into docs * shorten command line arg names * Address PR comments
2021-09-21 15:52:54 +01:00 · 2021-09-21 15:52:54 +01:00 · 46b2d3faf0
--- a/.github/workflows/check_changelog.yml
+++ b/.github/workflows/check_changelog.yml
@ -24,7 +24,7 @@ jobs:
          if [[ "$changed_files" =~ CHANGELOG\.md ]]
          then
            changelog=$(cat ./CHANGELOG.md)
-            if [[ "$changelog" =~ https://github\.com/microsoft/InnerEye-DeepLearning/pull/$PR_NUMBER ]]
+            if [[ "$changelog" =~ https://github\.com/microsoft/hi-ml/pull/$PR_NUMBER ]]
            then
              echo "Changelog has been updated and contains the PR number."
            else
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -13,10 +13,10 @@ created.
 ## Upcoming

 ### Added
- ([#111](https://github.com/microsoft/InnerEye-DeepLearning/pull/111)) Adding changelog. Displaying changelog in sphinx docu. Ensure changelog is updated.
+- ([#111](https://github.com/microsoft/hi-ml/pull/111)) Adding changelog. Displaying changelog in sphinx docu. Ensure changelog is updated.

 ### Changed
-
+- ([#112](https://github.com/microsoft/hi-ml/pull/112)) Update himl_tensorboard to work with files not in 'logs' directory.
+
 ### Fixed

 ### Removed
--- a/docs/source/commandline_tools.md
+++ b/docs/source/commandline_tools.md
@ -7,19 +7,22 @@ From the command line, run the command
 ```himl-tb```

 specifying one of 
-`[--experiment_name] [--latest_run_file] [--run_recovery_ids]` 
+`[--experiment] [--latest_run_file] [--run_recovery_ids] [--run_ids]` 

 This will start a TensorBoard session, by default running on port 6006. To use an alternative port, specify this with `--port`.

-If `--experiment_name` is provided, the most recent Run from this experiment will be visualised.
+If `--experiment` is provided, the most recent Run from this experiment will be visualised.
 If `--latest_run_file` is provided, the script will expect to find a RunId in this file.
 Alternatively you can specify the Runs to visualise via  `--run_recovery_ids` or `--run_ids`.
-You can specify the location where TensorBoard logs will be stored, using the `--run_logs_dir` argument.

-If you choose to specify `--experiment_name`, you can also specify `--num_runs` to view and/or `--tags` to filter by.
+By default, this tool expects that your TensorBoard logs live in a folder named 'outputs' and will create a similarly named folder in your root directory. If your TensorBoard logs are stored elsewhere, you can specify this with the `--log_dir` argument.

-If your AML config path is not ROOT_DIR/config.json, you must also specify `--config_path`.
+If you choose to specify `--experiment`, you can also specify `--num_runs` to view and/or `--tags` to filter by.

+If your AML config path is not ROOT_DIR/config.json, you must also specify `--config_file`.
+
+To see an example of how to create TensorBoard logs using PyTorch on AML, see the 
+[AML submitting script](examples/9/aml_sample.rst) which submits the following [pytorch sample script](examples/9/pytorch_sample.rst). Note that to run this, you'll need to create an environment with pytorch and tensorboard as dependencies, as a minimum. See an [example conda environment](examples/9/tensorboard_env.rst). This will create an experiment named 'tensorboard_test' on your Workspace, with a single run. Go to outputs + logs -> outputs to see the tensorboard events file.
 ## Download files from AML Runs

 From the command line, run the command 
@ -27,15 +30,14 @@ From the command line, run the command
 ```himl-download```

 specifying one of 
-`[--experiment_name] [--latest_run_file] [--run_recovery_ids] [--run_ids]` 
+`[--experiment] [--latest_run_file] [--run_recovery_ids] [--run_ids]` 

-If `--experiment_name` is provided, the most recent Run from this experiment will be downloaded.
+If `--experiment` is provided, the most recent Run from this experiment will be downloaded.
 If `--latest_run_file` is provided, the script will expect to find a RunId in this file.
-Alternatively you can specify the Runs to download via  `--run_recovery_ids` or `--run_ids`.
+Alternatively you can specify the Run to download via  `--run_recovery_ids` or `--run_ids`.

-The files associated with your Run(s) will be downloaded to the location specified with `--output_dir` (by default ROOT_DIR/outputs)
+The files associated with your Run will be downloaded to the location specified with `--output_dir` (by default ROOT_DIR/outputs)

-If you choose to specify `--experiment_name`, you can also specify `--num_runs` to view and/or `--tags` to filter by.
-
-If your AML config path is not `ROOT_DIR/config.json`, you must also specify `--config_path`.
+If you choose to specify `--experiment`, you can also specify `--tags` to filter by.

+If your AML config path is not `ROOT_DIR/config.json`, you must also specify `--config_file`.
--- a/docs/source/examples/9/aml_sample.py
+++ b/docs/source/examples/9/aml_sample.py
@ -0,0 +1,24 @@
+#  ------------------------------------------------------------------------------------------
+#  Copyright (c) Microsoft Corporation. All rights reserved.
+#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
+#  ------------------------------------------------------------------------------------------
+from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
+
+
+def main() -> None:
+    """
+    Submit pytorch_sample.py as a run of the AzureML experiment 'tensorboard_test' and wait until it finishes.
+
+    The workspace is read from config.json and the run environment is built from tensorboard_env.yml.
+    """
+    workspace = Workspace.from_config("config.json")
+    tensorboard_experiment = Experiment(workspace, "tensorboard_test")
+    script_config = ScriptRunConfig(source_directory='.',
+                                    script="pytorch_sample.py",
+                                    compute_target="<name of compute target>")
+    # The conda specification provides the packages that the submitted script imports
+    script_config.run_config.environment = Environment.from_conda_specification("TensorboardTestEnv",
+                                                                                "tensorboard_env.yml")
+
+    # Block until the submitted run has finished
+    tensorboard_experiment.submit(script_config).wait_for_completion()
+
+
+if __name__ == "__main__":
+    main()
--- a/docs/source/examples/9/aml_sample.rst
+++ b/docs/source/examples/9/aml_sample.rst
@ -0,0 +1,2 @@
+.. literalinclude:: aml_sample.py
+   :language: python
--- a/docs/source/examples/9/pytorch_sample.py
+++ b/docs/source/examples/9/pytorch_sample.py
@ -0,0 +1,32 @@
+#  ------------------------------------------------------------------------------------------
+# Adapted from the example at https://pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html
+#  ------------------------------------------------------------------------------------------
+from pathlib import Path
+import torch
+from torch.utils.tensorboard import SummaryWriter
+
+
+def main() -> None:
+    """
+    Fit a linear model with one input and one output on synthetic data, logging the loss to TensorBoard.
+
+    The training loss of each of the 10 epochs is written under the tag "Loss/train" to an events file in the
+    folder "outputs", relative to the current working directory.
+    """
+    log_dir = Path("outputs")
+    log_dir.mkdir(exist_ok=True)
+    writer = SummaryWriter(log_dir=str(log_dir))
+
+    try:
+        # Synthetic regression data: y = -2x plus a small amount of Gaussian noise
+        x = torch.arange(-20, 20, 0.1).view(-1, 1)
+        y = -2 * x + 0.1 * torch.randn(x.size())
+
+        model = torch.nn.Linear(1, 1)
+        criterion = torch.nn.MSELoss()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+        for epoch in range(10):
+            y1 = model(x)
+            loss = criterion(y1, y)
+            writer.add_scalar("Loss/train", loss, epoch)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+        writer.flush()
+    finally:
+        # Close the writer even if training fails, so that the events file is not left open
+        writer.close()
+
+
+if __name__ == "__main__":
+    main()
--- a/docs/source/examples/9/pytorch_sample.rst
+++ b/docs/source/examples/9/pytorch_sample.rst
@ -0,0 +1,2 @@
+.. literalinclude:: pytorch_sample.py
+   :language: python
--- a/docs/source/examples/9/tensorboard_env.rst
+++ b/docs/source/examples/9/tensorboard_env.rst
@ -0,0 +1,2 @@
+.. literalinclude:: tensorboard_env.yml
+   :language: yaml
--- a/docs/source/examples/9/tensorboard_env.yml
+++ b/docs/source/examples/9/tensorboard_env.yml
@ -0,0 +1,10 @@
+name: TensorboardTestEnv
+channels:
+  - defaults
+  - pytorch
+dependencies:
+  - pip=20.1.1
+  - python=3.7.3
+  - pytorch=1.4.0
+  - pip:
+    - tensorboard==2.2.1
--- a/src/health/azure/azure_util.py
+++ b/src/health/azure/azure_util.py
@ -438,13 +438,13 @@ def determine_run_id_source(args: Namespace) -> AzureRunIdSource:
    """
    if "latest_run_file" in args and args.latest_run_file is not None:
        return AzureRunIdSource.LATEST_RUN_FILE
-    if "experiment_name" in args and args.experiment_name is not None:
+    if "experiment" in args and args.experiment is not None:
        return AzureRunIdSource.EXPERIMENT_LATEST
    if "run_recovery_ids" in args and args.run_recovery_ids is not None:
        return AzureRunIdSource.RUN_RECOVERY_ID
    if "run_ids" in args and args.run_ids is not None:
        return AzureRunIdSource.RUN_ID
-    raise ValueError("One of latest_run_file, experiment_name, run_recovery_ids or run_ids must be provided")
+    raise ValueError("One of latest_run_file, experiment, run_recovery_ids or run_ids must be provided")


 def get_aml_runs_from_latest_run_file(args: Namespace, workspace: Workspace) -> List[Run]:
--- a/src/health/azure/himl_download.py
+++ b/src/health/azure/himl_download.py
@ -22,7 +22,7 @@ def determine_output_dir_name(args: Namespace, run_id_source: AzureRunIdSource,
    :return: The path in which to store the AML Run files
    """
    if run_id_source == AzureRunIdSource.EXPERIMENT_LATEST:
-        output_path = output_dir / args.experiment_name
+        output_path = output_dir / args.experiment
    elif run_id_source == AzureRunIdSource.LATEST_RUN_FILE:
        output_path = output_dir / Path(args.latest_run_file).stem
    elif run_id_source == AzureRunIdSource.RUN_RECOVERY_ID:
@ -57,7 +57,7 @@ def main() -> None:  # pragma: no cover
        help="Optional path to most_recent_run.txt where the ID of the latest run is stored"
    )
    parser.add_argument(
-        "--experiment_name",
+        "--experiment",
        type=str,
        required=False,
        help="The name of the AML Experiment that you wish to download Run files from"
--- a/src/health/azure/himl_tensorboard.py
+++ b/src/health/azure/himl_tensorboard.py
@ -3,17 +3,99 @@
 #  Copyright (c) Microsoft Corporation. All rights reserved.
 #  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 #  ------------------------------------------------------------------------------------------
+import os
+import sys
+import logging
 from argparse import ArgumentParser
 from pathlib import Path
+from requests import Session
+from typing import Any, Optional

+from azureml._run_impl.run_watcher import RunWatcher
 from azureml.tensorboard import Tensorboard

 from health.azure.azure_util import get_aml_runs, determine_run_id_source
 from health.azure.himl import get_workspace

+from concurrent.futures import ThreadPoolExecutor
+from subprocess import PIPE, Popen
+from threading import Event

 ROOT_DIR = Path.cwd()
 OUTPUT_DIR = ROOT_DIR / "outputs"
+TENSORBOARD_DIR = ROOT_DIR / "tensorboard_logs"
+
+
+class WrappedTensorboard(Tensorboard):
+    """
+    Variant of the AzureML Tensorboard class that passes a configurable remote folder to its run watchers.
+
+    :param remote_root: Passed as ``remote_root`` to the RunWatcher of every run. Callers in this module pass
+        the name of the folder in which the runs store their TensorBoard files, followed by '/'.
+    :param kwargs: Keyword arguments forwarded unchanged to the constructor of Tensorboard (callers pass
+        ``runs``, ``local_root`` and ``port``).
+    """
+
+    def __init__(self, remote_root: str, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self.remote_root = remote_root
+
+    def start(self) -> Optional[str]:
+        """
+        Start the Tensorboard instance, and begin processing logs.
+
+        :return: The URL for accessing the Tensorboard instance, or None if this object already holds a
+            Tensorboard process.
+        """
+        # Annotation only, for the type checker. The attribute itself is not assigned in this class before it
+        # is read here - presumably the base class constructor initialises it (TODO confirm).
+        self._tb_proc: Optional[Popen]
+        if self._tb_proc is not None:
+            return None
+
+        self._executor = ThreadPoolExecutor()
+        self._event = Event()
+        self._session = Session()
+
+        # Make a run watcher for each run we are monitoring
+        self._run_watchers = []
+        run_local_roots = []
+        local_log_dirs = []
+        for run in self._runs:
+            run_local_root = os.path.join(self._local_root, run.id)
+            run_local_roots.append(run_local_root)
+            local_log_dirs.append(f"{run.id}:{run_local_root}")
+            run_watcher = RunWatcher(
+                run,
+                local_root=run_local_root,
+                remote_root=self.remote_root,
+                executor=self._executor,
+                event=self._event,
+                session=self._session)
+            self._run_watchers.append(run_watcher)
+
+        for w in self._run_watchers:
+            self._executor.submit(w.refresh_requeue)
+
+        # We use sys.executable here to ensure that we can import modules from the same environment
+        # as the current process.
+        # (using just "python" results in the global environment, which might not have a Tensorboard module)
+        # sometimes, sys.executable might not give us what we want (i.e. in a notebook), and then we just have to hope
+        # that "python" will give us something useful
+        python_binary = sys.executable or "python"
+        python_command = [
+            python_binary, "-m", "tensorboard.main",
+            "--port", str(self._port)
+        ]
+        if len(local_log_dirs) > 1:
+            # logdir_spec is not recommended but it is the only working way to display multiple dirs
+            logdir_str = ','.join(local_log_dirs)
+            python_command.append("--logdir_spec")
+            logging.info("Loading tensorboard files for > 1 run. You may notice reduced functionality as noted "
+                         "here: https://github.com/tensorflow/tensorboard#logdir--logdir_spec-legacy-mode ")
+        else:
+            # With exactly one run, point Tensorboard at the folder of that run. Without any runs there is no
+            # per-run folder, so fall back to the local root instead of reading an unbound loop variable.
+            logdir_str = run_local_roots[0] if run_local_roots else self._local_root
+            python_command.append("--logdir")
+
+        python_command.append(logdir_str)
+
+        self._tb_proc = Popen(
+            python_command,
+            stderr=PIPE, stdout=PIPE, universal_newlines=True)
+        if os.name == "nt":
+            # Inherited helper, only invoked on Windows
+            self._win32_kill_subprocess_on_exit(self._tb_proc)
+
+        url = self._wait_for_url()
+        # in notebooks, this shows as a clickable link (whereas the returned value is not parsed in output)
+        logging.info(f"Tensorboard running at: {url}")
+
+        return url


 def main() -> None:  # pragma: no cover
@ -33,11 +115,11 @@ def main() -> None:  # pragma: no cover
        help="The port to run Tensorboard on"
    )
    parser.add_argument(
-        "--run_logs_dir",
+        "--log_dir",
        type=str,
-        default="tensorboard_logs",
+        default="outputs",
        required=False,
-        help="Path to directory in which to store Tensorboard logs"
+        help="Path to directory in which Tensorboard  files (summarywriter and TB logs) are stored"
    )
    parser.add_argument(
        "--latest_run_file",
@ -46,7 +128,7 @@ def main() -> None:  # pragma: no cover
        help="Optional path to most_recent_run.txt where details on latest run are stored"
    )
    parser.add_argument(
-        "--experiment_name",
+        "--experiment",
        type=str,
        required=False,
        help="The name of the AML Experiment that you wish to view Runs from"
@ -65,6 +147,12 @@ def main() -> None:  # pragma: no cover
        required=False,
        help="Optional run recovery ids of the runs to plot"
    )
+    parser.add_argument(
+        "--run_ids",
+        default=[],
+        nargs="+",
+        help="Optional run ids of the runs to plot"
+    )

    args = parser.parse_args()

@ -73,21 +161,26 @@ def main() -> None:  # pragma: no cover
        raise ValueError(
            "You must provide a config.json file in the root folder to connect"
            "to an AML workspace. This can be downloaded from your AML workspace (see README.md)"
-            )
+        )

    workspace = get_workspace(aml_workspace=None, workspace_config_path=config_path)

    run_id_source = determine_run_id_source(args)
    runs = get_aml_runs(args, workspace, run_id_source)
+
+    print(f"Runs:\n{runs}")
    if len(runs) == 0:
        raise ValueError("No runs were found")

-    # start Tensorboard
-    print(f"runs: {runs}")
+    local_logs_dir = ROOT_DIR / args.log_dir
+    local_logs_dir.mkdir(exist_ok=True, parents=True)

-    run_logs_dir = OUTPUT_DIR / args.run_logs_dir
-    run_logs_dir.mkdir(exist_ok=True)
-    ts = Tensorboard(runs=runs, local_root=str(run_logs_dir), port=args.port)
+    remote_logs_dir = local_logs_dir.relative_to(ROOT_DIR)
+
+    # Use the port given on the commandline; a hard-coded value would silently ignore --port
+    ts = WrappedTensorboard(remote_root=str(remote_logs_dir) + '/',
+                            runs=runs,
+                            local_root=str(local_logs_dir),
+                            port=args.port)

    ts.start()
    print("=============================================================================\n\n")
--- a/test_requirements.txt
+++ b/test_requirements.txt
@ -5,3 +5,4 @@ pylint==2.9.5
 pycobertura==2.0.1
 pytest==6.2.2
 pytest-cov==2.11.1
+types-requests==2.25.6
--- a/testhiml/health/azure/test_azure_util.py
+++ b/testhiml/health/azure/test_azure_util.py
@ -430,7 +430,7 @@ class MockRun:
 def test_determine_run_id_source(tmp_path: Path) -> None:
    parser = ArgumentParser()
    parser.add_argument("--latest_run_file", type=str)
-    parser.add_argument("--experiment_name", type=str)
+    parser.add_argument("--experiment", type=str)
    parser.add_argument("--run_recovery_ids", type=str)
    parser.add_argument("--run_ids", type=str)

@ -440,7 +440,7 @@ def test_determine_run_id_source(tmp_path: Path) -> None:
    assert util.determine_run_id_source(mock_args) == util.AzureRunIdSource.LATEST_RUN_FILE

    # If experiment name is provided, expect source to be experiment
-    mock_args = parser.parse_args(["--experiment_name", "fake_experiment"])
+    mock_args = parser.parse_args(["--experiment", "fake_experiment"])
    assert util.determine_run_id_source(mock_args) == util.AzureRunIdSource.EXPERIMENT_LATEST

    # If run recovery id is provided, expect source to be that
--- a/testhiml/health/azure/test_data/dummy_summarywriter_logs/id123/events.out.tfevents.1631695551.MSRC-11200758.38884.0
+++ b/testhiml/health/azure/test_data/dummy_summarywriter_logs/id123/events.out.tfevents.1631695551.MSRC-11200758.38884.0
--- a/testhiml/health/azure/test_download_aml_run.py
+++ b/testhiml/health/azure/test_download_aml_run.py
@ -21,7 +21,7 @@ def test_download_aml_run_args(tmp_path: Path) -> None:

    with pytest.raises(Exception) as e:
        subprocess.Popen(["python", DOWNLOAD_SCRIPT_PATH, "--output_dir", str(tmp_output_dir)])
-        assert 'One of latest_run_file, experiment_name, run_recovery_id ' \
+        assert 'One of latest_run_file, experiment, run_recovery_id ' \
               'or run_id must be provided' in str(e)


@ -45,13 +45,13 @@ def test_determine_output_dir_name(tmp_path: Path) -> None:

    parser = ArgumentParser()
    parser.add_argument("--latest_run_file", type=str)
-    parser.add_argument("--experiment_name", type=str)
+    parser.add_argument("--experiment", type=str)
    parser.add_argument("--run_recovery_id", type=str)
    parser.add_argument("--run_id", type=str)

    # if experiment name is provided, expect that to be included in the directory
    mock_experiment_name = "fake-experiment"
-    mock_args = parser.parse_args(["--experiment_name", mock_experiment_name])
+    mock_args = parser.parse_args(["--experiment", mock_experiment_name])
    run_id_source = AzureRunIdSource.EXPERIMENT_LATEST
    output_dir = himl_download.determine_output_dir_name(mock_args, run_id_source, mock_output_dir)
    assert output_dir == mock_output_dir / mock_experiment_name
--- a/testhiml/health/azure/test_run_tensorboard.py
+++ b/testhiml/health/azure/test_run_tensorboard.py
@ -5,7 +5,14 @@
 import pytest
 import subprocess

+from pathlib import Path
+from unittest import mock
+
 from health.azure import himl_tensorboard
+from health.azure.himl_tensorboard import WrappedTensorboard, ROOT_DIR
+
+from azureml.core import Experiment, Workspace
+

 TENSORBOARD_SCRIPT_PATH = himl_tensorboard.__file__

@ -19,7 +26,7 @@ def test_run_tensorboard_args() -> None:
    # if no required args are passed, will fail
    with pytest.raises(Exception) as e:
        subprocess.Popen(["python", TENSORBOARD_SCRIPT_PATH])
-        assert "One of latest_run_file, experiment_name, run_recovery_ids" \
+        assert "One of latest_run_file, experiment, run_recovery_ids" \
               " or run_ids must be provided" in str(e)


@ -35,3 +42,43 @@ def test_run_tensorboard_no_runs() -> None:
    with pytest.raises(Exception) as e:
        subprocess.Popen(["python", TENSORBOARD_SCRIPT_PATH, "--run_recovery_ids", "madeuprun"])
        assert "No runs were found" in str(e)
+
+
+def test_wrapped_tensorboard_local_logs(tmp_path: Path) -> None:
+    """
+    Start a WrappedTensorboard for a mocked run, and check that it returns a URL and stores both root folders.
+
+    :param tmp_path: Temporary folder provided by pytest, used to build the remote root.
+    """
+    mock_run = mock.MagicMock()
+    mock_run.id = "id123"
+    # Resolve the test data relative to this file, so that the test does not depend on the working directory
+    local_root = Path(__file__).parent / "test_data" / "dummy_summarywriter_logs"
+    remote_root = tmp_path / "tensorboard_logs"
+    ts = WrappedTensorboard(remote_root=str(remote_root), local_root=str(local_root), runs=[mock_run])
+    url = ts.start()
+    try:
+        assert url is not None
+        assert ts.remote_root == str(remote_root)
+        assert ts._local_root == str(local_root)
+    finally:
+        # Stop the instance even when an assertion fails, so that no Tensorboard process is left running
+        ts.stop()
+
+
+@pytest.mark.skip(reason="Requires an AML workspace that contains the experiment 'tensorboard_test'")
+def test_wrapped_tensorboard_remote_logs(tmp_path: Path) -> None:
+    """
+    This test expects an experiment called 'tensorboard_test' in your workspace, with at least 1 associated run.
+    See the scripts in docs/source/examples/9 (aml_sample.py and pytorch_sample.py) to create this
+    Experiment & Run.
+
+    :param tmp_path: Temporary folder provided by pytest, in which the local log folder is created.
+    """
+    # get the latest run in this experiment
+    ws = Workspace.from_config(ROOT_DIR / "config.json")
+    expt = Experiment(ws, 'tensorboard_test')
+    run = next(expt.get_runs())
+
+    log_dir = "outputs"
+
+    local_root = tmp_path / log_dir
+    local_root.mkdir(exist_ok=True)
+    remote_root = str(local_root.relative_to(tmp_path)) + "/"
+
+    ts = WrappedTensorboard(remote_root=remote_root, local_root=str(local_root), runs=[run], port=6006)
+    url = ts.start()
+    try:
+        assert url == "http://localhost:6006/"
+        assert ts.remote_root == str(remote_root)
+        assert ts._local_root == str(local_root)
+    finally:
+        # Stop the instance even when an assertion fails, so that no Tensorboard process is left running
+        ts.stop()