Visualized mnist500task (#5131)

2020-11-30 17:32:47 +08:00 · 2020-11-30 17:32:47 +08:00 · c15214fa2b
--- a/examples/mnist_500_tasks/README.md
+++ b/examples/mnist_500_tasks/README.md
@ -7,4 +7,36 @@ Here we provide a CPU-only job with 500 tasks on a taskrole. The example use Con
 | ConvNet | CPU | 6h30m10s (500*5 epoch) | [Details](metrics/ConvNet_CPU_500Task.JPG) | 95.15% (lr: 0.0101)  98.53% (lr: 0.1001)  98.95% (lr: 0.9981)| [CPU_500Task_MNIST.yaml](yaml/CPU_500Task_MNIST.yaml) |

 ## Usage
-To quickly submit a training job to the OpenPAI cluster, users can directly submit the corresponding yaml file as mentioned above (in the yaml folder). 
+Before running this example, you should first make sure that you have at least one permitted storage in OpenPAI. If you don’t know how to use storage, please refer to [our doc](https://openpai.readthedocs.io/).
+
+Before submitting the yaml file mentioned above (in the yaml folder), you need to update the following commands with your own storage path:
+
+`master` taskrole:
+```
+python get_results.py --number=500 --data_path /mnt/confignfs/mnist500_result/
+-->
+python get_results.py --number=500 --data_path <your own storage path>/mnist500_result/
+```
+
+`taskrole` taskrole:
+```
+mount -t nfs4 10.151.40.235:/data data
+-->
+mount -t nfs4 <NFS_SERVER:/NFS_PATH> data
+```
+
+Now you can submit the yaml file to try this example, and **don't forget** to select the storage you want to use in the `data` area on the right side of the page. 
+
+## Visualization of results
+
+When all instances in `taskrole` run successfully, you can view the visualized results through the running `master`. The following figure shows the final status of the successful job. It should be noted that the visualized results can only be viewed when the `master` is running. This taskrole will keep running until the user manually stops it.
+
+<img src="./images/final_status.JPG" width="40%" height="40%" />
+
+You can access the Jupyter notebook by visiting `<master_IP>:8888` in your browser. Then, click on the file `show_results.ipynb`.
+
+<img src="./images/show_results_file.JPG" width="40%" height="40%" />
+
+Run it and get the following visualized result.
+
+<img src="./images/show_results.JPG" width="80%" height="80%" />
--- a/examples/mnist_500_tasks/images/final_status.JPG
+++ b/examples/mnist_500_tasks/images/final_status.JPG
--- a/examples/mnist_500_tasks/images/show_results.JPG
+++ b/examples/mnist_500_tasks/images/show_results.JPG
--- a/examples/mnist_500_tasks/images/show_results_file.JPG
+++ b/examples/mnist_500_tasks/images/show_results_file.JPG
--- a/examples/mnist_500_tasks/src/get_results.py
+++ b/examples/mnist_500_tasks/src/get_results.py
@ -0,0 +1,44 @@
+  
+import os
+import csv
+import time
+import argparse
+import shutil
+
+def summary(filepath, result_path):
+    with open(filepath, 'r') as f:
+        csv_read = csv.reader(f)
+        with open(result_path, 'a') as r:
+            csv_write = csv.writer(r)
+            for line in csv_read:
+                csv_write.writerow(line)
+
+def main():
+    parser = argparse.ArgumentParser(description='Display Results')
+    parser.add_argument('--number', type=int, default=500, 
+                        help='The number of learning rates')
+    parser.add_argument('--data_path', default='./mnist500_result/', 
+                        help='The number of learning rates')
+    args = parser.parse_args()
+    
+    path = args.data_path
+    if not os.path.exists(path):
+        os.makedirs(path)
+    # Waiting for all results
+    while(len([lists for lists in os.listdir(path)]) < args.number):
+        for file in os.listdir('.'):
+            if file[-4:]=='.csv':
+                shutil.move(file, os.path.join(path, file))
+        time.sleep(1)
+    for file in os.listdir('.'):
+        if file[-4:]=='.csv':
+            shutil.move(file, os.path.join(path, file))
+
+    for file in os.listdir(path):
+        filepath = os.path.join(path, file)
+        if os.path.isfile(filepath) and file[-4:]=='.csv':
+            summary(filepath, 'results.csv')
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/mnist_500_tasks/src/mnist_lr_500.py
+++ b/examples/mnist_500_tasks/src/mnist_lr_500.py
@ -6,7 +6,7 @@ import torch.nn.functional as F
 import torch.optim as optim
 from torchvision import datasets, transforms
 from torch.optim.lr_scheduler import StepLR
-
+import csv

 class Net(nn.Module):
    def __init__(self):
@ -68,7 +68,14 @@ def test(model, device, test_loader):
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
+    
+    return 100. * correct / len(test_loader.dataset)

+def write_result(filepath, lr, acc):
+    with open(filepath, 'a') as f:
+        csv_write = csv.writer(f)
+        data = [lr, acc]
+        csv_write.writerow(data)

 def main():
    # Training settings
@ -95,6 +102,8 @@ def main():
                        help='For Saving the current Model')
    parser.add_argument('--task_index', default=0, 
                        help='Multi-task Index')
+    parser.add_argument('--result_file', default='results.csv', 
+                        help='Accuracy of different learning rates')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

@ -131,13 +140,13 @@ def main():
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
-        test(model, device, test_loader)
+        acc = test(model, device, test_loader)
        scheduler.step()

+    write_result(args.result_file, lr, acc)
    if args.save_model:
-        torch.save(model.state_dict(), "mnist_cnn.pt")
-    
+        torch.save(model.state_dict(), "mnist_cnn.pt")    


 if __name__ == '__main__':
-    main()
+    main()
--- a/examples/mnist_500_tasks/src/show_results.ipynb
+++ b/examples/mnist_500_tasks/src/show_results.ipynb
@ -0,0 +1,37 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": 3
+  },
+  "orig_nbformat": 2
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "results = np.genfromtxt('./results.csv', delimiter=\",\", names=[\"LR\",\"ACC\"])\n",
+    "plt.plot(results[\"LR\"], results[\"ACC\"], 'o')\n",
+    "plt.xlabel('Learning Rate')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.show()"
+   ]
+  }
+ ]
+}
--- a/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml
+++ b/examples/mnist_500_tasks/yaml/CPU_500Task_MNIST.yaml
@ -7,8 +7,8 @@ prerequisites:
    uri: 'openpai/standard:python_3.6-pytorch_1.4.0-cpu'
    name: docker_image_0
 taskRoles:
-  taskrole:
-    instances: 500
+  master:
+    instances: 1
    completion:
      minFailedInstances: 1
    taskRetryCount: 0
@ -16,13 +16,43 @@ taskRoles:
    resourcePerInstance:
      gpu: 0
      cpu: 1
-      memoryMB: 51200
+      memoryMB: 50000
    commands:
      - >-
-        wget https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/mnist_lr_500.py
+        wget
+        https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/get_results.py
+      - >-
+        python get_results.py --number=500 --data_path
+        /mnt/confignfs/mnist500_result/
+      - >-
+        wget
+        https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/show_results.ipynb
+      - jupyter notebook
+  taskrole:
+    instances: 500
+    completion:
+      minFailedInstances: 1
+      minSucceededInstances: -1
+    taskRetryCount: 0
+    dockerImage: docker_image_0
+    resourcePerInstance:
+      gpu: 0
+      cpu: 1
+      memoryMB: 50000
+    commands:
+      - >-
+        wget
+        https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/mnist_lr_500.py
      - >-
        python mnist_lr_500.py --epoch 5
        --task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX
+      - apt-get update
+      - apt-get install --assume-yes nfs-common
+      - mkdir -p data/mnist500_result
+      - 'mount -t nfs4 10.151.40.235:/data data'
+      - >-
+        cp results.csv
+        data/mnist500_result/results_$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX.csv
 defaults:
  virtualCluster: default
 extras:
@ -31,3 +61,7 @@ extras:
    - plugin: ssh
      parameters:
        jobssh: true
+    - plugin: teamwise_storage
+      parameters:
+        storageConfigNames:
+          - confignfs