зеркало из https://github.com/microsoft/pai.git
Visualized mnist500task (#5131)
This commit is contained in:
Родитель
1f3d691e6b
Коммит
c15214fa2b
|
@ -7,4 +7,36 @@ Here we provide a CPU-only job with 500 tasks on a taskrole. The example use Con
|
|||
| ConvNet | CPU | 6h30m10s (500*5 epoch) | [Details](metrics/ConvNet_CPU_500Task.JPG) | 95.15% (lr: 0.0101) 98.53% (lr: 0.1001) 98.95% (lr: 0.9981)| [CPU_500Task_MNIST.yaml](yaml/CPU_500Task_MNIST.yaml) |
|
||||
|
||||
## Usage
|
||||
To quickly submit a training job to the OpenPAI cluster, users can directly submit the corresponding yaml file as mentioned above (in the yaml folder).
|
||||
Before running this example, you should first make sure that you have at least one permitted storage in OpenPAI. If you don’t know how to use storage, please refer to [our doc](https://openpai.readthedocs.io/).
|
||||
|
||||
Before submitting the yaml file mentioned above (in the yaml folder), you need to update the following commands with your own storage path:
|
||||
|
||||
`master` taskrole:
|
||||
```
|
||||
python get_results.py --number=500 --data_path /mnt/confignfs/mnist500_result/
|
||||
-->
|
||||
python get_results.py --number=500 --data_path <your own storage path>/mnist500_result/
|
||||
```
|
||||
|
||||
`taskrole` taskrole:
|
||||
```
|
||||
mount -t nfs4 10.151.40.235:/data data
|
||||
-->
|
||||
mount -t nfs4 <NFS_SERVER:/NFS_PATH> data
|
||||
```
|
||||
|
||||
Now you can submit the yaml file to try this example, and **don't forget** to select the storage you want to use in the `data` area on the right side of the page.
|
||||
|
||||
## Visualization of results
|
||||
|
||||
When all instances in `taskrole` run successfully, you can view the visualized results through the running `master`. The following figure shows the final status of the successful job. It should be noted that the visualized results can only be viewed when the `master` is running. This taskrole will keep running until the user manually stops it.
|
||||
|
||||
<img src="./images/final_status.JPG" width="40%" height="40%" />
|
||||
|
||||
You can access the Jupyter notebook by visiting `<master_IP>:8888` in your browser. Then, click on the file `show_results.ipynb`.
|
||||
|
||||
<img src="./images/show_results_file.JPG" width="40%" height="40%" />
|
||||
|
||||
Run it and get the following visualized result.
|
||||
|
||||
<img src="./images/show_results.JPG" width="80%" height="80%" />
|
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 15 KiB |
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 36 KiB |
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 21 KiB |
|
@ -0,0 +1,44 @@
|
|||
|
||||
import os
|
||||
import csv
|
||||
import time
|
||||
import argparse
|
||||
import shutil
|
||||
|
||||
def summary(filepath, result_path):
    """Append every row of one CSV file to an aggregate CSV file.

    Args:
        filepath: Path of the CSV file to read.
        result_path: Path of the CSV file that receives the rows. It is
            opened in append mode, so existing content is kept and the
            file is created when missing.
    """
    # The csv module requires newline='' on both files: otherwise newlines
    # embedded in quoted fields are altered on read, and platforms that
    # translate '\n' on write end up with an extra '\r' per row.
    with open(filepath, 'r', newline='') as src, \
            open(result_path, 'a', newline='') as dst:
        csv.writer(dst).writerows(csv.reader(src))
|
||||
|
||||
def main():
    """Collect per-task result CSV files and merge them into results.csv.

    Command-line options:
        --number: how many entries must be present in the data directory
            before the results are merged (default 500).
        --data_path: directory in which the result CSV files are gathered
            (default './mnist500_result/'); created if it does not exist.

    The function polls once per second until the data directory holds at
    least ``--number`` entries, moving any ``*.csv`` file found in the
    current working directory into it on every pass. It then appends the
    rows of every ``*.csv`` file in the data directory to ``results.csv``
    in the current working directory.
    """
    parser = argparse.ArgumentParser(description='Display Results')
    parser.add_argument('--number', type=int, default=500,
                        help='The number of learning rates')
    parser.add_argument('--data_path', default='./mnist500_result/',
                        help='Directory where the result CSV files are '
                             'collected')
    args = parser.parse_args()

    path = args.data_path
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    # Waiting for all results
    while len(os.listdir(path)) < args.number:
        _move_local_csv_files(path)
        time.sleep(1)
    # Final sweep for files that appeared after the last pass of the loop.
    _move_local_csv_files(path)

    for name in os.listdir(path):
        filepath = os.path.join(path, name)
        if os.path.isfile(filepath) and name.endswith('.csv'):
            summary(filepath, 'results.csv')


def _move_local_csv_files(dest):
    """Move every '*.csv' entry of the current directory into ``dest``."""
    for name in os.listdir('.'):
        if name.endswith('.csv'):
            shutil.move(name, os.path.join(dest, name))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -6,7 +6,7 @@ import torch.nn.functional as F
|
|||
import torch.optim as optim
|
||||
from torchvision import datasets, transforms
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
|
||||
import csv
|
||||
|
||||
class Net(nn.Module):
|
||||
def __init__(self):
|
||||
|
@ -68,7 +68,14 @@ def test(model, device, test_loader):
|
|||
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
|
||||
test_loss, correct, len(test_loader.dataset),
|
||||
100. * correct / len(test_loader.dataset)))
|
||||
|
||||
return 100. * correct / len(test_loader.dataset)
|
||||
|
||||
def write_result(filepath, lr, acc):
    """Append one ``[lr, acc]`` row to a CSV file.

    Args:
        filepath: Path of the CSV file; opened in append mode, so it is
            created when missing and existing rows are kept.
        lr: Value written to the first column.
        acc: Value written to the second column.
    """
    # newline='' is required by the csv module so that the writer's own
    # line terminator is not translated a second time by the text layer.
    with open(filepath, 'a', newline='') as f:
        csv.writer(f).writerow([lr, acc])
|
||||
|
||||
def main():
|
||||
# Training settings
|
||||
|
@ -95,6 +102,8 @@ def main():
|
|||
help='For Saving the current Model')
|
||||
parser.add_argument('--task_index', default=0,
|
||||
help='Multi-task Index')
|
||||
parser.add_argument('--result_file', default='results.csv',
|
||||
help='Accuracy of different learning rates')
|
||||
args = parser.parse_args()
|
||||
use_cuda = not args.no_cuda and torch.cuda.is_available()
|
||||
|
||||
|
@ -131,13 +140,13 @@ def main():
|
|||
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
|
||||
for epoch in range(1, args.epochs + 1):
|
||||
train(args, model, device, train_loader, optimizer, epoch)
|
||||
test(model, device, test_loader)
|
||||
acc = test(model, device, test_loader)
|
||||
scheduler.step()
|
||||
|
||||
write_result(args.result_file, lr, acc)
|
||||
if args.save_model:
|
||||
torch.save(model.state_dict(), "mnist_cnn.pt")
|
||||
|
||||
torch.save(model.state_dict(), "mnist_cnn.pt")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main()
|
|
@ -0,0 +1,37 @@
|
|||
{
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": 3
|
||||
},
|
||||
"orig_nbformat": 2
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2,
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"results = np.genfromtxt('./results.csv', delimiter=\",\", names=[\"LR\",\"ACC\"])\n",
|
||||
"plt.plot(results[\"LR\"], results[\"ACC\"], 'o')\n",
|
||||
"plt.xlabel('Learning Rate')\n",
|
||||
"plt.ylabel('Accuracy')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -7,8 +7,8 @@ prerequisites:
|
|||
uri: 'openpai/standard:python_3.6-pytorch_1.4.0-cpu'
|
||||
name: docker_image_0
|
||||
taskRoles:
|
||||
taskrole:
|
||||
instances: 500
|
||||
master:
|
||||
instances: 1
|
||||
completion:
|
||||
minFailedInstances: 1
|
||||
taskRetryCount: 0
|
||||
|
@ -16,13 +16,43 @@ taskRoles:
|
|||
resourcePerInstance:
|
||||
gpu: 0
|
||||
cpu: 1
|
||||
memoryMB: 51200
|
||||
memoryMB: 50000
|
||||
commands:
|
||||
- >-
|
||||
wget https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/mnist_lr_500.py
|
||||
wget
|
||||
https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/get_results.py
|
||||
- >-
|
||||
python get_results.py --number=500 --data_path
|
||||
/mnt/confignfs/mnist500_result/
|
||||
- >-
|
||||
wget
|
||||
https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/show_results.ipynb
|
||||
- jupyter notebook
|
||||
taskrole:
|
||||
instances: 500
|
||||
completion:
|
||||
minFailedInstances: 1
|
||||
minSucceededInstances: -1
|
||||
taskRetryCount: 0
|
||||
dockerImage: docker_image_0
|
||||
resourcePerInstance:
|
||||
gpu: 0
|
||||
cpu: 1
|
||||
memoryMB: 50000
|
||||
commands:
|
||||
- >-
|
||||
wget
|
||||
https://raw.githubusercontent.com/microsoft/pai/master/examples/mnist_500_tasks/src/mnist_lr_500.py
|
||||
- >-
|
||||
python mnist_lr_500.py --epoch 5
|
||||
--task_index=$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX
|
||||
- apt-get update
|
||||
- apt-get install --assume-yes nfs-common
|
||||
- mkdir -p data/mnist500_result
|
||||
- 'mount -t nfs4 10.151.40.235:/data data'
|
||||
- >-
|
||||
cp results.csv
|
||||
data/mnist500_result/results_$PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX.csv
|
||||
defaults:
|
||||
virtualCluster: default
|
||||
extras:
|
||||
|
@ -31,3 +61,7 @@ extras:
|
|||
- plugin: ssh
|
||||
parameters:
|
||||
jobssh: true
|
||||
- plugin: teamwise_storage
|
||||
parameters:
|
||||
storageConfigNames:
|
||||
- confignfs
|
||||
|
|
Загрузка…
Ссылка в новой задаче