# pai/marketplace-v2/horovod-pytorch-synthetic-b... (73 lines, 1.8 KiB, YAML)

# OpenPAI job protocol (version 2) definition: identity of the job.
protocolVersion: 2
name: horovod_pytorch
type: job
# Mirrors the tag of the Docker image listed under "prerequisites".
version: horovod0.16.4-tf1.12.0-torch1.1.0-mxnet1.4.1-py3.5
contributor: OpenPAI
# Literal block scalar ("|"): the text must be indented deeper than the key,
# otherwise it is not part of the description value.
description: |
  This is a distributed synthetic benchmark for Horovod with PyTorch backend running on OpenPAI.
  It runs [Horovod with Open MPI](https://github.com/horovod/horovod/blob/master/docs/mpirun.rst).
# Values substituted into the task commands through <% $parameters.NAME %>.
parameters:
  # Passed to the benchmark script's --model option.
  model: resnet50
  # Passed to the benchmark script's --batch-size option.
  batchsize: 64
  # Extra mpirun arguments that export NCCL settings to every process.
  # ">-" folds the lines into a single line without a trailing newline.
  # NCCL options for Ethernet.
  # Make sure NCCL_SOCKET_IFNAME names an interface that exists on the node.
  nccl: >-
    -x NCCL_DEBUG=INFO
    -x NCCL_SOCKET_IFNAME=eth0
  # NCCL options for InfiniBand: uncomment this variant and remove the
  # Ethernet one above (a mapping must not contain two "nccl" keys).
  # nccl: >-
  #   -x NCCL_DEBUG=INFO
  #   -x NCCL_IB_DISABLE=0
  #   -x NCCL_IB_GDR_LEVEL=1
  #   -x NCCL_IB_HCA=mlx5_0:1
  #   -x NCCL_SOCKET_IFNAME=ib0
  #   -x HOROVOD_MPI_THREADS_DISABLE=1
# Resources the job depends on. Task roles select an entry by its "name"
# (see the "dockerImage" fields under "taskRoles").
prerequisites:
  - protocolVersion: 2
    name: horovod_official
    type: dockerimage
    contributor: Horovod
    uri: horovod/horovod:0.16.4-tf1.12.0-torch1.1.0-mxnet1.4.1-py3.5
# Two roles with one instance and 4 GPUs each: "master" launches mpirun,
# "worker" only keeps its container running.
taskRoles:
  master:
    instances: 1
    completion:
      # NOTE(review): presumably lets the job complete as soon as the master's
      # commands succeed, since the worker never exits on its own - confirm.
      minSucceededInstances: 1
    dockerImage: horovod_official
    resourcePerInstance:
      cpu: 16
      memoryMB: 16384
      gpu: 4
    commands:
      # NOTE(review): presumably a grace period for the worker container to
      # become reachable before mpirun starts - confirm.
      - sleep 10
      # -np 8 is the sum of the slots listed in -H (master-0:4 + worker-0:4);
      # change both together with "instances" / "gpu" when resizing the job.
      # The folded scalar (">") joins the lines below into one shell command,
      # so comments must stay outside of it.
      # NOTE(review): pytorch_synthetic_benchmark.py is referenced by relative
      # path - assumed to exist in the image's working directory; confirm.
      - >
        mpirun --allow-run-as-root
        -np 8 -H master-0:4,worker-0:4
        -bind-to none -map-by slot
        -mca pml ob1
        -mca btl ^openib
        -mca btl_tcp_if_exclude lo,docker0
        <% $parameters.nccl %>
        -x PATH -x LD_LIBRARY_PATH
        python pytorch_synthetic_benchmark.py
        --model <% $parameters.model %>
        --batch-size <% $parameters.batchsize %>
  worker:
    instances: 1
    dockerImage: horovod_official
    resourcePerInstance:
      cpu: 16
      memoryMB: 16384
      gpu: 4
    commands:
      # Runs no workload itself; it only keeps the container alive.
      # NOTE(review): presumably so that mpirun on the master can start
      # processes on this host - confirm.
      - sleep infinity
# Runtime plugin configuration: enables the "ssh" plugin for this job.
extras:
  com.microsoft.pai.runtimeplugin:
    - plugin: ssh
      parameters:
        # NOTE(review): presumably sets up SSH access between the job's
        # containers, which mpirun needs to reach worker-0 - confirm.
        jobssh: true
        # NOTE(review): presumably delays the task commands until SSH to the
        # other containers is available - confirm against the plugin docs.
        sshbarrier: true