# Mirror of https://github.com/microsoft/pai.git
# Original file: 73 lines, 1.8 KiB, YAML
# OpenPAI job protocol: distributed synthetic benchmark for Horovod with the
# PyTorch backend. Two task roles (`master`, `worker`) with one instance and
# 4 GPUs each; the master's mpirun command starts all 8 ranks.
protocolVersion: 2
name: horovod_pytorch
type: job
version: horovod0.16.4-tf1.12.0-torch1.1.0-mxnet1.4.1-py3.5
contributor: OpenPAI
description: |
  This is a distributed synthetic benchmark for Horovod with PyTorch backend running on OpenPAI.
  It runs [Horovod with Open MPI](https://github.com/horovod/horovod/blob/master/docs/mpirun.rst).
# Values substituted into the master command through <% $parameters.NAME %>.
parameters:
  # Passed to pytorch_synthetic_benchmark.py as --model and --batch-size.
  model: resnet50
  batchsize: 64
  # Make sure NCCL_SOCKET_IFNAME names a network interface that exists on the
  # node.
  # NCCL options for Ethernet (folded into one line of extra mpirun arguments).
  nccl: >-
    -x NCCL_DEBUG=INFO
    -x NCCL_SOCKET_IFNAME=eth0
  # NCCL options for InfiniBand: to use them, comment out the Ethernet `nccl`
  # entry above and uncomment the block below (the key must appear only once).
  # nccl: >-
  #   -x NCCL_DEBUG=INFO
  #   -x NCCL_IB_DISABLE=0
  #   -x NCCL_IB_GDR_LEVEL=1
  #   -x NCCL_IB_HCA=mlx5_0:1
  #   -x NCCL_SOCKET_IFNAME=ib0
  #   -x HOROVOD_MPI_THREADS_DISABLE=1

# Docker image shared by both task roles (referenced below by `name`).
prerequisites:
  - protocolVersion: 2
    name: horovod_official
    type: dockerimage
    contributor: Horovod
    uri: horovod/horovod:0.16.4-tf1.12.0-torch1.1.0-mxnet1.4.1-py3.5

taskRoles:
  master:
    instances: 1
    completion:
      minSucceededInstances: 1
    dockerImage: horovod_official
    resourcePerInstance:
      cpu: 16
      memoryMB: 16384
      gpu: 4
    commands:
      # NOTE(review): presumably gives the worker container time to come up
      # before mpirun connects to it - confirm.
      - sleep 10
      # -np 8 is 4 slots on master-0 plus 4 on worker-0, matching `gpu: 4` on
      # each role; keep -np, -H, `instances` and `gpu` in sync when scaling.
      # NOTE(review): master-0 / worker-0 are presumably the hostnames OpenPAI
      # gives to instance 0 of each task role - confirm.
      # The -mca flags select the ob1 PML, exclude the openib BTL, and keep
      # Open MPI's TCP traffic off the lo and docker0 interfaces.
      - >
        mpirun --allow-run-as-root
        -np 8 -H master-0:4,worker-0:4
        -bind-to none -map-by slot
        -mca pml ob1
        -mca btl ^openib
        -mca btl_tcp_if_exclude lo,docker0
        <% $parameters.nccl %>
        -x PATH -x LD_LIBRARY_PATH
        python pytorch_synthetic_benchmark.py
        --model <% $parameters.model %>
        --batch-size <% $parameters.batchsize %>
  worker:
    instances: 1
    dockerImage: horovod_official
    resourcePerInstance:
      cpu: 16
      memoryMB: 16384
      gpu: 4
    commands:
      # The worker runs no workload of its own.
      # NOTE(review): presumably it only stays alive so the master's mpirun
      # can start ranks on it - confirm.
      - sleep infinity

extras:
  # NOTE(review): the ssh runtime plugin presumably provides the SSH access
  # between containers that mpirun needs, with `sshbarrier` delaying the
  # commands until every instance is reachable - confirm against plugin docs.
  com.microsoft.pai.runtimeplugin:
    - plugin: ssh
      parameters:
        jobssh: true
        sshbarrier: true