Add TensorFlow Example to leverage HivedScheduler (#34)
Parent: d432b57875
Commit: 80492e5c53
@@ -75,6 +75,11 @@ A specialized wrapper can be built on top of FrameworkController to optimize for
* [OpenPAI Controller Wrapper (Job RestServer)](https://github.com/microsoft/pai/tree/master/src/rest-server): A wrapper client optimized for AI applications
* [NNI Controller Wrapper (TrainingService)](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/FrameworkControllerMode.md): A wrapper client optimized for AutoML applications

### Recommended Kubernetes Scheduler

FrameworkController can directly leverage many [Kubernetes Schedulers](https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers), among which we recommend these best fits:

* [Kubernetes Default Scheduler](https://kubernetes.io/docs/concepts/scheduling/kube-scheduler/#kube-scheduler): A General-Purpose Kubernetes Scheduler
* [HivedScheduler](https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler): A Kubernetes Scheduler Extender optimized for GPUs ([Example](example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml)); a minimal sketch follows this list
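
To give a feel for what opting in involves, here is a minimal Pod-level sketch distilled from the example linked above; the virtualCluster, gpuType, and priority values are cluster-specific placeholders that must match your HivedScheduler configuration:

```yaml
# Minimal sketch: route a Pod to HivedScheduler (values are placeholders).
metadata:
  annotations:
    hivedscheduler.microsoft.com/pod-scheduling-spec: |-
      virtualCluster: VC2
      priority: 1000
      gpuType: DGX2-V100
      gpuNumber: 1
      affinityGroup: null
spec:
  # Send the Pod to HivedScheduler instead of the default scheduler.
  schedulerName: hivedscheduler
  containers:
  - name: tensorflow
    image: frameworkcontroller/tensorflow-examples:gpu
    resources:
      limits:
        # Mark the Pod as HivedScheduler-scheduled.
        hivedscheduler.microsoft.com/pod-scheduling-enable: 1
        cpu: 3
        memory: 96Gi
    env:
    # HivedScheduler publishes the granted GPU indices in this annotation.
    - name: NVIDIA_VISIBLE_DEVICES
      valueFrom:
        fieldRef:
          fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
```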

### Similar Offering On Other Cluster Manager

* [YARN FrameworkLauncher](https://github.com/Microsoft/pai/blob/master/subprojects/frameworklauncher/yarn): A similar offering that natively supports [Apache YARN](http://hadoop.apache.org)
@@ -5,7 +5,8 @@
2. Automatically clean up PS when the whole FrameworkAttempt is completed
3. No need to adjust existing TensorFlow image
4. No need to set up [Kubernetes DNS](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service) and [Kubernetes Service](https://kubernetes.io/docs/concepts/services-networking/service)
-5. [Common Feature](../../../../README.md#Feature)
+5. Easy to leverage [HivedScheduler](https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler) for GPU multi-tenant and topology-aware scheduling
+6. [Common Feature](../../../../README.md#Feature)

## Prerequisite

1. See `[PREREQUISITE]` in each specific Framework yaml file.
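With the prerequisites satisfied, the example can be submitted as a normal Kubernetes custom resource, e.g. via `kubectl create -f <framework yaml>`.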

@@ -7,7 +7,7 @@
apiVersion: frameworkcontroller.microsoft.com/v1
kind: Framework
metadata:
-  name: tensorflowdistributedtrainingwithgpu
+  name: tensorflowdistributedtrainingwithdefaultscheduledgpu
spec:
  executionType: Start
  retryPolicy:

@@ -0,0 +1,217 @@
# For the full spec setting and usage, see ./pkg/apis/frameworkcontroller/v1/types.go
# For the full frameworkbarrier usage, see ./pkg/barrier/barrier.go

############################### Prerequisite ###################################
# See "[PREREQUISITE]" in this file.
################################################################################
apiVersion: frameworkcontroller.microsoft.com/v1
kind: Framework
metadata:
  name: tensorflowdistributedtrainingwithhivedscheduledgpu
spec:
  executionType: Start
  retryPolicy:
    fancyRetryPolicy: true
    maxRetryCount: 2
  taskRoles:
  - name: ps
    taskNumber: 2
    frameworkAttemptCompletionPolicy:
      minFailedTaskCount: 1
      minSucceededTaskCount: -1
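      # Note: per ./pkg/apis/frameworkcontroller/v1/types.go, minFailedTaskCount: 1
      # completes the FrameworkAttempt as failed as soon as any ps Task fails, and
      # minSucceededTaskCount: -1 means ps Task successes alone never complete it.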
    task:
      retryPolicy:
        fancyRetryPolicy: false
        maxRetryCount: 0
      pod:
        metadata:
          annotations:
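            # Note: virtualCluster, gpuType, and priority below are placeholders;
            # they must match the virtual clusters, GPU types, and priorities
            # configured in your HivedScheduler deployment.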
            hivedscheduler.microsoft.com/pod-scheduling-spec: |-
              virtualCluster: VC2
              priority: 1000
              gpuType: DGX2-V100
              gpuNumber: 1
              affinityGroup: null
        spec:
          # [PREREQUISITE]
          # Do not specify the schedulerName if the HivedScheduler is directly
          # called by the k8s default scheduler.
          schedulerName: hivedscheduler
          restartPolicy: Never
          # [PREREQUISITE]
          # To disable the hostNetwork and thus avoid coordinating the
          # containerPort usage, the user needs to set up the k8s cluster
          # networking model and be aware of the potential network overhead.
          # For this example, if the hostNetwork is disabled, only 1 node is
          # needed; otherwise, at least 3 nodes are needed, since all the
          # 3 workers are specified with the same containerPort.
          # See https://kubernetes.io/docs/concepts/cluster-administration/networking
          hostNetwork: false
          containers:
          - name: tensorflow
            # Using the official image to demonstrate this example.
            # The image contains and only contains the tensorflow official code.
            image: frameworkcontroller/tensorflow-examples:gpu
            # For the tf_cnn_benchmarks usage, see
            # https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks
            workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
            # Using /mnt/frameworkbarrier/injector.sh to inject environment variables
            # without modifying the image or requiring k8s DNS:
            # FB_{UpperCase({TaskRoleName})}_ADDRESSES=
            #   {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
            #   {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
            # See more in ./example/framework/extension/frameworkbarrier.yaml
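            # For example, with the 2 ps Tasks above and FB_PS_PORT=4001, the
            # injected value looks like (Pod IPs illustrative):
            #   FB_PS_ADDRESSES=10.244.1.10:4001,10.244.2.11:4001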
            command: [
              "sh", "-c",
              "FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
              python tf_cnn_benchmarks.py --job_name=ps --task_index=${FC_TASK_INDEX}
              --ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
              --variable_update=parameter_server --cross_replica_sync=false
              --model=alexnet --batch_size=8 --num_batches=10
              --device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
              --data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
              --train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
            ports:
            - containerPort: 4001
            resources:
              limits:
                # [PREREQUISITE]
                # User needs to set up HivedScheduler for the k8s cluster.
                # See https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler
                hivedscheduler.microsoft.com/pod-scheduling-enable: 1
                cpu: 3
                memory: 96Gi
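            # HivedScheduler publishes the granted GPU indices in the
            # pod-gpu-isolation annotation; exposing it as NVIDIA_VISIBLE_DEVICES
            # confines this container to exactly those GPUs.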
            env:
            - name: NVIDIA_VISIBLE_DEVICES
              valueFrom:
                fieldRef:
                  fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
            volumeMounts:
            - name: frameworkbarrier-volume
              mountPath: /mnt/frameworkbarrier
            - name: data-volume
              mountPath: /mnt/data
          # [PREREQUISITE]
          # User needs to create a service account for frameworkbarrier, if the
          # k8s cluster enforces authorization.
          # See more in ./example/framework/extension/frameworkbarrier.yaml
          serviceAccountName: frameworkbarrier
          initContainers:
          - name: frameworkbarrier
            # Using the official image to demonstrate this example.
            image: frameworkcontroller/frameworkbarrier
            # Using k8s inClusterConfig, so usually there is no need to specify
            # KUBE_APISERVER_ADDRESS or KUBECONFIG.
            #env:
            #- name: KUBE_APISERVER_ADDRESS
            #  value: {http[s]://host:port}
            #- name: KUBECONFIG
            #  value: {Pod Local KubeConfig File Path}
            volumeMounts:
            - name: frameworkbarrier-volume
              mountPath: /mnt/frameworkbarrier
          volumes:
          - name: frameworkbarrier-volume
            emptyDir: {}
          - name: data-volume
            # [PREREQUISITE]
            # User needs to specify their own data-volume for the input data and
            # the output model.
            # The data-volume must be a distributed shared file system, such as
            # nfs, cephfs or glusterfs, so that data can be "handed off"
            # between Pods.
            # See https://kubernetes.io/docs/concepts/storage/volumes.
            #
            # Then download and extract the example input data
            # from:
            #   https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
            # to:
            #   {Volume Shared Directory}/cifar-10-batches-py
            #
            # For example:
            #nfs:
            #  server: {NFS Server Host}
            #  path: {NFS Shared Directory}
  - name: worker
    taskNumber: 3
    frameworkAttemptCompletionPolicy:
      minFailedTaskCount: 1
      # Succeed the FrameworkAttempt immediately if all worker Tasks succeed.
      minSucceededTaskCount: 3
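      # I.e. the whole FrameworkAttempt succeeds, and the remaining ps Tasks are
      # cleaned up, as soon as all 3 worker Tasks succeed.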
    task:
      retryPolicy:
        fancyRetryPolicy: false
        maxRetryCount: 0
      pod:
        metadata:
          annotations:
            hivedscheduler.microsoft.com/pod-scheduling-spec: |-
              virtualCluster: VC2
              priority: 1000
              gpuType: DGX2-V100
              gpuNumber: 1
              affinityGroup: null
        spec:
          # [PREREQUISITE]
          # Same as ps TaskRole.
          schedulerName: hivedscheduler
          restartPolicy: Never
          # [PREREQUISITE]
          # Same as ps TaskRole.
          hostNetwork: false
          containers:
          - name: tensorflow
            image: frameworkcontroller/tensorflow-examples:gpu
            workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
            command: [
              "sh", "-c",
              "FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
              python tf_cnn_benchmarks.py --job_name=worker --task_index=${FC_TASK_INDEX}
              --ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
              --variable_update=parameter_server --cross_replica_sync=false
              --model=alexnet --batch_size=8 --num_batches=10
              --device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
              --data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
              --train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
            ports:
            - containerPort: 5001
            resources:
              limits:
                # [PREREQUISITE]
                # Same as ps TaskRole.
                hivedscheduler.microsoft.com/pod-scheduling-enable: 1
                cpu: 3
                memory: 96Gi
            env:
            - name: NVIDIA_VISIBLE_DEVICES
              valueFrom:
                fieldRef:
                  fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
            volumeMounts:
            - name: frameworkbarrier-volume
              mountPath: /mnt/frameworkbarrier
            - name: data-volume
              mountPath: /mnt/data
          # [PREREQUISITE]
          # Same as ps TaskRole.
          serviceAccountName: frameworkbarrier
          initContainers:
          - name: frameworkbarrier
            image: frameworkcontroller/frameworkbarrier
            #env:
            #- name: KUBE_APISERVER_ADDRESS
            #  value: {http[s]://host:port}
            #- name: KUBECONFIG
            #  value: {Pod Local KubeConfig File Path}
            volumeMounts:
            - name: frameworkbarrier-volume
              mountPath: /mnt/frameworkbarrier
          volumes:
          - name: frameworkbarrier-volume
            emptyDir: {}
          - name: data-volume
            # [PREREQUISITE]
            # Same as ps TaskRole.
            #nfs:
            #  server: {NFS Server Host}
            #  path: {NFS Shared Directory}