From 80492e5c53073e38de07c5ce9c3f37c517395cbd Mon Sep 17 00:00:00 2001
From: Yuqi Wang <32826762+yqwang-ms@users.noreply.github.com>
Date: Fri, 2 Aug 2019 12:19:32 +0800
Subject: [PATCH] Add TensorFlow Example to leverage HivedScheduler (#34)

---
 README.md                                     |   5 +
 .../framework/scenario/tensorflow/README.md   |   3 +-
 ...butedtrainingwithdefaultscheduledgpu.yaml} |   2 +-
 ...tributedtrainingwithhivedscheduledgpu.yaml | 217 ++++++++++++++++++
 4 files changed, 225 insertions(+), 2 deletions(-)
 rename example/framework/scenario/tensorflow/gpu/{tensorflowdistributedtrainingwithgpu.yaml => tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml} (99%)
 create mode 100644 example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml

diff --git a/README.md b/README.md
index 565752b..d710059 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,11 @@ A specialized wrapper can be built on top of FrameworkController to optimize for
 * [OpenPAI Controller Wrapper (Job RestServer)](https://github.com/microsoft/pai/tree/master/src/rest-server): A wrapper client optimized for AI applications
 * [NNI Controller Wrapper (TrainingService)](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/FrameworkControllerMode.md): A wrapper client optimized for AutoML applications
 
+### Recommended Kubernetes Scheduler
+FrameworkController can directly leverage many [Kubernetes Schedulers](https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers), and among them we recommend the following best fits:
+* [Kubernetes Default Scheduler](https://kubernetes.io/docs/concepts/scheduling/kube-scheduler/#kube-scheduler): A General-Purpose Kubernetes Scheduler
+* [HivedScheduler](https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler): A Kubernetes Scheduler Extender optimized for GPUs ([Example](example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml))
+
 ### Similar Offering On Other Cluster Manager
 * [YARN FrameworkLauncher](https://github.com/Microsoft/pai/blob/master/subprojects/frameworklauncher/yarn): Similar offering natively supports [Apache YARN](http://hadoop.apache.org)
 
diff --git a/example/framework/scenario/tensorflow/README.md b/example/framework/scenario/tensorflow/README.md
index 8f0a685..31ab4a7 100644
--- a/example/framework/scenario/tensorflow/README.md
+++ b/example/framework/scenario/tensorflow/README.md
@@ -5,7 +5,8 @@
 2. Automatically clean up PS when the whole FrameworkAttempt is completed
 3. No need to adjust existing TensorFlow image
 4. No need to setup [Kubernetes DNS](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service) and [Kubernetes Service](https://kubernetes.io/docs/concepts/services-networking/service)
-5. [Common Feature](../../../../README.md#Feature)
+5. Easy to leverage [HivedScheduler](https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler) for GPU multi-tenant and topology-aware scheduling
+6. [Common Feature](../../../../README.md#Feature)
 
 ## Prerequisite
 1. See `[PREREQUISITE]` in each specific Framework yaml file.
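For README readers, the HivedScheduler opt-in boils down to three pod-level settings, distilled from the full example file this patch adds below. The scheduler name `hivedscheduler`, the `pod-scheduling-spec` annotation, and the `pod-scheduling-enable` resource limit are the values that example uses; a particular deployment may configure different ones, and `app` here is a placeholder container name:

    # Minimal sketch of a TaskRole pod template scheduled by HivedScheduler:
    pod:
      metadata:
        annotations:
          # The scheduling request interpreted by HivedScheduler.
          hivedscheduler.microsoft.com/pod-scheduling-spec: |-
            virtualCluster: VC2
            priority: 1000
            gpuType: DGX2-V100
            gpuNumber: 1
            affinityGroup: null
      spec:
        # Route the pod to HivedScheduler instead of the k8s default scheduler.
        schedulerName: hivedscheduler
        containers:
        - name: app
          resources:
            limits:
              # Marks the pod so HivedScheduler handles its placement.
              hivedscheduler.microsoft.com/pod-scheduling-enable: 1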
diff --git a/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithgpu.yaml b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
similarity index 99%
rename from example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithgpu.yaml
rename to example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
index ef4ccc1..209e4ef 100644
--- a/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithgpu.yaml
+++ b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
@@ -7,7 +7,7 @@
 apiVersion: frameworkcontroller.microsoft.com/v1
 kind: Framework
 metadata:
-  name: tensorflowdistributedtrainingwithgpu
+  name: tensorflowdistributedtrainingwithdefaultscheduledgpu
 spec:
   executionType: Start
   retryPolicy:
diff --git a/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml
new file mode 100644
index 0000000..8c9843b
--- /dev/null
+++ b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml
@@ -0,0 +1,217 @@
+# For the full spec setting and usage, see ./pkg/apis/frameworkcontroller/v1/types.go
+# For the full frameworkbarrier usage, see ./pkg/barrier/barrier.go
+
+############################### Prerequisite ###################################
+# See "[PREREQUISITE]" in this file.
+################################################################################
+apiVersion: frameworkcontroller.microsoft.com/v1
+kind: Framework
+metadata:
+  name: tensorflowdistributedtrainingwithhivedscheduledgpu
+spec:
+  executionType: Start
+  retryPolicy:
+    fancyRetryPolicy: true
+    maxRetryCount: 2
+  taskRoles:
+  - name: ps
+    taskNumber: 2
+    frameworkAttemptCompletionPolicy:
+      minFailedTaskCount: 1
+      minSucceededTaskCount: -1
+    task:
+      retryPolicy:
+        fancyRetryPolicy: false
+        maxRetryCount: 0
+      pod:
+        metadata:
+          annotations:
+            hivedscheduler.microsoft.com/pod-scheduling-spec: |-
+              virtualCluster: VC2
+              priority: 1000
+              gpuType: DGX2-V100
+              gpuNumber: 1
+              affinityGroup: null
+        spec:
+          # [PREREQUISITE]
+          # Do not specify the schedulerName if the HivedScheduler is directly
+          # called by the k8s default scheduler.
+          schedulerName: hivedscheduler
+          restartPolicy: Never
+          # [PREREQUISITE]
+          # User needs to set up the k8s cluster networking model and be aware
+          # of the potential network overhead, if they want to disable the
+          # hostNetwork to avoid coordinating the containerPort usage.
+          # For this example, if the hostNetwork is disabled, only 1 node is
+          # needed; otherwise, at least 3 nodes are needed, since all 3
+          # workers are specified with the same containerPort.
+          # See https://kubernetes.io/docs/concepts/cluster-administration/networking
+          hostNetwork: false
+          containers:
+          - name: tensorflow
+            # Using the official image to demonstrate this example.
+            # The image contains, and only contains, the official TensorFlow code.
+            image: frameworkcontroller/tensorflow-examples:gpu
+            # For the tf_cnn_benchmarks usage, see
+            # https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks
+            workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
+            # Using /mnt/frameworkbarrier/injector.sh to inject environment
+            # variables, without the need to modify the image or rely on k8s DNS:
+            # FB_{UpperCase({TaskRoleName})}_ADDRESSES=
+            #   {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
+            #   {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
+            # See more in ./example/framework/extension/frameworkbarrier.yaml
+            command: [
+              "sh", "-c",
+              "FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
+              python tf_cnn_benchmarks.py --job_name=ps --task_index=${FC_TASK_INDEX}
+              --ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
+              --variable_update=parameter_server --cross_replica_sync=false
+              --model=alexnet --batch_size=8 --num_batches=10
+              --device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
+              --data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
+              --train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
+            ports:
+            - containerPort: 4001
+            resources:
+              limits:
+                # [PREREQUISITE]
+                # User needs to set up HivedScheduler for the k8s cluster.
+                # See https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler
+                hivedscheduler.microsoft.com/pod-scheduling-enable: 1
+                cpu: 3
+                memory: 96Gi
+            env:
+            - name: NVIDIA_VISIBLE_DEVICES
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+            - name: data-volume
+              mountPath: /mnt/data
+          # [PREREQUISITE]
+          # User needs to create a service account for frameworkbarrier, if the
+          # k8s cluster enforces authorization.
+          # See more in ./example/framework/extension/frameworkbarrier.yaml
+          serviceAccountName: frameworkbarrier
+          initContainers:
+          - name: frameworkbarrier
+            # Using the official image to demonstrate this example.
+            image: frameworkcontroller/frameworkbarrier
+            # Using k8s inClusterConfig, so usually, there is no need to specify
+            # KUBE_APISERVER_ADDRESS or KUBECONFIG
+            #env:
+            #- name: KUBE_APISERVER_ADDRESS
+            #  value: {http[s]://host:port}
+            #- name: KUBECONFIG
+            #  value: {Pod Local KubeConfig File Path}
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+          volumes:
+          - name: frameworkbarrier-volume
+            emptyDir: {}
+          - name: data-volume
+            # [PREREQUISITE]
+            # User needs to specify their own data-volume for input data and
+            # output model.
+            # The data-volume must be a distributed shared file system, such as
+            # nfs, cephfs or glusterfs, so that data can be "handed off"
+            # between Pods.
+            # See https://kubernetes.io/docs/concepts/storage/volumes.
+            #
+            # Then download and extract the example input data from:
+            #   https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+            # to:
+            #   {Volume Shared Directory}/cifar-10-batches-py
+            #
+            # For example:
+            #nfs:
+            #  server: {NFS Server Host}
+            #  path: {NFS Shared Directory}
+  - name: worker
+    taskNumber: 3
+    frameworkAttemptCompletionPolicy:
+      minFailedTaskCount: 1
+      # Succeed the FrameworkAttempt immediately if all the worker's Tasks succeeded.
+      minSucceededTaskCount: 3
+    task:
+      retryPolicy:
+        fancyRetryPolicy: false
+        maxRetryCount: 0
+      pod:
+        metadata:
+          annotations:
+            hivedscheduler.microsoft.com/pod-scheduling-spec: |-
+              virtualCluster: VC2
+              priority: 1000
+              gpuType: DGX2-V100
+              gpuNumber: 1
+              affinityGroup: null
+        spec:
+          # [PREREQUISITE]
+          # Same as ps TaskRole.
+          schedulerName: hivedscheduler
+          restartPolicy: Never
+          # [PREREQUISITE]
+          # Same as ps TaskRole.
+          hostNetwork: false
+          containers:
+          - name: tensorflow
+            image: frameworkcontroller/tensorflow-examples:gpu
+            workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
+            command: [
+              "sh", "-c",
+              "FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
+              python tf_cnn_benchmarks.py --job_name=worker --task_index=${FC_TASK_INDEX}
+              --ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
+              --variable_update=parameter_server --cross_replica_sync=false
+              --model=alexnet --batch_size=8 --num_batches=10
+              --device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
+              --data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
+              --train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
+            ports:
+            - containerPort: 5001
+            resources:
+              limits:
+                # [PREREQUISITE]
+                # Same as ps TaskRole.
+                hivedscheduler.microsoft.com/pod-scheduling-enable: 1
+                cpu: 3
+                memory: 96Gi
+            env:
+            - name: NVIDIA_VISIBLE_DEVICES
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+            - name: data-volume
+              mountPath: /mnt/data
+          # [PREREQUISITE]
+          # Same as ps TaskRole.
+          serviceAccountName: frameworkbarrier
+          initContainers:
+          - name: frameworkbarrier
+            image: frameworkcontroller/frameworkbarrier
+            #env:
+            #- name: KUBE_APISERVER_ADDRESS
+            #  value: {http[s]://host:port}
+            #- name: KUBECONFIG
+            #  value: {Pod Local KubeConfig File Path}
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+          volumes:
+          - name: frameworkbarrier-volume
+            emptyDir: {}
+          - name: data-volume
+            # [PREREQUISITE]
+            # Same as ps TaskRole.
+            #nfs:
+            #  server: {NFS Server Host}
+            #  path: {NFS Shared Directory}
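Once the `[PREREQUISITE]` items above are satisfied (HivedScheduler deployed, the frameworkbarrier service account created, and a data-volume specified), one plausible way to exercise the new example is sketched below; the pod naming scheme in the comments is an assumption about FrameworkController's conventions rather than something this patch defines:

    # Submit the Framework.
    kubectl apply -f example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml

    # Inspect the Framework status and its per-Task pods
    # (pods are typically named {FrameworkName}-{TaskRoleName}-{TaskIndex}).
    kubectl get framework tensorflowdistributedtrainingwithhivedscheduledgpu -o yaml
    kubectl get pods | grep tensorflowdistributedtrainingwithhivedscheduledgpu

    # Follow the first worker's training log.
    kubectl logs -f tensorflowdistributedtrainingwithhivedscheduledgpu-worker-0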