From 80492e5c53073e38de07c5ce9c3f37c517395cbd Mon Sep 17 00:00:00 2001
From: Yuqi Wang <32826762+yqwang-ms@users.noreply.github.com>
Date: Fri, 2 Aug 2019 12:19:32 +0800
Subject: [PATCH] Add TensorFlow Example to leverage HivedScheduler (#34)

---
 README.md                                     |   5 +
 .../framework/scenario/tensorflow/README.md   |   3 +-
 ...butedtrainingwithdefaultscheduledgpu.yaml} |   2 +-
 ...tributedtrainingwithhivedscheduledgpu.yaml | 217 ++++++++++++++++++
 4 files changed, 225 insertions(+), 2 deletions(-)
 rename example/framework/scenario/tensorflow/gpu/{tensorflowdistributedtrainingwithgpu.yaml => tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml} (99%)
 create mode 100644 example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml

diff --git a/README.md b/README.md
index 565752b..d710059 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,11 @@ A specialized wrapper can be built on top of FrameworkController to optimize for
 * [OpenPAI Controller Wrapper (Job RestServer)](https://github.com/microsoft/pai/tree/master/src/rest-server): A wrapper client optimized for AI applications
 * [NNI Controller Wrapper (TrainingService)](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/FrameworkControllerMode.md): A wrapper client optimized for AutoML applications
 
+### Recommended Kubernetes Scheduler
+FrameworkController can directly leverage many [Kubernetes Schedulers](https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers), and among them we recommend the following best fits:
+* [Kubernetes Default Scheduler](https://kubernetes.io/docs/concepts/scheduling/kube-scheduler/#kube-scheduler): A General-Purpose Kubernetes Scheduler
+* [HivedScheduler](https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler): A Kubernetes Scheduler Extender optimized for GPUs ([Example](example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml))
+
 ### Similar Offering On Other Cluster Manager
 * [YARN FrameworkLauncher](https://github.com/Microsoft/pai/blob/master/subprojects/frameworklauncher/yarn): Similar offering natively supports [Apache YARN](http://hadoop.apache.org)
 
diff --git a/example/framework/scenario/tensorflow/README.md b/example/framework/scenario/tensorflow/README.md
index 8f0a685..31ab4a7 100644
--- a/example/framework/scenario/tensorflow/README.md
+++ b/example/framework/scenario/tensorflow/README.md
@@ -5,7 +5,8 @@
 2. Automatically clean up PS when the whole FrameworkAttempt is completed
 3. No need to adjust existing TensorFlow image
 4. No need to setup [Kubernetes DNS](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service) and [Kubernetes Service](https://kubernetes.io/docs/concepts/services-networking/service)
-5. [Common Feature](../../../../README.md#Feature)
+5. Easy to leverage [HivedScheduler](https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler) for GPU multi-tenant and topology-aware scheduling
+6. [Common Feature](../../../../README.md#Feature)
 
 ## Prerequisite
 1. See `[PREREQUISITE]` in each specific Framework yaml file.
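For README readers, the HivedScheduler opt-in boils down to three pod-level settings, distilled from the full example file this patch adds below. The scheduler name `hivedscheduler`, the `pod-scheduling-spec` annotation, and the `pod-scheduling-enable` resource limit are the values that example uses; a particular deployment may configure different ones, and `app` here is a placeholder container name:

    # Minimal sketch of a TaskRole pod template scheduled by HivedScheduler:
    pod:
      metadata:
        annotations:
          # The scheduling request interpreted by HivedScheduler.
          hivedscheduler.microsoft.com/pod-scheduling-spec: |-
            virtualCluster: VC2
            priority: 1000
            gpuType: DGX2-V100
            gpuNumber: 1
            affinityGroup: null
      spec:
        # Route the pod to HivedScheduler instead of the k8s default scheduler.
        schedulerName: hivedscheduler
        containers:
        - name: app
          resources:
            limits:
              # Marks the pod so HivedScheduler handles its placement.
              hivedscheduler.microsoft.com/pod-scheduling-enable: 1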
diff --git a/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithgpu.yaml b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
similarity index 99%
rename from example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithgpu.yaml
rename to example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
index ef4ccc1..209e4ef 100644
--- a/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithgpu.yaml
+++ b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithdefaultscheduledgpu.yaml
@@ -7,7 +7,7 @@
 apiVersion: frameworkcontroller.microsoft.com/v1
 kind: Framework
 metadata:
-  name: tensorflowdistributedtrainingwithgpu
+  name: tensorflowdistributedtrainingwithdefaultscheduledgpu
 spec:
   executionType: Start
   retryPolicy:
diff --git a/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml
new file mode 100644
index 0000000..8c9843b
--- /dev/null
+++ b/example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml
@@ -0,0 +1,217 @@
+# For the full spec setting and usage, see ./pkg/apis/frameworkcontroller/v1/types.go
+# For the full frameworkbarrier usage, see ./pkg/barrier/barrier.go
+
+############################### Prerequisite ###################################
+# See "[PREREQUISITE]" in this file.
+################################################################################
+apiVersion: frameworkcontroller.microsoft.com/v1
+kind: Framework
+metadata:
+  name: tensorflowdistributedtrainingwithhivedscheduledgpu
+spec:
+  executionType: Start
+  retryPolicy:
+    fancyRetryPolicy: true
+    maxRetryCount: 2
+  taskRoles:
+  - name: ps
+    taskNumber: 2
+    frameworkAttemptCompletionPolicy:
+      minFailedTaskCount: 1
+      minSucceededTaskCount: -1
+    task:
+      retryPolicy:
+        fancyRetryPolicy: false
+        maxRetryCount: 0
+      pod:
+        metadata:
+          annotations:
+            hivedscheduler.microsoft.com/pod-scheduling-spec: |-
+              virtualCluster: VC2
+              priority: 1000
+              gpuType: DGX2-V100
+              gpuNumber: 1
+              affinityGroup: null
+        spec:
+          # [PREREQUISITE]
+          # Do not specify the schedulerName if the HivedScheduler is directly
+          # called by the k8s default scheduler.
+          schedulerName: hivedscheduler
+          restartPolicy: Never
+          # [PREREQUISITE]
+          # User needs to set up the k8s cluster networking model and be aware
+          # of the potential network overhead, if they want to disable the
+          # hostNetwork to avoid coordinating the containerPort usage.
+          # For this example, if the hostNetwork is disabled, only 1 node is
+          # needed; otherwise, at least 3 nodes are needed, since all 3
+          # workers are specified with the same containerPort.
+          # See https://kubernetes.io/docs/concepts/cluster-administration/networking
+          hostNetwork: false
+          containers:
+          - name: tensorflow
+            # Using the official image to demonstrate this example.
+            # The image contains, and only contains, the official TensorFlow code.
+            image: frameworkcontroller/tensorflow-examples:gpu
+            # For the tf_cnn_benchmarks usage, see
+            # https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks
+            workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
+            # Using /mnt/frameworkbarrier/injector.sh to inject environment
+            # variables, without the need to modify the image or rely on k8s DNS:
+            # FB_{UpperCase({TaskRoleName})}_ADDRESSES=
+            #   {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
+            #   {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
+            # See more in ./example/framework/extension/frameworkbarrier.yaml
+            command: [
+              "sh", "-c",
+              "FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
+              python tf_cnn_benchmarks.py --job_name=ps --task_index=${FC_TASK_INDEX}
+              --ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
+              --variable_update=parameter_server --cross_replica_sync=false
+              --model=alexnet --batch_size=8 --num_batches=10
+              --device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
+              --data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
+              --train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
+            ports:
+            - containerPort: 4001
+            resources:
+              limits:
+                # [PREREQUISITE]
+                # User needs to set up HivedScheduler for the k8s cluster.
+                # See https://github.com/microsoft/pai/tree/master/subprojects/hivedscheduler
+                hivedscheduler.microsoft.com/pod-scheduling-enable: 1
+                cpu: 3
+                memory: 96Gi
+            env:
+            - name: NVIDIA_VISIBLE_DEVICES
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+            - name: data-volume
+              mountPath: /mnt/data
+          # [PREREQUISITE]
+          # User needs to create a service account for frameworkbarrier, if the
+          # k8s cluster enforces authorization.
+          # See more in ./example/framework/extension/frameworkbarrier.yaml
+          serviceAccountName: frameworkbarrier
+          initContainers:
+          - name: frameworkbarrier
+            # Using the official image to demonstrate this example.
+            image: frameworkcontroller/frameworkbarrier
+            # Using k8s inClusterConfig, so usually, there is no need to specify
+            # KUBE_APISERVER_ADDRESS or KUBECONFIG
+            #env:
+            #- name: KUBE_APISERVER_ADDRESS
+            #  value: {http[s]://host:port}
+            #- name: KUBECONFIG
+            #  value: {Pod Local KubeConfig File Path}
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+          volumes:
+          - name: frameworkbarrier-volume
+            emptyDir: {}
+          - name: data-volume
+            # [PREREQUISITE]
+            # User needs to specify their own data-volume for input data and
+            # output model.
+            # The data-volume must be a distributed shared file system, such as
+            # nfs, cephfs or glusterfs, so that data can be "handed off"
+            # between Pods.
+            # See https://kubernetes.io/docs/concepts/storage/volumes.
+            #
+            # Then download and extract the example input data from:
+            #   https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+            # to:
+            #   {Volume Shared Directory}/cifar-10-batches-py
+            #
+            # For example:
+            #nfs:
+            #  server: {NFS Server Host}
+            #  path: {NFS Shared Directory}
+  - name: worker
+    taskNumber: 3
+    frameworkAttemptCompletionPolicy:
+      minFailedTaskCount: 1
+      # Succeed the FrameworkAttempt immediately if all the worker's Tasks succeeded.
+      minSucceededTaskCount: 3
+    task:
+      retryPolicy:
+        fancyRetryPolicy: false
+        maxRetryCount: 0
+      pod:
+        metadata:
+          annotations:
+            hivedscheduler.microsoft.com/pod-scheduling-spec: |-
+              virtualCluster: VC2
+              priority: 1000
+              gpuType: DGX2-V100
+              gpuNumber: 1
+              affinityGroup: null
+        spec:
+          # [PREREQUISITE]
+          # Same as ps TaskRole.
+          schedulerName: hivedscheduler
+          restartPolicy: Never
+          # [PREREQUISITE]
+          # Same as ps TaskRole.
+          hostNetwork: false
+          containers:
+          - name: tensorflow
+            image: frameworkcontroller/tensorflow-examples:gpu
+            workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
+            command: [
+              "sh", "-c",
+              "FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
+              python tf_cnn_benchmarks.py --job_name=worker --task_index=${FC_TASK_INDEX}
+              --ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
+              --variable_update=parameter_server --cross_replica_sync=false
+              --model=alexnet --batch_size=8 --num_batches=10
+              --device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
+              --data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
+              --train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
+            ports:
+            - containerPort: 5001
+            resources:
+              limits:
+                # [PREREQUISITE]
+                # Same as ps TaskRole.
+                hivedscheduler.microsoft.com/pod-scheduling-enable: 1
+                cpu: 3
+                memory: 96Gi
+            env:
+            - name: NVIDIA_VISIBLE_DEVICES
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.annotations['hivedscheduler.microsoft.com/pod-gpu-isolation']
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+            - name: data-volume
+              mountPath: /mnt/data
+          # [PREREQUISITE]
+          # Same as ps TaskRole.
+          serviceAccountName: frameworkbarrier
+          initContainers:
+          - name: frameworkbarrier
+            image: frameworkcontroller/frameworkbarrier
+            #env:
+            #- name: KUBE_APISERVER_ADDRESS
+            #  value: {http[s]://host:port}
+            #- name: KUBECONFIG
+            #  value: {Pod Local KubeConfig File Path}
+            volumeMounts:
+            - name: frameworkbarrier-volume
+              mountPath: /mnt/frameworkbarrier
+          volumes:
+          - name: frameworkbarrier-volume
+            emptyDir: {}
+          - name: data-volume
+            # [PREREQUISITE]
+            # Same as ps TaskRole.
+            #nfs:
+            #  server: {NFS Server Host}
+            #  path: {NFS Shared Directory}
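Once the `[PREREQUISITE]` items above are satisfied (HivedScheduler deployed, the frameworkbarrier service account created, and a data-volume specified), one plausible way to exercise the new example is sketched below; the pod naming scheme in the comments is an assumption about FrameworkController's conventions rather than something this patch defines:

    # Submit the Framework.
    kubectl apply -f example/framework/scenario/tensorflow/gpu/tensorflowdistributedtrainingwithhivedscheduledgpu.yaml

    # Inspect the Framework status and its per-Task pods
    # (pods are typically named {FrameworkName}-{TaskRoleName}-{TaskIndex}).
    kubectl get framework tensorflowdistributedtrainingwithhivedscheduledgpu -o yaml
    kubectl get pods | grep tensorflowdistributedtrainingwithhivedscheduledgpu

    # Follow the first worker's training log.
    kubectl logs -f tensorflowdistributedtrainingwithhivedscheduledgpu-worker-0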