Mirror of https://github.com/microsoft/pai.git
[quick-start] hived config generator's log update and support amd device (#4253)
This commit is contained in:
Parent
2386c93313
Commit
a278fb5d42
@@ -21,7 +21,7 @@
   copy:
     src: "{{ kube_admin_conf_path }}"
     dest: "{{ kubeconfig_path }}/config"
-    mode: preserve
+    mode: "0644"
     backup: yes
 
 - name: run the equivalent of "apt-get update" as a separate step
@@ -84,6 +84,43 @@ def generate_template_file(template_file_path, output_path, map_table):
     write_generated_file(output_path, generated_template)
 
 
+def pod_is_ready_or_not(label_key, label_value, service_name):
+    label_selector_str = "{0}={1}".format(label_key, label_value)
+
+    config.load_kube_config()
+    v1 = client.CoreV1Api()
+
+    try:
+        pod_list = v1.list_pod_for_all_namespaces(label_selector=label_selector_str, watch=False)
+    except ApiException as e:
+        logger.error("Exception when calling CoreV1Api->list_pod_for_all_namespaces: %s\n" % e)
+        return False
+
+    if len(pod_list.items) == 0:
+        logger.warning("No pod can be detected.")
+        return False
+
+    ready = 0
+    unready = 0
+    for pod in pod_list.items:
+        # A pod with no container statuses yet counts as unready; skip it so
+        # the inner loop does not iterate over None.
+        if pod.status.container_statuses is None:
+            unready = unready + 1
+            continue
+        for container in pod.status.container_statuses:
+            if not container.ready:
+                unready = unready + 1
+            else:
+                ready = ready + 1
+
+    if unready != 0:
+        logger.info("{0} is not ready.".format(service_name))
+        logger.info("Total: {0}".format(ready + unready))
+        logger.info("Ready: {0}".format(ready))
+        return False
+
+    return True
+
+
 def get_kubernetes_node_info_from_API():
     config.load_kube_config()
     api_instance = client.CoreV1Api()
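Note: the readiness check above uses the official kubernetes Python client. A minimal standalone sketch of the same label-selector query (assuming `pip3 install kubernetes`, a reachable cluster, and a local kubeconfig; the label value is the one the NVIDIA device plugin DaemonSet puts on its pods):

from kubernetes import client, config

# Load credentials from ~/.kube/config, as the generator script does.
config.load_kube_config()
v1 = client.CoreV1Api()

# List pods carrying the device-plugin label and report per-pod readiness.
pods = v1.list_pod_for_all_namespaces(label_selector="name=nvidia-device-plugin-ds", watch=False)
for pod in pods.items:
    statuses = pod.status.container_statuses or []
    print(pod.metadata.name, all(c.ready for c in statuses))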
@@ -99,6 +136,8 @@ def get_kubernetes_node_info_from_API():
         gpu_resource = 0
         if 'nvidia.com/gpu' in node.status.allocatable:
             gpu_resource = int(parse_quantity(node.status.allocatable['nvidia.com/gpu']))
+        if 'amd.com/gpu' in node.status.allocatable:
+            gpu_resource = int(parse_quantity(node.status.allocatable['amd.com/gpu']))
         ret[node.metadata.name] = {
             "cpu-resource": int(parse_quantity(node.status.allocatable['cpu'])),
             "mem-resource": int(parse_quantity(node.status.allocatable['memory']) / 1024 / 1024),
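parse_quantity (from kubernetes.utils) normalizes Kubernetes quantity strings such as "250m" or "16Gi" to their base unit as a Decimal, so memory comes back in bytes; that is why the generator divides by 1024 twice to get MiB. A quick illustration, assuming a recent kubernetes client:

from kubernetes.utils import parse_quantity

print(parse_quantity("250m"))   # Decimal('0.25')  -- fractional CPU
print(parse_quantity("1Ki"))    # Decimal('1024')  -- bytes

# Allocatable memory as the kubelet reports it, converted to MiB:
mem_bytes = parse_quantity("16398536Ki")
print(int(mem_bytes / 1024 / 1024))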
@@ -110,6 +149,26 @@ def get_kubernetes_node_info_from_API():
     return ret
 
 
+def wait_nvidia_device_plugin_ready(total_time=3600):
+    while not pod_is_ready_or_not("name", "nvidia-device-plugin-ds", "Nvidia-Device-Plugin"):
+        logger.info("Nvidia-Device-Plugin is not ready yet. Please wait for a moment!")
+        time.sleep(10)
+        total_time = total_time - 10
+        if total_time < 0:
+            logger.error("An issue occurred when starting up Nvidia-Device-Plugin")
+            sys.exit(1)
+
+
+def wait_amd_device_plugin_ready(total_time=3600):
+    while not pod_is_ready_or_not("name", "amdgpu-dp-ds", "AMD-Device-Plugin"):
+        logger.info("AMD-Device-Plugin is not ready yet. Please wait for a moment!")
+        time.sleep(10)
+        total_time = total_time - 10
+        if total_time < 0:
+            logger.error("An issue occurred when starting up AMD-Device-Plugin")
+            sys.exit(1)
+
+
 def hived_config_prepare(worker_dict, node_resource_dict):
     hived_config = dict()
     hived_config["nodelist"] = []
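The two wait functions above are identical except for the label value and display name, so they could share one helper. A possible refactor (hypothetical, not part of this commit; pod_is_ready_or_not and logger are the ones defined in this script):

import sys
import time

def wait_device_plugin_ready(label_value, service_name, total_time=3600):
    # Poll every 10 seconds until the plugin's pods are ready, or exit
    # once the total_time budget (in seconds) is exhausted.
    while not pod_is_ready_or_not("name", label_value, service_name):
        logger.info("{0} is not ready yet. Please wait for a moment!".format(service_name))
        time.sleep(10)
        total_time = total_time - 10
        if total_time < 0:
            logger.error("An issue occurred when starting up {0}".format(service_name))
            sys.exit(1)

# Equivalent calls:
# wait_device_plugin_ready("nvidia-device-plugin-ds", "Nvidia-Device-Plugin")
# wait_device_plugin_ready("amdgpu-dp-ds", "AMD-Device-Plugin")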
@@ -122,8 +181,8 @@ def hived_config_prepare(worker_dict, node_resource_dict):
         if key not in worker_dict:
             continue
         if node_resource_dict[key]["gpu-resource"] == 0:
-            logger.error("Allocatable GPU number in {0} is 0, Hived doesn't support worker node with 0 GPU".format(key))
-            logger.error("Please remove {0} from your workerlist, or check if the NVIDIA device plugin is running healthy on the node.".format(key))
+            logger.error("Allocatable GPU number in {0} is 0, which the current quick-start script does not allow.".format(key))
+            logger.error("Please remove {0} from your workerlist, or check whether the device plugin is healthy on the node.".format(key))
             sys.exit(1)
         min_cpu = min(min_cpu, node_resource_dict[key]["cpu-resource"])
         min_mem = min(min_mem, node_resource_dict[key]["mem-resource"])
@@ -159,6 +218,8 @@ def main():
     head_node = master_list[0]
 
     worker_dict = csv_reader_ret_dict(args.worklist)
+    wait_nvidia_device_plugin_ready()
+    wait_amd_device_plugin_ready()
     node_resource_dict = get_kubernetes_node_info_from_API()
     hived_config = hived_config_prepare(worker_dict, node_resource_dict)
 
@@ -56,10 +56,20 @@ echo "branch name: ${OPENPAI_BRANCH_NAME}"
 git checkout ${OPENPAI_BRANCH_NAME}
 git pull
 
+echo "starting nvidia device plugin to detect nvidia gpu resource"
+kubectl apply --overwrite=true -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta4/nvidia-device-plugin.yml || exit $?
+sleep 5
+
+echo "starting AMD device plugin to detect AMD gpu resource"
+kubectl apply --overwrite=true -f https://raw.githubusercontent.com/RadeonOpenCompute/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml || exit $?
+sleep 5
+
 python3 /root/pai/contrib/kubespray/script/openpai-generator.py -m /quick-start-config/master.csv -w /quick-start-config/worker.csv -c /quick-start-config/config.yml -o /cluster-configuration || exit $?
 
 kubectl delete ds nvidia-device-plugin-daemonset -n kube-system || exit $?
+kubectl delete ds amdgpu-device-plugin-daemonset -n kube-system || exit $?
 sleep 5
 
 echo y | pip3 uninstall kubernetes==11.0.0b2
 pip3 install kubernetes
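If the generator keeps timing out in its wait loops, the plugin DaemonSets can be inspected from Python with the same kubernetes client. A sketch; the DaemonSet names match the kubectl delete commands above:

from kubernetes import client, config

config.load_kube_config()
apps = client.AppsV1Api()

# Report ready vs. desired pod counts for each device-plugin DaemonSet.
for name in ("nvidia-device-plugin-daemonset", "amdgpu-device-plugin-daemonset"):
    try:
        ds = apps.read_namespaced_daemon_set(name, "kube-system")
        print(name, "ready:", ds.status.number_ready, "/", ds.status.desired_number_scheduled)
    except client.rest.ApiException as e:
        print(name, "not found or inaccessible:", e.status)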
@@ -6,7 +6,7 @@
   setup:
   delegate_to: localhost
 
-- name: set ansible control host IP fact
+- name: set local user's home path
   set_fact:
     local_home_path: "{{ hostvars[inventory_hostname]['ansible_env']['HOME'] }}"
   delegate_to: 127.0.0.1