[quick-start] hived config generator's log update and support amd device (#4253)

This commit is contained in:
YundongYe 2020-03-07 14:07:39 +08:00 committed by GitHub
Parent 2386c93313
Commit a278fb5d42
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 75 additions and 4 deletions

View File

@@ -21,7 +21,7 @@
  copy:
    src: "{{ kube_admin_conf_path }}"
    dest: "{{ kubeconfig_path }}/config"
    mode: preserve
    mode: "0644"
    backup: yes
- name: run the equivalent of "apt-get update" as a separate step

View File

@@ -84,6 +84,43 @@ def generate_template_file(template_file_path, output_path, map_table):
    write_generated_file(output_path, generated_template)

def pod_is_ready_or_not(label_key, label_value, service_name):

    label_selector_str = "{0}={1}".format(label_key, label_value)

    config.load_kube_config()
    v1 = client.CoreV1Api()
    try:
        pod_list = v1.list_pod_for_all_namespaces(label_selector=label_selector_str, watch=False)
    except ApiException as e:
        logger.error("Exception when calling CoreV1Api->list_pod_for_all_namespaces: %s\n" % e)
        return False

    if len(pod_list.items) == 0:
        logger.warning("No pod can be detected.")
        return False

    ready = 0
    unready = 0
    for pod in pod_list.items:
        # A pod with no container statuses yet counts as unready.
        if pod.status.container_statuses is None:
            unready = unready + 1
            continue
        for container in pod.status.container_statuses:
            if not container.ready:
                unready = unready + 1
            else:
                ready = ready + 1

    if unready != 0:
        logger.info("{0} is not ready.".format(service_name))
        logger.info("Total: {0}".format(ready + unready))
        logger.info("Ready: {0}".format(ready))
        return False

    return True
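
For comparison, the same readiness test can be written more compactly. The sketch below is not part of the commit; it assumes the same kubernetes Python client used above and omits the ApiException handling and logging for brevity.

from kubernetes import client, config

def all_pods_ready(label_key, label_value):
    # Sketch only: equivalent check to pod_is_ready_or_not, without logging.
    config.load_kube_config()
    v1 = client.CoreV1Api()
    pods = v1.list_pod_for_all_namespaces(
        label_selector="{0}={1}".format(label_key, label_value), watch=False).items
    # An empty pod list, or a pod without container statuses, counts as "not ready",
    # matching the behaviour of the function above.
    return bool(pods) and all(
        pod.status.container_statuses is not None
        and all(c.ready for c in pod.status.container_statuses)
        for pod in pods
    )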
def get_kubernetes_node_info_from_API():
    config.load_kube_config()
    api_instance = client.CoreV1Api()
@@ -99,6 +136,8 @@ def get_kubernetes_node_info_from_API():
        gpu_resource = 0
        if 'nvidia.com/gpu' in node.status.allocatable:
            gpu_resource = int(parse_quantity(node.status.allocatable['nvidia.com/gpu']))
        if 'amd.com/gpu' in node.status.allocatable:
            gpu_resource = int(parse_quantity(node.status.allocatable['amd.com/gpu']))
        ret[node.metadata.name] = {
            "cpu-resource": int(parse_quantity(node.status.allocatable['cpu'])),
            "mem-resource": int(parse_quantity(node.status.allocatable['memory']) / 1024 / 1024),
@@ -110,6 +149,26 @@ def get_kubernetes_node_info_from_API():
    return ret
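
The per-node map returned here is what hived_config_prepare consumes below. Its shape is roughly as follows; the node name and numbers are illustrative only, and the "gpu-resource" key is implied by the gpu_resource computation above and by the lookup in hived_config_prepare.

# Illustrative example of the return value (values are made up):
node_resource_dict = {
    "worker-node-1": {
        "cpu-resource": 24,        # allocatable CPUs
        "mem-resource": 196608,    # allocatable memory in MiB
        "gpu-resource": 4,         # nvidia.com/gpu or amd.com/gpu count
    },
}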
def wait_nvidia_device_plugin_ready(total_time=3600):
    while not pod_is_ready_or_not("name", "nvidia-device-plugin-ds", "Nvidia-Device-Plugin"):
        logger.info("Nvidia-Device-Plugin is not ready yet. Please wait for a moment!")
        time.sleep(10)
        total_time = total_time - 10
        if total_time < 0:
            logger.error("An issue occurred when starting up Nvidia-Device-Plugin")
            sys.exit(1)

def wait_amd_device_plugin_ready(total_time=3600):
    while not pod_is_ready_or_not("name", "amdgpu-dp-ds", "AMD-Device-Plugin"):
        logger.info("AMD-Device-Plugin is not ready yet. Please wait for a moment!")
        time.sleep(10)
        total_time = total_time - 10
        if total_time < 0:
            logger.error("An issue occurred when starting up AMD-Device-Plugin")
            sys.exit(1)
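
The two wait helpers differ only in the pod label value and the display name. A single parameterized helper could cover both plugins; the sketch below is a refactoring suggestion, not part of the commit, and reuses logger and pod_is_ready_or_not from this module.

import sys
import time

def wait_device_plugin_ready(label_value, service_name, total_time=3600):
    # Generic form of the two loops above; label_value is the pod label value
    # ("nvidia-device-plugin-ds" or "amdgpu-dp-ds").
    while not pod_is_ready_or_not("name", label_value, service_name):
        logger.info("{0} is not ready yet. Please wait for a moment!".format(service_name))
        time.sleep(10)
        total_time = total_time - 10
        if total_time < 0:
            logger.error("An issue occurred when starting up {0}".format(service_name))
            sys.exit(1)

# Usage would mirror the existing helpers:
# wait_device_plugin_ready("nvidia-device-plugin-ds", "Nvidia-Device-Plugin")
# wait_device_plugin_ready("amdgpu-dp-ds", "AMD-Device-Plugin")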
def hived_config_prepare(worker_dict, node_resource_dict):
    hived_config = dict()
    hived_config["nodelist"] = []
@@ -122,8 +181,8 @@ def hived_config_prepare(worker_dict, node_resource_dict):
        if key not in worker_dict:
            continue
        if node_resource_dict[key]["gpu-resource"] == 0:
            logger.error("Allocatable GPU number in {0} is 0, Hived doesn't support worker node with 0 GPU".format(key))
            logger.error("Please remove {0} from your workerlist, or check if the NVIDIA device plugin is running healthy on the node.".format(key))
            logger.error("Allocatable GPU number in {0} is 0, which the current quick-start script does not allow.".format(key))
            logger.error("Please remove {0} from your workerlist, or check if the device plugin is running healthy on the node.".format(key))
            sys.exit(1)
        min_cpu = min(min_cpu, node_resource_dict[key]["cpu-resource"])
        min_mem = min(min_mem, node_resource_dict[key]["mem-resource"])
@@ -159,6 +218,8 @@ def main():
    head_node = master_list[0]
    worker_dict = csv_reader_ret_dict(args.worklist)
    wait_nvidia_device_plugin_ready()
    wait_amd_device_plugin_ready()
    node_resource_dict = get_kubernetes_node_info_from_API()
    hived_config = hived_config_prepare(worker_dict, node_resource_dict)

View File

@@ -56,10 +56,20 @@ echo "branch name: ${OPENPAI_BRANCH_NAME}"
git checkout ${OPENPAI_BRANCH_NAME}
git pull
echo "starting nvidia device plugin to detect nvidia gpu resource"
kubectl apply --overwrite=true -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta4/nvidia-device-plugin.yml || exit $?
sleep 5
echo "starting AMD device plugin to detect AMD gpu resource"
kubectl apply --overwrite=true -f https://raw.githubusercontent.com/RadeonOpenCompute/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml || exit $?
sleep 5
python3 /root/pai/contrib/kubespray/script/openpai-generator.py -m /quick-start-config/master.csv -w /quick-start-config/worker.csv -c /quick-start-config/config.yml -o /cluster-configuration || exit $?
kubectl delete ds nvidia-device-plugin-daemonset -n kube-system || exit $?
kubectl delete ds amdgpu-device-plugin-daemonset -n kube-system || exit $?
sleep 5
echo y | pip3 uninstall kubernetes==11.0.0b2
pip3 install kubernetes
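
To manually confirm that the two DaemonSets applied above are fully scheduled before the generator script runs, a small check with the kubernetes Python client (the same client the generator uses) could look like the following sketch; the DaemonSet names and namespace are taken from the kubectl delete commands above.

from kubernetes import client, config

# Sketch only: print the ready/desired pod counts of both device-plugin DaemonSets.
config.load_kube_config()
apps = client.AppsV1Api()
for name in ("nvidia-device-plugin-daemonset", "amdgpu-device-plugin-daemonset"):
    ds = apps.read_namespaced_daemon_set(name, "kube-system")
    print("{0}: {1}/{2} ready".format(
        name, ds.status.number_ready, ds.status.desired_number_scheduled))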

View File

@@ -6,7 +6,7 @@
  setup:
  delegate_to: localhost
- name: set ansible control host IP fact
- name: set local user's home path
  set_fact:
    local_home_path: "{{ hostvars[inventory_hostname]['ansible_env']['HOME'] }}"
  delegate_to: 127.0.0.1