From 388bd4f9451b26d83707f80a27432dd7d4365782 Mon Sep 17 00:00:00 2001 From: Binyang2014 Date: Fri, 22 May 2020 11:35:48 +0800 Subject: [PATCH] Use hash func to generate port number (#11) Refer to issue: microsoft/pai#4384 Using hash function: (int(md5(podUid + taskPortName + taskPortIndex)[0:12] ,16) + int(md5(podUid + taskPortName + taskPortIndex)[12:24] ,16) + int(md5(podUid + taskPortName + taskPortIndex)[24:32] ,16)) % (globalPortEnd - globalPortStart) + globalPortStart to generate the port number. If a port conflict happens, the task will fail. The retried task will have a different podUid, so the new task will be given a different port number. --- src/__init__.py | 0 src/init | 4 +- src/init.d/{parser.py => framework_parser.py} | 75 +- src/init.d/port.py | 5 + test/framework.json | 893 ++++++++++++++++++ test/test_framework_parser.py | 74 ++ 6 files changed, 1029 insertions(+), 22 deletions(-) create mode 100644 src/__init__.py rename src/init.d/{parser.py => framework_parser.py} (71%) create mode 100644 test/framework.json create mode 100644 test/test_framework_parser.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/init b/src/init index c45578e..a51cf8e 100644 --- a/src/init +++ b/src/init @@ -153,12 +153,12 @@ cp ${PAI_CONFIG_DIR}/runtime-exit-spec.yaml ${PAI_RUNTIME_DIR} # generate runtime env variables # priority=10 CHILD_PROCESS="ENV_GENERATOR" -python ${PAI_INIT_DIR}/parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh +python ${PAI_INIT_DIR}/framework_parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh # generate jobconfig # priority=11 CHILD_PROCESS="CONFIG_GENERATOR" -python ${PAI_INIT_DIR}/parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml +python ${PAI_INIT_DIR}/framework_parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml # Init plugins # priority=12 diff --git a/src/init.d/parser.py b/src/init.d/framework_parser.py similarity index
71% rename from src/init.d/parser.py rename to src/init.d/framework_parser.py index dfc85d7..bcceb76 100644 --- a/src/init.d/parser.py +++ b/src/init.d/framework_parser.py @@ -18,6 +18,7 @@ import argparse import base64 +import hashlib import logging import gzip import json @@ -42,6 +43,31 @@ def decompress_field(field): return obj +def generate_seq_ports_num(port_start, port_count, task_index): + base = port_start + port_count * task_index + return [str(port_num) for port_num in range(base, base + port_count)] + + +def generate_hashed_ports_num(pod_uid, port_name, port_count, port_start, + port_end): + """ Random generate the port number + + The algorithm is: + (int(md5(podUid + portName + portIndex)[0:12] ,16) + + int(md5(podUid + portName + portIndex)[12:24] ,16) + + int(md5(podUid + portName + portIndex)[24:32] ,16)) % (port_end - port_start) + port_start + """ + port_list = [] + for i in range(port_count): + raw_str = "[{}][{}][{}]".format(pod_uid, port_name, str(i)) + hash_str = hashlib.md5(raw_str.encode("utf8")).hexdigest() + port_list.append( + str((int(hash_str[:12], 16) + int(hash_str[12:24], 16) + + int(hash_str[24:], 16)) % (port_end - port_start) + + port_start)) + return port_list + + def generate_runtime_env(framework): #pylint: disable=too-many-locals """Generate runtime env variables for tasks. 
@@ -96,41 +122,50 @@ def generate_runtime_env(framework): #pylint: disable=too-many-locals for task in taskrole["taskStatuses"]: index = task["index"] current_ip = task["attemptStatus"]["podHostIP"] + pod_uid = task["attemptStatus"]["podUID"] + task_ports = {} taskrole_instances.append("{}:{}".format(name, index)) - get_port_base = lambda port_name, p=ports, i=index: int(p[ - port_name]["start"]) + int(p[port_name]["count"]) * int(i) + use_port_hash = True + if "ports" in ports and "schedulePortStart" in ports and "schedulePortEnd" in ports: + port_start = ports["schedulePortStart"] + port_end = ports["schedulePortEnd"] + port_list = ports["ports"] + else: + # for backward compatibility + use_port_hash = False + port_list = ports - # export ip/port for task role, current ip maybe None for non-gang-allocation - if current_ip: - export("PAI_HOST_IP_{}_{}".format(name, index), current_ip) - host_list.append("{}:{}".format(current_ip, - get_port_base("http"))) - - for port in ports.keys(): - start, count = get_port_base(port), int(ports[port]["count"]) - current_port_str = ",".join( - str(x) for x in range(start, start + count)) + for port in port_list.keys(): + count = int(port_list[port]["count"]) + task_ports[port] = generate_hashed_ports_num( + pod_uid, port, count, port_start, + port_end) if use_port_hash else generate_seq_ports_num( + port_list[port]["start"], count, index) + current_port_str = ",".join(task_ports[port]) export("PAI_PORT_LIST_{}_{}_{}".format(name, index, port), current_port_str) export("PAI_{}_{}_{}_PORT".format(name, index, port), current_port_str) + # export ip/port for task role, current ip maybe None for non-gang-allocation + if current_ip: + export("PAI_HOST_IP_{}_{}".format(name, index), current_ip) + host_list.append("{}:{}".format(current_ip, + task_ports["http"][0])) + # export ip/port for current container if (current_taskrole_name == name and current_task_index == str(index)): export("PAI_CURRENT_CONTAINER_IP", current_ip) - 
export("PAI_CURRENT_CONTAINER_PORT", get_port_base("http")) + export("PAI_CURRENT_CONTAINER_PORT", task_ports["http"][0]) export("PAI_CONTAINER_HOST_IP", current_ip) - export("PAI_CONTAINER_HOST_PORT", get_port_base("http")) - export("PAI_CONTAINER_SSH_PORT", get_port_base("ssh")) + export("PAI_CONTAINER_HOST_PORT", task_ports["http"][0]) + export("PAI_CONTAINER_SSH_PORT", task_ports["ssh"][0]) port_str = "" - for port in ports.keys(): - start, count = get_port_base(port), int( - ports[port]["count"]) - current_port_str = ",".join( - str(x) for x in range(start, start + count)) + for port in port_list.keys(): + current_port_str = ",".join(task_ports[port]) export("PAI_CONTAINER_HOST_{}_PORT_LIST".format(port), current_port_str) port_str += "{}:{};".format(port, current_port_str) diff --git a/src/init.d/port.py b/src/init.d/port.py index b57bf18..5532474 100644 --- a/src/init.d/port.py +++ b/src/init.d/port.py @@ -49,8 +49,13 @@ def check_port(portno): def check_port_list_env(port_list_env): + ports = {} for each in re.split(":|;|,", port_list_env): if each.isdigit(): + if each in ports: + LOGGER.error("Port %s has conflict.", each) + sys.exit(10) + ports[each] = True check_port(int(each)) diff --git a/test/framework.json b/test/framework.json new file mode 100644 index 0000000..f270b9e --- /dev/null +++ b/test/framework.json @@ -0,0 +1,893 @@ +{ + "apiVersion": "frameworkcontroller.microsoft.com/v1", + "kind": "Framework", + "metadata": { + "annotations": { + "config": "protocolVersion: 2\nname: test\ntype: job\njobRetryCount: 0\nprerequisites:\n - type: dockerimage\n uri: 'openpai/standard:python_3.6-pytorch_1.2.0-gpu'\n name: docker_image_0\ntaskRoles:\n taskrole:\n instances: 1\n completion:\n minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n tcp: 3\n udp: 3\n commands:\n - printenv\n taskrole_1:\n instances: 1\n completion:\n 
minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n mpi: 2\n tensorflow: 1\n commands:\n - printenv\ndefaults:\n virtualCluster: default\nextras:\n com.microsoft.pai.runtimeplugin:\n - plugin: ssh\n parameters:\n jobssh: true\n", + "jobName": "test", + "logPathInfix": "51b333b433467483e9e16fcff34ceeda", + "totalGpuNumber": "2" + }, + "creationTimestamp": "2020-05-22T03:18:43Z", + "generation": 19, + "labels": { + "userName": "test_user", + "virtualCluster": "default" + }, + "name": "51b333b433467483e9e16fcff34ceeda", + "namespace": "default", + "resourceVersion": "48376615", + "selfLink": "/apis/frameworkcontroller.microsoft.com/v1/namespaces/default/frameworks/51b333b433467483e9e16fcff34ceeda", + "uid": "f0f0e75a-9bda-11ea-830b-000d3ab25bb6" + }, + "spec": { + "description": "", + "executionType": "Start", + "retryPolicy": { + "fancyRetryPolicy": true, + "maxRetryCount": 0 + }, + "taskRoles": [ + { + "frameworkAttemptCompletionPolicy": { + "minFailedTaskCount": 1, + "minSucceededTaskCount": -1 + }, + "name": "taskrole", + "task": { + "pod": { + "metadata": { + "annotations": { + "container.apparmor.security.beta.kubernetes.io/app": "unconfined", + "rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"tcp\":{\"count\":3},\"udp\":{\"count\":3},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}" + }, + "creationTimestamp": null, + "labels": { + "type": "kube-launcher-task", + "userName": "test_user", + "virtualCluster": "default" + } + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "pai-worker", + "operator": "In", + "values": ["true"] + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": ["/usr/local/pai/runtime"], + "env": [ + { + "name": "PAI_FRAMEWORK_NAME", + "value": 
"test_user~test" + }, + { + "name": "PAI_JOB_NAME", + "value": "test_user~test" + }, + { + "name": "PAI_USER_NAME", + "value": "test_user" + }, + { + "name": "PAI_DEFAULT_FS_URI" + }, + { + "name": "PAI_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole", + "value": "1,4,8192,0" + }, + { + "name": "PAI_MIN_FAILED_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole", + "value": "-1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole_1", + "value": "1,4,8192,0" + }, + { + "name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1", + "value": "-1" + }, + { + "name": "PAI_USERNAME", + "value": "test_user" + }, + { + "name": "PAI_TASKS_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLES_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_NAME", + "value": "taskrole" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.annotations['FC_TASK_INDEX']" + } + } + }, + { + "name": "PAI_TASK_INDEX", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.annotations['FC_TASK_INDEX']" + } + } + } + ], + "image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu", + "imagePullPolicy": "Always", + "name": "app", + "resources": { + "limits": { + "cpu": "4", + "github.com/fuse": "1", + "memory": "8Gi", + "nvidia.com/gpu": "1" + } + }, + "securityContext": { + "capabilities": { + "add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"], + "drop": ["MKNOD"] + } + }, + 
"terminationMessagePath": "/tmp/pai-termination-log", + "volumeMounts": [ + { + "mountPath": "/dev/shm", + "name": "dshm" + }, + { + "mountPath": "/usr/local/pai", + "name": "pai-vol" + }, + { + "mountPath": "/usr/local/pai/logs", + "name": "host-log", + "subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole" + }, + { + "mountPath": "/usr/local/pai/ssh-secret", + "name": "job-ssh-secret-volume", + "readOnly": true + } + ] + } + ], + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "pai-secret" + } + ], + "initContainers": [ + { + "env": [ + { + "name": "USER_CMD", + "value": "printenv" + }, + { + "name": "KUBE_APISERVER_ADDRESS", + "value": "http://10.151.40.4:8080" + }, + { + "name": "GANG_ALLOCATION", + "value": "true" + }, + { + "name": "PAI_FRAMEWORK_NAME", + "value": "test_user~test" + }, + { + "name": "PAI_JOB_NAME", + "value": "test_user~test" + }, + { + "name": "PAI_USER_NAME", + "value": "test_user" + }, + { + "name": "PAI_DEFAULT_FS_URI" + }, + { + "name": "PAI_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole", + "value": "1,4,8192,0" + }, + { + "name": "PAI_MIN_FAILED_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole", + "value": "-1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole_1", + "value": "1,4,8192,0" + }, + { + "name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1", + "value": "-1" + }, + { + "name": "PAI_USERNAME", + "value": "test_user" + }, + { + "name": "PAI_TASKS_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLES_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": 
"PAI_JOB_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_NAME", + "value": "taskrole" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.annotations['FC_TASK_INDEX']" + } + } + } + ], + "image": "openpai/openpai-runtime:test_user", + "imagePullPolicy": "Always", + "name": "init", + "resources": {}, + "volumeMounts": [ + { + "mountPath": "/usr/local/pai", + "name": "pai-vol" + }, + { + "mountPath": "/usr/local/pai/logs", + "name": "host-log", + "subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole" + }, + { + "mountPath": "/usr/local/pai-config", + "name": "job-exit-spec" + } + ] + } + ], + "priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority", + "restartPolicy": "Never", + "serviceAccountName": "runtime-account", + "volumes": [ + { + "emptyDir": { + "medium": "Memory", + "sizeLimit": "512Mi" + }, + "name": "dshm" + }, + { + "emptyDir": {}, + "name": "pai-vol" + }, + { + "hostPath": { + "path": "/var/log/pai" + }, + "name": "host-log" + }, + { + "name": "job-ssh-secret-volume", + "secret": { + "secretName": "job-ssh-secret" + } + }, + { + "configMap": { + "name": "runtime-exit-spec-configuration" + }, + "name": "job-exit-spec" + } + ] + } + }, + "podGracefulDeletionTimeoutSec": 1800, + "retryPolicy": { + "fancyRetryPolicy": false, + "maxRetryCount": 0 + } + }, + "taskNumber": 1 + }, + { + "frameworkAttemptCompletionPolicy": { + "minFailedTaskCount": 1, + "minSucceededTaskCount": -1 + }, + "name": "taskrole1", + "task": { + "pod": { + "metadata": { + "annotations": { + "container.apparmor.security.beta.kubernetes.io/app": "unconfined", + "rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"mpi\":{\"count\":2},\"tensorflow\":{\"count\":1},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}" + }, + "creationTimestamp": null, + "labels": { + "type": "kube-launcher-task", + "userName": 
"test_user", + "virtualCluster": "default" + } + }, + "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "pai-worker", + "operator": "In", + "values": ["true"] + } + ] + } + ] + } + } + }, + "containers": [ + { + "command": ["/usr/local/pai/runtime"], + "env": [ + { + "name": "PAI_FRAMEWORK_NAME", + "value": "test_user~test" + }, + { + "name": "PAI_JOB_NAME", + "value": "test_user~test" + }, + { + "name": "PAI_USER_NAME", + "value": "test_user" + }, + { + "name": "PAI_DEFAULT_FS_URI" + }, + { + "name": "PAI_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole", + "value": "1,4,8192,0" + }, + { + "name": "PAI_MIN_FAILED_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole", + "value": "-1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole_1", + "value": "1,4,8192,0" + }, + { + "name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1", + "value": "-1" + }, + { + "name": "PAI_USERNAME", + "value": "test_user" + }, + { + "name": "PAI_TASKS_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLES_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_NAME", + "value": "taskrole_1" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.annotations['FC_TASK_INDEX']" + } + } + }, + { + "name": "PAI_TASK_INDEX", + "valueFrom": { + "fieldRef": { + "fieldPath": 
"metadata.annotations['FC_TASK_INDEX']" + } + } + } + ], + "image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu", + "imagePullPolicy": "Always", + "name": "app", + "resources": { + "limits": { + "cpu": "4", + "github.com/fuse": "1", + "memory": "8Gi", + "nvidia.com/gpu": "1" + } + }, + "securityContext": { + "capabilities": { + "add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"], + "drop": ["MKNOD"] + } + }, + "terminationMessagePath": "/tmp/pai-termination-log", + "volumeMounts": [ + { + "mountPath": "/dev/shm", + "name": "dshm" + }, + { + "mountPath": "/usr/local/pai", + "name": "pai-vol" + }, + { + "mountPath": "/usr/local/pai/logs", + "name": "host-log", + "subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1" + }, + { + "mountPath": "/usr/local/pai/ssh-secret", + "name": "job-ssh-secret-volume", + "readOnly": true + } + ] + } + ], + "hostNetwork": true, + "imagePullSecrets": [ + { + "name": "pai-secret" + } + ], + "initContainers": [ + { + "env": [ + { + "name": "USER_CMD", + "value": "printenv" + }, + { + "name": "KUBE_APISERVER_ADDRESS", + "value": "http://10.151.40.4:8080" + }, + { + "name": "GANG_ALLOCATION", + "value": "true" + }, + { + "name": "PAI_FRAMEWORK_NAME", + "value": "test_user~test" + }, + { + "name": "PAI_JOB_NAME", + "value": "test_user~test" + }, + { + "name": "PAI_USER_NAME", + "value": "test_user" + }, + { + "name": "PAI_DEFAULT_FS_URI" + }, + { + "name": "PAI_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole", + "value": "1,4,8192,0" + }, + { + "name": "PAI_MIN_FAILED_TASK_COUNT_taskrole", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole", + "value": "-1" + }, + { + "name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_RESOURCE_taskrole_1", + "value": "1,4,8192,0" + }, + { + "name": 
"PAI_MIN_FAILED_TASK_COUNT_taskrole_1", + "value": "1" + }, + { + "name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1", + "value": "-1" + }, + { + "name": "PAI_USERNAME", + "value": "test_user" + }, + { + "name": "PAI_TASKS_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_COUNT", + "value": "2" + }, + { + "name": "PAI_TASK_ROLES_NUM", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_ROLE_COUNT", + "value": "2" + }, + { + "name": "PAI_JOB_TASK_ROLE_LIST", + "value": "taskrole,taskrole_1" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_NAME", + "value": "taskrole_1" + }, + { + "name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX", + "valueFrom": { + "fieldRef": { + "fieldPath": "metadata.annotations['FC_TASK_INDEX']" + } + } + } + ], + "image": "openpai/openpai-runtime:test_user", + "imagePullPolicy": "Always", + "name": "init", + "resources": {}, + "volumeMounts": [ + { + "mountPath": "/usr/local/pai", + "name": "pai-vol" + }, + { + "mountPath": "/usr/local/pai/logs", + "name": "host-log", + "subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1" + }, + { + "mountPath": "/usr/local/pai-config", + "name": "job-exit-spec" + } + ] + } + ], + "priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority", + "restartPolicy": "Never", + "serviceAccountName": "runtime-account", + "volumes": [ + { + "emptyDir": { + "medium": "Memory", + "sizeLimit": "512Mi" + }, + "name": "dshm" + }, + { + "emptyDir": {}, + "name": "pai-vol" + }, + { + "hostPath": { + "path": "/var/log/pai" + }, + "name": "host-log" + }, + { + "name": "job-ssh-secret-volume", + "secret": { + "secretName": "job-ssh-secret" + } + }, + { + "configMap": { + "name": "runtime-exit-spec-configuration" + }, + "name": "job-exit-spec" + } + ] + } + }, + "podGracefulDeletionTimeoutSec": 1800, + "retryPolicy": { + "fancyRetryPolicy": false, + "maxRetryCount": 0 + } + }, + "taskNumber": 1 + } + ] + }, + "status": { + "attemptStatus": { + "completionStatus": { + "code": 0, + "diagnostics": "Pod succeeded", + 
"phrase": "Succeeded", + "trigger": { + "message": "All Tasks are completed and no user specified conditions in FrameworkAttemptCompletionPolicy have ever been triggered: TotalTaskCount: 2, FailedTaskCount: 0", + "taskIndex": 0, + "taskRoleName": "taskrole1" + }, + "type": { + "attributes": [], + "name": "Succeeded" + } + }, + "completionTime": "2020-05-22T03:19:33Z", + "configMapName": "51b333b433467483e9e16fcff34ceeda-attempt", + "configMapUID": "f0f47860-9bda-11ea-830b-000d3ab25bb6", + "id": 0, + "instanceUID": "0_f0f47860-9bda-11ea-830b-000d3ab25bb6", + "runTime": "2020-05-22T03:19:26Z", + "startTime": "2020-05-22T03:18:43Z", + "taskRoleStatuses": [ + { + "name": "taskrole", + "taskStatuses": [ + { + "attemptStatus": { + "completionStatus": { + "code": 0, + "diagnostics": "Pod succeeded", + "phrase": "Succeeded", + "pod": { + "containers": [ + { + "code": 0, + "name": "init", + "reason": "Completed" + }, + { + "code": 0, + "name": "app", + "reason": "Completed" + } + ] + }, + "type": { + "attributes": [], + "name": "Succeeded" + } + }, + "completionTime": "2020-05-22T03:19:32Z", + "id": 0, + "instanceUID": "0_f1020521-9bda-11ea-830b-000d3ab25bb6", + "podHostIP": "10.151.41.8", + "podIP": "10.151.41.8", + "podName": "51b333b433467483e9e16fcff34ceeda-taskrole-0", + "podNodeName": "10.151.41.8", + "podUID": "f1020521-9bda-11ea-830b-000d3ab25bb6", + "runTime": "2020-05-22T03:19:26Z", + "startTime": "2020-05-22T03:18:43Z" + }, + "completionTime": "2020-05-22T03:19:32Z", + "index": 0, + "retryPolicyStatus": { + "accountableRetriedCount": 0, + "retryDelaySec": null, + "totalRetriedCount": 0 + }, + "startTime": "2020-05-22T03:18:43Z", + "state": "Completed", + "transitionTime": "2020-05-22T03:19:32Z" + } + ] + }, + { + "name": "taskrole1", + "taskStatuses": [ + { + "attemptStatus": { + "completionStatus": { + "code": 0, + "diagnostics": "Pod succeeded", + "phrase": "Succeeded", + "pod": { + "containers": [ + { + "code": 0, + "name": "init", + "reason": "Completed" + }, 
+ { + "code": 0, + "name": "app", + "reason": "Completed" + } + ] + }, + "type": { + "attributes": [], + "name": "Succeeded" + } + }, + "completionTime": "2020-05-22T03:19:32Z", + "id": 0, + "instanceUID": "0_f102a4a7-9bda-11ea-830b-000d3ab25bb6", + "podHostIP": "10.151.41.9", + "podIP": "10.151.41.9", + "podName": "51b333b433467483e9e16fcff34ceeda-taskrole1-0", + "podNodeName": "10.151.41.9", + "podUID": "f102a4a7-9bda-11ea-830b-000d3ab25bb6", + "runTime": "2020-05-22T03:19:27Z", + "startTime": "2020-05-22T03:18:43Z" + }, + "completionTime": "2020-05-22T03:19:32Z", + "index": 0, + "retryPolicyStatus": { + "accountableRetriedCount": 0, + "retryDelaySec": null, + "totalRetriedCount": 0 + }, + "startTime": "2020-05-22T03:18:43Z", + "state": "Completed", + "transitionTime": "2020-05-22T03:19:32Z" + } + ] + } + ] + }, + "completionTime": "2020-05-22T03:19:33Z", + "retryPolicyStatus": { + "accountableRetriedCount": 0, + "retryDelaySec": null, + "totalRetriedCount": 0 + }, + "startTime": "2020-05-22T03:18:43Z", + "state": "Completed", + "transitionTime": "2020-05-22T03:19:33Z" + } +} diff --git a/test/test_framework_parser.py b/test/test_framework_parser.py new file mode 100644 index 0000000..4c6e73a --- /dev/null +++ b/test/test_framework_parser.py @@ -0,0 +1,74 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import json +from io import StringIO +import os +import sys +import unittest + +# pylint: disable=wrong-import-position +sys.path.append( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src")) +sys.path.append( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src/init.d")) +from framework_parser import generate_runtime_env +from common.utils import init_logger +# pylint: enable=wrong-import-position + +PACKAGE_DIRECTORY_COM = os.path.dirname(os.path.abspath(__file__)) +init_logger() + + +class TestParser(unittest.TestCase): + def setUp(self): + try: + os.chdir(PACKAGE_DIRECTORY_COM) + except Exception: #pylint: disable=broad-except + pass + + def test_generate_runtime_env(self): + os.environ["FC_TASK_INDEX"] = "0" + os.environ["FC_TASKROLE_NAME"] = "taskrole" + test_file = "framework.json" + expect_lines = [ + "export PAI_PORT_LIST_taskrole_0_tcp='29877,22353,29076'", + "export PAI_CONTAINER_HOST_PORT_LIST='tcp:29877,22353,29076;udp:31903,33486,35953;ssh:39080;http:30643;'", + "export PAI_taskrole1_0_mpi_PORT='20966,21891'", + "export PAI_CONTAINER_HOST_http_PORT_LIST='30643'", + "export PAI_PORT_LIST_taskrole_0_udp='31903,33486,35953'", + "export PAI_CONTAINER_SSH_PORT='39080'" + ] + with open(test_file, "r") as f: + framework = json.load(f) + + sys.stdout = temp_stdout = StringIO() + + generate_runtime_env(framework) + runtime_env = temp_stdout.getvalue().splitlines() + + sys.stdout = sys.__stdout__ + + for expect in expect_lines: + self.assertIn(expect, 
runtime_env) + + del os.environ["FC_TASK_INDEX"] + del os.environ["FC_TASKROLE_NAME"] + + +if __name__ == '__main__': + unittest.main()