Use hash func to generate port number (#11)

Refer to issue: microsoft/pai#4384
Using hash function:

(int(md5(podUid + taskPortName + taskPortIndex)[0:12] ,16) +
 int(md5(podUid + taskPortName + taskPortIndex)[12:24] ,16) +
 int(md5(podUid + taskPortName + taskPortIndex)[24:32] ,16)) % 
 (globalPortEnd - globalPortStart) + globalPortStart
to generate the port number.

If a port conflict happens, the task will fail. The retried task will have a different podUid, so the new task will be given different port numbers.
This commit is contained in:
Binyang2014 2020-05-22 11:35:48 +08:00 коммит произвёл GitHub
Родитель 0fed34d82e
Коммит 388bd4f945
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 1029 добавлений и 22 удалений

0
src/__init__.py Normal file
Просмотреть файл

Просмотреть файл

@ -153,12 +153,12 @@ cp ${PAI_CONFIG_DIR}/runtime-exit-spec.yaml ${PAI_RUNTIME_DIR}
# generate runtime env variables # generate runtime env variables
# priority=10 # priority=10
CHILD_PROCESS="ENV_GENERATOR" CHILD_PROCESS="ENV_GENERATOR"
python ${PAI_INIT_DIR}/parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh python ${PAI_INIT_DIR}/framework_parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh
# generate jobconfig # generate jobconfig
# priority=11 # priority=11
CHILD_PROCESS="CONFIG_GENERATOR" CHILD_PROCESS="CONFIG_GENERATOR"
python ${PAI_INIT_DIR}/parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml python ${PAI_INIT_DIR}/framework_parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml
# Init plugins # Init plugins
# priority=12 # priority=12

Просмотреть файл

@ -18,6 +18,7 @@
import argparse import argparse
import base64 import base64
import hashlib
import logging import logging
import gzip import gzip
import json import json
@ -42,6 +43,31 @@ def decompress_field(field):
return obj return obj
def generate_seq_ports_num(port_start, port_count, task_index):
    """Return the sequential port numbers reserved for one task.

    Each task gets a contiguous window of ``port_count`` ports; the window of
    task ``task_index`` starts at ``port_start + port_count * task_index``.

    Args:
        port_start: First port of the whole range (int or numeric string).
        port_count: Number of ports per task (int or numeric string).
        task_index: Zero-based index of the task (int or numeric string).

    Returns:
        A list of ``port_count`` port numbers, each rendered as a string.

    Raises:
        ValueError: If an argument cannot be converted to an integer.
    """
    # The code this helper replaces coerced every operand with int(); keep
    # doing so, so specs that carry numbers as strings do not fail with a
    # TypeError on the arithmetic below.
    count = int(port_count)
    base = int(port_start) + count * int(task_index)
    return [str(port_num) for port_num in range(base, base + count)]
def generate_hashed_ports_num(pod_uid, port_name, port_count, port_start,
                              port_end):
    """Derive ``port_count`` port numbers from an MD5 hash of the pod identity.

    For every index ``i`` in ``range(port_count)`` the string
    ``"[pod_uid][port_name][i]"`` is hashed with MD5. The 32-character hex
    digest is cut into three pieces (``[0:12]``, ``[12:24]``, ``[24:32]``),
    the pieces are parsed as hexadecimal integers and added together, and the
    sum is mapped into the port range with
    ``sum % (port_end - port_start) + port_start``.

    Returns:
        A list of port numbers rendered as strings, one per index.
    """

    def _hashed_port(slot):
        # Key layout must stay exactly "[uid][name][index]" so results are
        # reproducible for a given pod.
        key = "[{}][{}][{}]".format(pod_uid, port_name, str(slot))
        digest = hashlib.md5(key.encode("utf8")).hexdigest()
        pieces = (digest[:12], digest[12:24], digest[24:])
        folded = sum(int(piece, 16) for piece in pieces)
        return str(folded % (port_end - port_start) + port_start)

    return [_hashed_port(slot) for slot in range(port_count)]
def generate_runtime_env(framework): #pylint: disable=too-many-locals def generate_runtime_env(framework): #pylint: disable=too-many-locals
"""Generate runtime env variables for tasks. """Generate runtime env variables for tasks.
@ -96,41 +122,50 @@ def generate_runtime_env(framework): #pylint: disable=too-many-locals
for task in taskrole["taskStatuses"]: for task in taskrole["taskStatuses"]:
index = task["index"] index = task["index"]
current_ip = task["attemptStatus"]["podHostIP"] current_ip = task["attemptStatus"]["podHostIP"]
pod_uid = task["attemptStatus"]["podUID"]
task_ports = {}
taskrole_instances.append("{}:{}".format(name, index)) taskrole_instances.append("{}:{}".format(name, index))
get_port_base = lambda port_name, p=ports, i=index: int(p[ use_port_hash = True
port_name]["start"]) + int(p[port_name]["count"]) * int(i) if "ports" in ports and "schedulePortStart" in ports and "schedulePortEnd" in ports:
port_start = ports["schedulePortStart"]
port_end = ports["schedulePortEnd"]
port_list = ports["ports"]
else:
# for backward compatibility
use_port_hash = False
port_list = ports
# export ip/port for task role, current ip maybe None for non-gang-allocation for port in port_list.keys():
if current_ip: count = int(port_list[port]["count"])
export("PAI_HOST_IP_{}_{}".format(name, index), current_ip) task_ports[port] = generate_hashed_ports_num(
host_list.append("{}:{}".format(current_ip, pod_uid, port, count, port_start,
get_port_base("http"))) port_end) if use_port_hash else generate_seq_ports_num(
port_list[port]["start"], count, index)
for port in ports.keys(): current_port_str = ",".join(task_ports[port])
start, count = get_port_base(port), int(ports[port]["count"])
current_port_str = ",".join(
str(x) for x in range(start, start + count))
export("PAI_PORT_LIST_{}_{}_{}".format(name, index, port), export("PAI_PORT_LIST_{}_{}_{}".format(name, index, port),
current_port_str) current_port_str)
export("PAI_{}_{}_{}_PORT".format(name, index, port), export("PAI_{}_{}_{}_PORT".format(name, index, port),
current_port_str) current_port_str)
# export ip/port for task role, current ip maybe None for non-gang-allocation
if current_ip:
export("PAI_HOST_IP_{}_{}".format(name, index), current_ip)
host_list.append("{}:{}".format(current_ip,
task_ports["http"][0]))
# export ip/port for current container # export ip/port for current container
if (current_taskrole_name == name if (current_taskrole_name == name
and current_task_index == str(index)): and current_task_index == str(index)):
export("PAI_CURRENT_CONTAINER_IP", current_ip) export("PAI_CURRENT_CONTAINER_IP", current_ip)
export("PAI_CURRENT_CONTAINER_PORT", get_port_base("http")) export("PAI_CURRENT_CONTAINER_PORT", task_ports["http"][0])
export("PAI_CONTAINER_HOST_IP", current_ip) export("PAI_CONTAINER_HOST_IP", current_ip)
export("PAI_CONTAINER_HOST_PORT", get_port_base("http")) export("PAI_CONTAINER_HOST_PORT", task_ports["http"][0])
export("PAI_CONTAINER_SSH_PORT", get_port_base("ssh")) export("PAI_CONTAINER_SSH_PORT", task_ports["ssh"][0])
port_str = "" port_str = ""
for port in ports.keys(): for port in port_list.keys():
start, count = get_port_base(port), int( current_port_str = ",".join(task_ports[port])
ports[port]["count"])
current_port_str = ",".join(
str(x) for x in range(start, start + count))
export("PAI_CONTAINER_HOST_{}_PORT_LIST".format(port), export("PAI_CONTAINER_HOST_{}_PORT_LIST".format(port),
current_port_str) current_port_str)
port_str += "{}:{};".format(port, current_port_str) port_str += "{}:{};".format(port, current_port_str)

Просмотреть файл

@ -49,8 +49,13 @@ def check_port(portno):
def check_port_list_env(port_list_env): def check_port_list_env(port_list_env):
ports = {}
for each in re.split(":|;|,", port_list_env): for each in re.split(":|;|,", port_list_env):
if each.isdigit(): if each.isdigit():
if each in ports:
LOGGER.error("Port %s has conflict.", each)
sys.exit(10)
ports[each] = True
check_port(int(each)) check_port(int(each))

893
test/framework.json Normal file
Просмотреть файл

@ -0,0 +1,893 @@
{
"apiVersion": "frameworkcontroller.microsoft.com/v1",
"kind": "Framework",
"metadata": {
"annotations": {
"config": "protocolVersion: 2\nname: test\ntype: job\njobRetryCount: 0\nprerequisites:\n - type: dockerimage\n uri: 'openpai/standard:python_3.6-pytorch_1.2.0-gpu'\n name: docker_image_0\ntaskRoles:\n taskrole:\n instances: 1\n completion:\n minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n tcp: 3\n udp: 3\n commands:\n - printenv\n taskrole_1:\n instances: 1\n completion:\n minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n mpi: 2\n tensorflow: 1\n commands:\n - printenv\ndefaults:\n virtualCluster: default\nextras:\n com.microsoft.pai.runtimeplugin:\n - plugin: ssh\n parameters:\n jobssh: true\n",
"jobName": "test",
"logPathInfix": "51b333b433467483e9e16fcff34ceeda",
"totalGpuNumber": "2"
},
"creationTimestamp": "2020-05-22T03:18:43Z",
"generation": 19,
"labels": {
"userName": "test_user",
"virtualCluster": "default"
},
"name": "51b333b433467483e9e16fcff34ceeda",
"namespace": "default",
"resourceVersion": "48376615",
"selfLink": "/apis/frameworkcontroller.microsoft.com/v1/namespaces/default/frameworks/51b333b433467483e9e16fcff34ceeda",
"uid": "f0f0e75a-9bda-11ea-830b-000d3ab25bb6"
},
"spec": {
"description": "",
"executionType": "Start",
"retryPolicy": {
"fancyRetryPolicy": true,
"maxRetryCount": 0
},
"taskRoles": [
{
"frameworkAttemptCompletionPolicy": {
"minFailedTaskCount": 1,
"minSucceededTaskCount": -1
},
"name": "taskrole",
"task": {
"pod": {
"metadata": {
"annotations": {
"container.apparmor.security.beta.kubernetes.io/app": "unconfined",
"rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"tcp\":{\"count\":3},\"udp\":{\"count\":3},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}"
},
"creationTimestamp": null,
"labels": {
"type": "kube-launcher-task",
"userName": "test_user",
"virtualCluster": "default"
}
},
"spec": {
"affinity": {
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "pai-worker",
"operator": "In",
"values": ["true"]
}
]
}
]
}
}
},
"containers": [
{
"command": ["/usr/local/pai/runtime"],
"env": [
{
"name": "PAI_FRAMEWORK_NAME",
"value": "test_user~test"
},
{
"name": "PAI_JOB_NAME",
"value": "test_user~test"
},
{
"name": "PAI_USER_NAME",
"value": "test_user"
},
{
"name": "PAI_DEFAULT_FS_URI"
},
{
"name": "PAI_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
"value": "-1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole_1",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
"value": "-1"
},
{
"name": "PAI_USERNAME",
"value": "test_user"
},
{
"name": "PAI_TASKS_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLES_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_CURRENT_TASK_ROLE_NAME",
"value": "taskrole"
},
{
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
}
}
},
{
"name": "PAI_TASK_INDEX",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
}
}
}
],
"image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu",
"imagePullPolicy": "Always",
"name": "app",
"resources": {
"limits": {
"cpu": "4",
"github.com/fuse": "1",
"memory": "8Gi",
"nvidia.com/gpu": "1"
}
},
"securityContext": {
"capabilities": {
"add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"],
"drop": ["MKNOD"]
}
},
"terminationMessagePath": "/tmp/pai-termination-log",
"volumeMounts": [
{
"mountPath": "/dev/shm",
"name": "dshm"
},
{
"mountPath": "/usr/local/pai",
"name": "pai-vol"
},
{
"mountPath": "/usr/local/pai/logs",
"name": "host-log",
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole"
},
{
"mountPath": "/usr/local/pai/ssh-secret",
"name": "job-ssh-secret-volume",
"readOnly": true
}
]
}
],
"hostNetwork": true,
"imagePullSecrets": [
{
"name": "pai-secret"
}
],
"initContainers": [
{
"env": [
{
"name": "USER_CMD",
"value": "printenv"
},
{
"name": "KUBE_APISERVER_ADDRESS",
"value": "http://10.151.40.4:8080"
},
{
"name": "GANG_ALLOCATION",
"value": "true"
},
{
"name": "PAI_FRAMEWORK_NAME",
"value": "test_user~test"
},
{
"name": "PAI_JOB_NAME",
"value": "test_user~test"
},
{
"name": "PAI_USER_NAME",
"value": "test_user"
},
{
"name": "PAI_DEFAULT_FS_URI"
},
{
"name": "PAI_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
"value": "-1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole_1",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
"value": "-1"
},
{
"name": "PAI_USERNAME",
"value": "test_user"
},
{
"name": "PAI_TASKS_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLES_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_CURRENT_TASK_ROLE_NAME",
"value": "taskrole"
},
{
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
}
}
}
],
"image": "openpai/openpai-runtime:test_user",
"imagePullPolicy": "Always",
"name": "init",
"resources": {},
"volumeMounts": [
{
"mountPath": "/usr/local/pai",
"name": "pai-vol"
},
{
"mountPath": "/usr/local/pai/logs",
"name": "host-log",
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole"
},
{
"mountPath": "/usr/local/pai-config",
"name": "job-exit-spec"
}
]
}
],
"priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority",
"restartPolicy": "Never",
"serviceAccountName": "runtime-account",
"volumes": [
{
"emptyDir": {
"medium": "Memory",
"sizeLimit": "512Mi"
},
"name": "dshm"
},
{
"emptyDir": {},
"name": "pai-vol"
},
{
"hostPath": {
"path": "/var/log/pai"
},
"name": "host-log"
},
{
"name": "job-ssh-secret-volume",
"secret": {
"secretName": "job-ssh-secret"
}
},
{
"configMap": {
"name": "runtime-exit-spec-configuration"
},
"name": "job-exit-spec"
}
]
}
},
"podGracefulDeletionTimeoutSec": 1800,
"retryPolicy": {
"fancyRetryPolicy": false,
"maxRetryCount": 0
}
},
"taskNumber": 1
},
{
"frameworkAttemptCompletionPolicy": {
"minFailedTaskCount": 1,
"minSucceededTaskCount": -1
},
"name": "taskrole1",
"task": {
"pod": {
"metadata": {
"annotations": {
"container.apparmor.security.beta.kubernetes.io/app": "unconfined",
"rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"mpi\":{\"count\":2},\"tensorflow\":{\"count\":1},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}"
},
"creationTimestamp": null,
"labels": {
"type": "kube-launcher-task",
"userName": "test_user",
"virtualCluster": "default"
}
},
"spec": {
"affinity": {
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "pai-worker",
"operator": "In",
"values": ["true"]
}
]
}
]
}
}
},
"containers": [
{
"command": ["/usr/local/pai/runtime"],
"env": [
{
"name": "PAI_FRAMEWORK_NAME",
"value": "test_user~test"
},
{
"name": "PAI_JOB_NAME",
"value": "test_user~test"
},
{
"name": "PAI_USER_NAME",
"value": "test_user"
},
{
"name": "PAI_DEFAULT_FS_URI"
},
{
"name": "PAI_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
"value": "-1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole_1",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
"value": "-1"
},
{
"name": "PAI_USERNAME",
"value": "test_user"
},
{
"name": "PAI_TASKS_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLES_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_CURRENT_TASK_ROLE_NAME",
"value": "taskrole_1"
},
{
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
}
}
},
{
"name": "PAI_TASK_INDEX",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
}
}
}
],
"image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu",
"imagePullPolicy": "Always",
"name": "app",
"resources": {
"limits": {
"cpu": "4",
"github.com/fuse": "1",
"memory": "8Gi",
"nvidia.com/gpu": "1"
}
},
"securityContext": {
"capabilities": {
"add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"],
"drop": ["MKNOD"]
}
},
"terminationMessagePath": "/tmp/pai-termination-log",
"volumeMounts": [
{
"mountPath": "/dev/shm",
"name": "dshm"
},
{
"mountPath": "/usr/local/pai",
"name": "pai-vol"
},
{
"mountPath": "/usr/local/pai/logs",
"name": "host-log",
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1"
},
{
"mountPath": "/usr/local/pai/ssh-secret",
"name": "job-ssh-secret-volume",
"readOnly": true
}
]
}
],
"hostNetwork": true,
"imagePullSecrets": [
{
"name": "pai-secret"
}
],
"initContainers": [
{
"env": [
{
"name": "USER_CMD",
"value": "printenv"
},
{
"name": "KUBE_APISERVER_ADDRESS",
"value": "http://10.151.40.4:8080"
},
{
"name": "GANG_ALLOCATION",
"value": "true"
},
{
"name": "PAI_FRAMEWORK_NAME",
"value": "test_user~test"
},
{
"name": "PAI_JOB_NAME",
"value": "test_user~test"
},
{
"name": "PAI_USER_NAME",
"value": "test_user"
},
{
"name": "PAI_DEFAULT_FS_URI"
},
{
"name": "PAI_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
"value": "-1"
},
{
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_RESOURCE_taskrole_1",
"value": "1,4,8192,0"
},
{
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
"value": "1"
},
{
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
"value": "-1"
},
{
"name": "PAI_USERNAME",
"value": "test_user"
},
{
"name": "PAI_TASKS_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_COUNT",
"value": "2"
},
{
"name": "PAI_TASK_ROLES_NUM",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_COUNT",
"value": "2"
},
{
"name": "PAI_JOB_TASK_ROLE_LIST",
"value": "taskrole,taskrole_1"
},
{
"name": "PAI_CURRENT_TASK_ROLE_NAME",
"value": "taskrole_1"
},
{
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
}
}
}
],
"image": "openpai/openpai-runtime:test_user",
"imagePullPolicy": "Always",
"name": "init",
"resources": {},
"volumeMounts": [
{
"mountPath": "/usr/local/pai",
"name": "pai-vol"
},
{
"mountPath": "/usr/local/pai/logs",
"name": "host-log",
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1"
},
{
"mountPath": "/usr/local/pai-config",
"name": "job-exit-spec"
}
]
}
],
"priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority",
"restartPolicy": "Never",
"serviceAccountName": "runtime-account",
"volumes": [
{
"emptyDir": {
"medium": "Memory",
"sizeLimit": "512Mi"
},
"name": "dshm"
},
{
"emptyDir": {},
"name": "pai-vol"
},
{
"hostPath": {
"path": "/var/log/pai"
},
"name": "host-log"
},
{
"name": "job-ssh-secret-volume",
"secret": {
"secretName": "job-ssh-secret"
}
},
{
"configMap": {
"name": "runtime-exit-spec-configuration"
},
"name": "job-exit-spec"
}
]
}
},
"podGracefulDeletionTimeoutSec": 1800,
"retryPolicy": {
"fancyRetryPolicy": false,
"maxRetryCount": 0
}
},
"taskNumber": 1
}
]
},
"status": {
"attemptStatus": {
"completionStatus": {
"code": 0,
"diagnostics": "Pod succeeded",
"phrase": "Succeeded",
"trigger": {
"message": "All Tasks are completed and no user specified conditions in FrameworkAttemptCompletionPolicy have ever been triggered: TotalTaskCount: 2, FailedTaskCount: 0",
"taskIndex": 0,
"taskRoleName": "taskrole1"
},
"type": {
"attributes": [],
"name": "Succeeded"
}
},
"completionTime": "2020-05-22T03:19:33Z",
"configMapName": "51b333b433467483e9e16fcff34ceeda-attempt",
"configMapUID": "f0f47860-9bda-11ea-830b-000d3ab25bb6",
"id": 0,
"instanceUID": "0_f0f47860-9bda-11ea-830b-000d3ab25bb6",
"runTime": "2020-05-22T03:19:26Z",
"startTime": "2020-05-22T03:18:43Z",
"taskRoleStatuses": [
{
"name": "taskrole",
"taskStatuses": [
{
"attemptStatus": {
"completionStatus": {
"code": 0,
"diagnostics": "Pod succeeded",
"phrase": "Succeeded",
"pod": {
"containers": [
{
"code": 0,
"name": "init",
"reason": "Completed"
},
{
"code": 0,
"name": "app",
"reason": "Completed"
}
]
},
"type": {
"attributes": [],
"name": "Succeeded"
}
},
"completionTime": "2020-05-22T03:19:32Z",
"id": 0,
"instanceUID": "0_f1020521-9bda-11ea-830b-000d3ab25bb6",
"podHostIP": "10.151.41.8",
"podIP": "10.151.41.8",
"podName": "51b333b433467483e9e16fcff34ceeda-taskrole-0",
"podNodeName": "10.151.41.8",
"podUID": "f1020521-9bda-11ea-830b-000d3ab25bb6",
"runTime": "2020-05-22T03:19:26Z",
"startTime": "2020-05-22T03:18:43Z"
},
"completionTime": "2020-05-22T03:19:32Z",
"index": 0,
"retryPolicyStatus": {
"accountableRetriedCount": 0,
"retryDelaySec": null,
"totalRetriedCount": 0
},
"startTime": "2020-05-22T03:18:43Z",
"state": "Completed",
"transitionTime": "2020-05-22T03:19:32Z"
}
]
},
{
"name": "taskrole1",
"taskStatuses": [
{
"attemptStatus": {
"completionStatus": {
"code": 0,
"diagnostics": "Pod succeeded",
"phrase": "Succeeded",
"pod": {
"containers": [
{
"code": 0,
"name": "init",
"reason": "Completed"
},
{
"code": 0,
"name": "app",
"reason": "Completed"
}
]
},
"type": {
"attributes": [],
"name": "Succeeded"
}
},
"completionTime": "2020-05-22T03:19:32Z",
"id": 0,
"instanceUID": "0_f102a4a7-9bda-11ea-830b-000d3ab25bb6",
"podHostIP": "10.151.41.9",
"podIP": "10.151.41.9",
"podName": "51b333b433467483e9e16fcff34ceeda-taskrole1-0",
"podNodeName": "10.151.41.9",
"podUID": "f102a4a7-9bda-11ea-830b-000d3ab25bb6",
"runTime": "2020-05-22T03:19:27Z",
"startTime": "2020-05-22T03:18:43Z"
},
"completionTime": "2020-05-22T03:19:32Z",
"index": 0,
"retryPolicyStatus": {
"accountableRetriedCount": 0,
"retryDelaySec": null,
"totalRetriedCount": 0
},
"startTime": "2020-05-22T03:18:43Z",
"state": "Completed",
"transitionTime": "2020-05-22T03:19:32Z"
}
]
}
]
},
"completionTime": "2020-05-22T03:19:33Z",
"retryPolicyStatus": {
"accountableRetriedCount": 0,
"retryDelaySec": null,
"totalRetriedCount": 0
},
"startTime": "2020-05-22T03:18:43Z",
"state": "Completed",
"transitionTime": "2020-05-22T03:19:33Z"
}
}

Просмотреть файл

@ -0,0 +1,74 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import json
from io import StringIO
import os
import sys
import unittest
# pylint: disable=wrong-import-position
# Extend the import path with the "../src" and "../src/init.d" directories
# (relative to this file) before the project imports below.
sys.path.append(
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src"))
sys.path.append(
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src/init.d"))
from framework_parser import generate_runtime_env
from common.utils import init_logger
# pylint: enable=wrong-import-position
# Absolute path of the directory that contains this test module.
PACKAGE_DIRECTORY_COM = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): presumably configures logging for the modules under test —
# confirm against common.utils.init_logger.
init_logger()
class TestParser(unittest.TestCase):
    """Checks the environment exports produced by ``generate_runtime_env``."""

    def setUp(self):
        # Best effort: run from the test directory. A failure here is not
        # fatal because the fixture below is opened through an absolute path.
        try:
            os.chdir(PACKAGE_DIRECTORY_COM)
        except OSError:
            pass

    def test_generate_runtime_env(self):
        """Hashed port numbers for the fixture framework match known values."""
        for key, value in (("FC_TASK_INDEX", "0"),
                           ("FC_TASKROLE_NAME", "taskrole")):
            os.environ[key] = value
            # Registered per variable so the environment is cleaned up even
            # when an assertion or the code under test fails.
            self.addCleanup(os.environ.pop, key, None)

        test_file = os.path.join(PACKAGE_DIRECTORY_COM, "framework.json")
        expect_lines = [
            "export PAI_PORT_LIST_taskrole_0_tcp='29877,22353,29076'",
            "export PAI_CONTAINER_HOST_PORT_LIST='tcp:29877,22353,29076;udp:31903,33486,35953;ssh:39080;http:30643;'",
            "export PAI_taskrole1_0_mpi_PORT='20966,21891'",
            "export PAI_CONTAINER_HOST_http_PORT_LIST='30643'",
            "export PAI_PORT_LIST_taskrole_0_udp='31903,33486,35953'",
            "export PAI_CONTAINER_SSH_PORT='39080'"
        ]
        with open(test_file, "r") as f:
            framework = json.load(f)

        # Capture what the generator writes to stdout. Restore the stream
        # that was active before (not sys.__stdout__), so a test runner's own
        # capture is not clobbered, and restore it even on failure.
        previous_stdout = sys.stdout
        sys.stdout = temp_stdout = StringIO()
        try:
            generate_runtime_env(framework)
        finally:
            sys.stdout = previous_stdout

        runtime_env = temp_stdout.getvalue().splitlines()
        for expect in expect_lines:
            self.assertIn(expect, runtime_env)
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()