Use hash func to generate port number (#11)
Refer to issue: microsoft/pai#4384. Use the hash function (int(md5(podUid + taskPortName + taskPortIndex)[0:12], 16) + int(md5(podUid + taskPortName + taskPortIndex)[12:24], 16) + int(md5(podUid + taskPortName + taskPortIndex)[24:32], 16)) % (globalPortEnd - globalPortStart) + globalPortStart to generate the port number. If a port conflict happens, the task will fail. A retried task will have a different podUid, so the new task will be given a different port number.
This commit is contained in:
Родитель
0fed34d82e
Коммит
388bd4f945
4
src/init
4
src/init
|
@ -153,12 +153,12 @@ cp ${PAI_CONFIG_DIR}/runtime-exit-spec.yaml ${PAI_RUNTIME_DIR}
|
||||||
# generate runtime env variables
|
# generate runtime env variables
|
||||||
# priority=10
|
# priority=10
|
||||||
CHILD_PROCESS="ENV_GENERATOR"
|
CHILD_PROCESS="ENV_GENERATOR"
|
||||||
python ${PAI_INIT_DIR}/parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh
|
python ${PAI_INIT_DIR}/framework_parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh
|
||||||
|
|
||||||
# generate jobconfig
|
# generate jobconfig
|
||||||
# priority=11
|
# priority=11
|
||||||
CHILD_PROCESS="CONFIG_GENERATOR"
|
CHILD_PROCESS="CONFIG_GENERATOR"
|
||||||
python ${PAI_INIT_DIR}/parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml
|
python ${PAI_INIT_DIR}/framework_parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml
|
||||||
|
|
||||||
# Init plugins
|
# Init plugins
|
||||||
# priority=12
|
# priority=12
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import base64
|
import base64
|
||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import gzip
|
import gzip
|
||||||
import json
|
import json
|
||||||
|
@ -42,6 +43,31 @@ def decompress_field(field):
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
|
||||||
|
def generate_seq_ports_num(port_start, port_count, task_index):
    """Return the sequential port numbers reserved for one task.

    Every task owns a contiguous slice of ``port_count`` ports; the slice
    for ``task_index`` begins at ``port_start + port_count * task_index``.

    Args:
        port_start: First port of the range for this port name
            (int or numeric string).
        port_count: Number of ports given to each task
            (int or numeric string).
        task_index: Index of the task within its task role
            (int or numeric string).

    Returns:
        A list of ``port_count`` port numbers, each rendered as a string.

    Raises:
        ValueError: If an argument cannot be converted to an integer.
    """
    # The inline computation this helper replaced coerced start, count and
    # index with int(); keep accepting numeric strings so legacy port specs
    # do not fail with a TypeError.
    start = int(port_start)
    count = int(port_count)
    base = start + count * int(task_index)
    return [str(port_num) for port_num in range(base, base + count)]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_hashed_ports_num(pod_uid, port_name, port_count, port_start,
                              port_end):
    """Derive deterministic port numbers from a hash of the pod identity.

    For each port index ``i`` in ``range(port_count)`` the string
    ``"[<pod_uid>][<port_name>][<i>]"`` is hashed with MD5 and the port is

        (int(md5_hex[0:12], 16) +
         int(md5_hex[12:24], 16) +
         int(md5_hex[24:32], 16)) % (port_end - port_start) + port_start

    The same inputs always give the same ports. Distinct inputs may still
    map to the same port; this function does not detect such collisions.

    Args:
        pod_uid: UID of the pod the ports are generated for.
        port_name: Name of the port group (e.g. ``"http"`` or ``"ssh"``).
        port_count: Number of ports to generate for ``port_name``.
        port_start: Inclusive lower bound of the allowed port range.
        port_end: Exclusive upper bound of the allowed port range.

    Returns:
        A list of ``port_count`` port numbers, each rendered as a string and
        lying in ``[port_start, port_end)``.

    Raises:
        ValueError: If ports are requested from an empty or inverted range
            (``port_end <= port_start``).
    """
    count = int(port_count)
    if count <= 0:
        # Nothing to generate; the range is not consulted in this case.
        return []

    start = int(port_start)
    span = int(port_end) - start
    if span <= 0:
        # A zero span would divide by zero and a negative span would yield
        # ports outside the intended window, so reject both explicitly.
        raise ValueError(
            "Invalid port range: port_end ({}) must be greater than "
            "port_start ({})".format(port_end, port_start))

    port_list = []
    for i in range(count):
        raw_str = "[{}][{}][{}]".format(pod_uid, port_name, str(i))
        hash_str = hashlib.md5(raw_str.encode("utf8")).hexdigest()
        # Fold the 32 hex digits into one integer by summing three slices.
        folded = (int(hash_str[:12], 16) + int(hash_str[12:24], 16) +
                  int(hash_str[24:], 16))
        port_list.append(str(folded % span + start))
    return port_list
|
||||||
|
|
||||||
|
|
||||||
def generate_runtime_env(framework): #pylint: disable=too-many-locals
|
def generate_runtime_env(framework): #pylint: disable=too-many-locals
|
||||||
"""Generate runtime env variables for tasks.
|
"""Generate runtime env variables for tasks.
|
||||||
|
|
||||||
|
@ -96,41 +122,50 @@ def generate_runtime_env(framework): #pylint: disable=too-many-locals
|
||||||
for task in taskrole["taskStatuses"]:
|
for task in taskrole["taskStatuses"]:
|
||||||
index = task["index"]
|
index = task["index"]
|
||||||
current_ip = task["attemptStatus"]["podHostIP"]
|
current_ip = task["attemptStatus"]["podHostIP"]
|
||||||
|
pod_uid = task["attemptStatus"]["podUID"]
|
||||||
|
task_ports = {}
|
||||||
|
|
||||||
taskrole_instances.append("{}:{}".format(name, index))
|
taskrole_instances.append("{}:{}".format(name, index))
|
||||||
|
|
||||||
get_port_base = lambda port_name, p=ports, i=index: int(p[
|
use_port_hash = True
|
||||||
port_name]["start"]) + int(p[port_name]["count"]) * int(i)
|
if "ports" in ports and "schedulePortStart" in ports and "schedulePortEnd" in ports:
|
||||||
|
port_start = ports["schedulePortStart"]
|
||||||
|
port_end = ports["schedulePortEnd"]
|
||||||
|
port_list = ports["ports"]
|
||||||
|
else:
|
||||||
|
# for backward compatibility
|
||||||
|
use_port_hash = False
|
||||||
|
port_list = ports
|
||||||
|
|
||||||
# export ip/port for task role, current ip maybe None for non-gang-allocation
|
for port in port_list.keys():
|
||||||
if current_ip:
|
count = int(port_list[port]["count"])
|
||||||
export("PAI_HOST_IP_{}_{}".format(name, index), current_ip)
|
task_ports[port] = generate_hashed_ports_num(
|
||||||
host_list.append("{}:{}".format(current_ip,
|
pod_uid, port, count, port_start,
|
||||||
get_port_base("http")))
|
port_end) if use_port_hash else generate_seq_ports_num(
|
||||||
|
port_list[port]["start"], count, index)
|
||||||
for port in ports.keys():
|
current_port_str = ",".join(task_ports[port])
|
||||||
start, count = get_port_base(port), int(ports[port]["count"])
|
|
||||||
current_port_str = ",".join(
|
|
||||||
str(x) for x in range(start, start + count))
|
|
||||||
export("PAI_PORT_LIST_{}_{}_{}".format(name, index, port),
|
export("PAI_PORT_LIST_{}_{}_{}".format(name, index, port),
|
||||||
current_port_str)
|
current_port_str)
|
||||||
export("PAI_{}_{}_{}_PORT".format(name, index, port),
|
export("PAI_{}_{}_{}_PORT".format(name, index, port),
|
||||||
current_port_str)
|
current_port_str)
|
||||||
|
|
||||||
|
# export ip/port for task role, current ip maybe None for non-gang-allocation
|
||||||
|
if current_ip:
|
||||||
|
export("PAI_HOST_IP_{}_{}".format(name, index), current_ip)
|
||||||
|
host_list.append("{}:{}".format(current_ip,
|
||||||
|
task_ports["http"][0]))
|
||||||
|
|
||||||
# export ip/port for current container
|
# export ip/port for current container
|
||||||
if (current_taskrole_name == name
|
if (current_taskrole_name == name
|
||||||
and current_task_index == str(index)):
|
and current_task_index == str(index)):
|
||||||
export("PAI_CURRENT_CONTAINER_IP", current_ip)
|
export("PAI_CURRENT_CONTAINER_IP", current_ip)
|
||||||
export("PAI_CURRENT_CONTAINER_PORT", get_port_base("http"))
|
export("PAI_CURRENT_CONTAINER_PORT", task_ports["http"][0])
|
||||||
export("PAI_CONTAINER_HOST_IP", current_ip)
|
export("PAI_CONTAINER_HOST_IP", current_ip)
|
||||||
export("PAI_CONTAINER_HOST_PORT", get_port_base("http"))
|
export("PAI_CONTAINER_HOST_PORT", task_ports["http"][0])
|
||||||
export("PAI_CONTAINER_SSH_PORT", get_port_base("ssh"))
|
export("PAI_CONTAINER_SSH_PORT", task_ports["ssh"][0])
|
||||||
port_str = ""
|
port_str = ""
|
||||||
for port in ports.keys():
|
for port in port_list.keys():
|
||||||
start, count = get_port_base(port), int(
|
current_port_str = ",".join(task_ports[port])
|
||||||
ports[port]["count"])
|
|
||||||
current_port_str = ",".join(
|
|
||||||
str(x) for x in range(start, start + count))
|
|
||||||
export("PAI_CONTAINER_HOST_{}_PORT_LIST".format(port),
|
export("PAI_CONTAINER_HOST_{}_PORT_LIST".format(port),
|
||||||
current_port_str)
|
current_port_str)
|
||||||
port_str += "{}:{};".format(port, current_port_str)
|
port_str += "{}:{};".format(port, current_port_str)
|
|
@ -49,8 +49,13 @@ def check_port(portno):
|
||||||
|
|
||||||
|
|
||||||
def check_port_list_env(port_list_env):
|
def check_port_list_env(port_list_env):
|
||||||
|
ports = {}
|
||||||
for each in re.split(":|;|,", port_list_env):
|
for each in re.split(":|;|,", port_list_env):
|
||||||
if each.isdigit():
|
if each.isdigit():
|
||||||
|
if each in ports:
|
||||||
|
LOGGER.error("Port %s has conflict.", each)
|
||||||
|
sys.exit(10)
|
||||||
|
ports[each] = True
|
||||||
check_port(int(each))
|
check_port(int(each))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,893 @@
|
||||||
|
{
|
||||||
|
"apiVersion": "frameworkcontroller.microsoft.com/v1",
|
||||||
|
"kind": "Framework",
|
||||||
|
"metadata": {
|
||||||
|
"annotations": {
|
||||||
|
"config": "protocolVersion: 2\nname: test\ntype: job\njobRetryCount: 0\nprerequisites:\n - type: dockerimage\n uri: 'openpai/standard:python_3.6-pytorch_1.2.0-gpu'\n name: docker_image_0\ntaskRoles:\n taskrole:\n instances: 1\n completion:\n minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n tcp: 3\n udp: 3\n commands:\n - printenv\n taskrole_1:\n instances: 1\n completion:\n minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n mpi: 2\n tensorflow: 1\n commands:\n - printenv\ndefaults:\n virtualCluster: default\nextras:\n com.microsoft.pai.runtimeplugin:\n - plugin: ssh\n parameters:\n jobssh: true\n",
|
||||||
|
"jobName": "test",
|
||||||
|
"logPathInfix": "51b333b433467483e9e16fcff34ceeda",
|
||||||
|
"totalGpuNumber": "2"
|
||||||
|
},
|
||||||
|
"creationTimestamp": "2020-05-22T03:18:43Z",
|
||||||
|
"generation": 19,
|
||||||
|
"labels": {
|
||||||
|
"userName": "test_user",
|
||||||
|
"virtualCluster": "default"
|
||||||
|
},
|
||||||
|
"name": "51b333b433467483e9e16fcff34ceeda",
|
||||||
|
"namespace": "default",
|
||||||
|
"resourceVersion": "48376615",
|
||||||
|
"selfLink": "/apis/frameworkcontroller.microsoft.com/v1/namespaces/default/frameworks/51b333b433467483e9e16fcff34ceeda",
|
||||||
|
"uid": "f0f0e75a-9bda-11ea-830b-000d3ab25bb6"
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"description": "",
|
||||||
|
"executionType": "Start",
|
||||||
|
"retryPolicy": {
|
||||||
|
"fancyRetryPolicy": true,
|
||||||
|
"maxRetryCount": 0
|
||||||
|
},
|
||||||
|
"taskRoles": [
|
||||||
|
{
|
||||||
|
"frameworkAttemptCompletionPolicy": {
|
||||||
|
"minFailedTaskCount": 1,
|
||||||
|
"minSucceededTaskCount": -1
|
||||||
|
},
|
||||||
|
"name": "taskrole",
|
||||||
|
"task": {
|
||||||
|
"pod": {
|
||||||
|
"metadata": {
|
||||||
|
"annotations": {
|
||||||
|
"container.apparmor.security.beta.kubernetes.io/app": "unconfined",
|
||||||
|
"rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"tcp\":{\"count\":3},\"udp\":{\"count\":3},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}"
|
||||||
|
},
|
||||||
|
"creationTimestamp": null,
|
||||||
|
"labels": {
|
||||||
|
"type": "kube-launcher-task",
|
||||||
|
"userName": "test_user",
|
||||||
|
"virtualCluster": "default"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"affinity": {
|
||||||
|
"nodeAffinity": {
|
||||||
|
"requiredDuringSchedulingIgnoredDuringExecution": {
|
||||||
|
"nodeSelectorTerms": [
|
||||||
|
{
|
||||||
|
"matchExpressions": [
|
||||||
|
{
|
||||||
|
"key": "pai-worker",
|
||||||
|
"operator": "In",
|
||||||
|
"values": ["true"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"command": ["/usr/local/pai/runtime"],
|
||||||
|
"env": [
|
||||||
|
{
|
||||||
|
"name": "PAI_FRAMEWORK_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USER_NAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_DEFAULT_FS_URI"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole_1",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USERNAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASKS_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLES_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||||
|
"value": "taskrole"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||||
|
"valueFrom": {
|
||||||
|
"fieldRef": {
|
||||||
|
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_INDEX",
|
||||||
|
"valueFrom": {
|
||||||
|
"fieldRef": {
|
||||||
|
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu",
|
||||||
|
"imagePullPolicy": "Always",
|
||||||
|
"name": "app",
|
||||||
|
"resources": {
|
||||||
|
"limits": {
|
||||||
|
"cpu": "4",
|
||||||
|
"github.com/fuse": "1",
|
||||||
|
"memory": "8Gi",
|
||||||
|
"nvidia.com/gpu": "1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"securityContext": {
|
||||||
|
"capabilities": {
|
||||||
|
"add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"],
|
||||||
|
"drop": ["MKNOD"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"terminationMessagePath": "/tmp/pai-termination-log",
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"mountPath": "/dev/shm",
|
||||||
|
"name": "dshm"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai",
|
||||||
|
"name": "pai-vol"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai/logs",
|
||||||
|
"name": "host-log",
|
||||||
|
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai/ssh-secret",
|
||||||
|
"name": "job-ssh-secret-volume",
|
||||||
|
"readOnly": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"hostNetwork": true,
|
||||||
|
"imagePullSecrets": [
|
||||||
|
{
|
||||||
|
"name": "pai-secret"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"initContainers": [
|
||||||
|
{
|
||||||
|
"env": [
|
||||||
|
{
|
||||||
|
"name": "USER_CMD",
|
||||||
|
"value": "printenv"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "KUBE_APISERVER_ADDRESS",
|
||||||
|
"value": "http://10.151.40.4:8080"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "GANG_ALLOCATION",
|
||||||
|
"value": "true"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_FRAMEWORK_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USER_NAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_DEFAULT_FS_URI"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole_1",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USERNAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASKS_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLES_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||||
|
"value": "taskrole"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||||
|
"valueFrom": {
|
||||||
|
"fieldRef": {
|
||||||
|
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"image": "openpai/openpai-runtime:test_user",
|
||||||
|
"imagePullPolicy": "Always",
|
||||||
|
"name": "init",
|
||||||
|
"resources": {},
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai",
|
||||||
|
"name": "pai-vol"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai/logs",
|
||||||
|
"name": "host-log",
|
||||||
|
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai-config",
|
||||||
|
"name": "job-exit-spec"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority",
|
||||||
|
"restartPolicy": "Never",
|
||||||
|
"serviceAccountName": "runtime-account",
|
||||||
|
"volumes": [
|
||||||
|
{
|
||||||
|
"emptyDir": {
|
||||||
|
"medium": "Memory",
|
||||||
|
"sizeLimit": "512Mi"
|
||||||
|
},
|
||||||
|
"name": "dshm"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"emptyDir": {},
|
||||||
|
"name": "pai-vol"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hostPath": {
|
||||||
|
"path": "/var/log/pai"
|
||||||
|
},
|
||||||
|
"name": "host-log"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "job-ssh-secret-volume",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "job-ssh-secret"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"configMap": {
|
||||||
|
"name": "runtime-exit-spec-configuration"
|
||||||
|
},
|
||||||
|
"name": "job-exit-spec"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"podGracefulDeletionTimeoutSec": 1800,
|
||||||
|
"retryPolicy": {
|
||||||
|
"fancyRetryPolicy": false,
|
||||||
|
"maxRetryCount": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"taskNumber": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"frameworkAttemptCompletionPolicy": {
|
||||||
|
"minFailedTaskCount": 1,
|
||||||
|
"minSucceededTaskCount": -1
|
||||||
|
},
|
||||||
|
"name": "taskrole1",
|
||||||
|
"task": {
|
||||||
|
"pod": {
|
||||||
|
"metadata": {
|
||||||
|
"annotations": {
|
||||||
|
"container.apparmor.security.beta.kubernetes.io/app": "unconfined",
|
||||||
|
"rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"mpi\":{\"count\":2},\"tensorflow\":{\"count\":1},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}"
|
||||||
|
},
|
||||||
|
"creationTimestamp": null,
|
||||||
|
"labels": {
|
||||||
|
"type": "kube-launcher-task",
|
||||||
|
"userName": "test_user",
|
||||||
|
"virtualCluster": "default"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"affinity": {
|
||||||
|
"nodeAffinity": {
|
||||||
|
"requiredDuringSchedulingIgnoredDuringExecution": {
|
||||||
|
"nodeSelectorTerms": [
|
||||||
|
{
|
||||||
|
"matchExpressions": [
|
||||||
|
{
|
||||||
|
"key": "pai-worker",
|
||||||
|
"operator": "In",
|
||||||
|
"values": ["true"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"command": ["/usr/local/pai/runtime"],
|
||||||
|
"env": [
|
||||||
|
{
|
||||||
|
"name": "PAI_FRAMEWORK_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USER_NAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_DEFAULT_FS_URI"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole_1",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USERNAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASKS_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLES_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||||
|
"value": "taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||||
|
"valueFrom": {
|
||||||
|
"fieldRef": {
|
||||||
|
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_INDEX",
|
||||||
|
"valueFrom": {
|
||||||
|
"fieldRef": {
|
||||||
|
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu",
|
||||||
|
"imagePullPolicy": "Always",
|
||||||
|
"name": "app",
|
||||||
|
"resources": {
|
||||||
|
"limits": {
|
||||||
|
"cpu": "4",
|
||||||
|
"github.com/fuse": "1",
|
||||||
|
"memory": "8Gi",
|
||||||
|
"nvidia.com/gpu": "1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"securityContext": {
|
||||||
|
"capabilities": {
|
||||||
|
"add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"],
|
||||||
|
"drop": ["MKNOD"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"terminationMessagePath": "/tmp/pai-termination-log",
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"mountPath": "/dev/shm",
|
||||||
|
"name": "dshm"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai",
|
||||||
|
"name": "pai-vol"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai/logs",
|
||||||
|
"name": "host-log",
|
||||||
|
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai/ssh-secret",
|
||||||
|
"name": "job-ssh-secret-volume",
|
||||||
|
"readOnly": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"hostNetwork": true,
|
||||||
|
"imagePullSecrets": [
|
||||||
|
{
|
||||||
|
"name": "pai-secret"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"initContainers": [
|
||||||
|
{
|
||||||
|
"env": [
|
||||||
|
{
|
||||||
|
"name": "USER_CMD",
|
||||||
|
"value": "printenv"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "KUBE_APISERVER_ADDRESS",
|
||||||
|
"value": "http://10.151.40.4:8080"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "GANG_ALLOCATION",
|
||||||
|
"value": "true"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_FRAMEWORK_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_NAME",
|
||||||
|
"value": "test_user~test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USER_NAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_DEFAULT_FS_URI"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_RESOURCE_taskrole_1",
|
||||||
|
"value": "1,4,8192,0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_USERNAME",
|
||||||
|
"value": "test_user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASKS_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_TASK_ROLES_NUM",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||||
|
"value": "2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||||
|
"value": "taskrole,taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||||
|
"value": "taskrole_1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||||
|
"valueFrom": {
|
||||||
|
"fieldRef": {
|
||||||
|
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"image": "openpai/openpai-runtime:test_user",
|
||||||
|
"imagePullPolicy": "Always",
|
||||||
|
"name": "init",
|
||||||
|
"resources": {},
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai",
|
||||||
|
"name": "pai-vol"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai/logs",
|
||||||
|
"name": "host-log",
|
||||||
|
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"mountPath": "/usr/local/pai-config",
|
||||||
|
"name": "job-exit-spec"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority",
|
||||||
|
"restartPolicy": "Never",
|
||||||
|
"serviceAccountName": "runtime-account",
|
||||||
|
"volumes": [
|
||||||
|
{
|
||||||
|
"emptyDir": {
|
||||||
|
"medium": "Memory",
|
||||||
|
"sizeLimit": "512Mi"
|
||||||
|
},
|
||||||
|
"name": "dshm"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"emptyDir": {},
|
||||||
|
"name": "pai-vol"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"hostPath": {
|
||||||
|
"path": "/var/log/pai"
|
||||||
|
},
|
||||||
|
"name": "host-log"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "job-ssh-secret-volume",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "job-ssh-secret"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"configMap": {
|
||||||
|
"name": "runtime-exit-spec-configuration"
|
||||||
|
},
|
||||||
|
"name": "job-exit-spec"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"podGracefulDeletionTimeoutSec": 1800,
|
||||||
|
"retryPolicy": {
|
||||||
|
"fancyRetryPolicy": false,
|
||||||
|
"maxRetryCount": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"taskNumber": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"status": {
|
||||||
|
"attemptStatus": {
|
||||||
|
"completionStatus": {
|
||||||
|
"code": 0,
|
||||||
|
"diagnostics": "Pod succeeded",
|
||||||
|
"phrase": "Succeeded",
|
||||||
|
"trigger": {
|
||||||
|
"message": "All Tasks are completed and no user specified conditions in FrameworkAttemptCompletionPolicy have ever been triggered: TotalTaskCount: 2, FailedTaskCount: 0",
|
||||||
|
"taskIndex": 0,
|
||||||
|
"taskRoleName": "taskrole1"
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"attributes": [],
|
||||||
|
"name": "Succeeded"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"completionTime": "2020-05-22T03:19:33Z",
|
||||||
|
"configMapName": "51b333b433467483e9e16fcff34ceeda-attempt",
|
||||||
|
"configMapUID": "f0f47860-9bda-11ea-830b-000d3ab25bb6",
|
||||||
|
"id": 0,
|
||||||
|
"instanceUID": "0_f0f47860-9bda-11ea-830b-000d3ab25bb6",
|
||||||
|
"runTime": "2020-05-22T03:19:26Z",
|
||||||
|
"startTime": "2020-05-22T03:18:43Z",
|
||||||
|
"taskRoleStatuses": [
|
||||||
|
{
|
||||||
|
"name": "taskrole",
|
||||||
|
"taskStatuses": [
|
||||||
|
{
|
||||||
|
"attemptStatus": {
|
||||||
|
"completionStatus": {
|
||||||
|
"code": 0,
|
||||||
|
"diagnostics": "Pod succeeded",
|
||||||
|
"phrase": "Succeeded",
|
||||||
|
"pod": {
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"code": 0,
|
||||||
|
"name": "init",
|
||||||
|
"reason": "Completed"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"code": 0,
|
||||||
|
"name": "app",
|
||||||
|
"reason": "Completed"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"attributes": [],
|
||||||
|
"name": "Succeeded"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"completionTime": "2020-05-22T03:19:32Z",
|
||||||
|
"id": 0,
|
||||||
|
"instanceUID": "0_f1020521-9bda-11ea-830b-000d3ab25bb6",
|
||||||
|
"podHostIP": "10.151.41.8",
|
||||||
|
"podIP": "10.151.41.8",
|
||||||
|
"podName": "51b333b433467483e9e16fcff34ceeda-taskrole-0",
|
||||||
|
"podNodeName": "10.151.41.8",
|
||||||
|
"podUID": "f1020521-9bda-11ea-830b-000d3ab25bb6",
|
||||||
|
"runTime": "2020-05-22T03:19:26Z",
|
||||||
|
"startTime": "2020-05-22T03:18:43Z"
|
||||||
|
},
|
||||||
|
"completionTime": "2020-05-22T03:19:32Z",
|
||||||
|
"index": 0,
|
||||||
|
"retryPolicyStatus": {
|
||||||
|
"accountableRetriedCount": 0,
|
||||||
|
"retryDelaySec": null,
|
||||||
|
"totalRetriedCount": 0
|
||||||
|
},
|
||||||
|
"startTime": "2020-05-22T03:18:43Z",
|
||||||
|
"state": "Completed",
|
||||||
|
"transitionTime": "2020-05-22T03:19:32Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "taskrole1",
|
||||||
|
"taskStatuses": [
|
||||||
|
{
|
||||||
|
"attemptStatus": {
|
||||||
|
"completionStatus": {
|
||||||
|
"code": 0,
|
||||||
|
"diagnostics": "Pod succeeded",
|
||||||
|
"phrase": "Succeeded",
|
||||||
|
"pod": {
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"code": 0,
|
||||||
|
"name": "init",
|
||||||
|
"reason": "Completed"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"code": 0,
|
||||||
|
"name": "app",
|
||||||
|
"reason": "Completed"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"type": {
|
||||||
|
"attributes": [],
|
||||||
|
"name": "Succeeded"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"completionTime": "2020-05-22T03:19:32Z",
|
||||||
|
"id": 0,
|
||||||
|
"instanceUID": "0_f102a4a7-9bda-11ea-830b-000d3ab25bb6",
|
||||||
|
"podHostIP": "10.151.41.9",
|
||||||
|
"podIP": "10.151.41.9",
|
||||||
|
"podName": "51b333b433467483e9e16fcff34ceeda-taskrole1-0",
|
||||||
|
"podNodeName": "10.151.41.9",
|
||||||
|
"podUID": "f102a4a7-9bda-11ea-830b-000d3ab25bb6",
|
||||||
|
"runTime": "2020-05-22T03:19:27Z",
|
||||||
|
"startTime": "2020-05-22T03:18:43Z"
|
||||||
|
},
|
||||||
|
"completionTime": "2020-05-22T03:19:32Z",
|
||||||
|
"index": 0,
|
||||||
|
"retryPolicyStatus": {
|
||||||
|
"accountableRetriedCount": 0,
|
||||||
|
"retryDelaySec": null,
|
||||||
|
"totalRetriedCount": 0
|
||||||
|
},
|
||||||
|
"startTime": "2020-05-22T03:18:43Z",
|
||||||
|
"state": "Completed",
|
||||||
|
"transitionTime": "2020-05-22T03:19:32Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"completionTime": "2020-05-22T03:19:33Z",
|
||||||
|
"retryPolicyStatus": {
|
||||||
|
"accountableRetriedCount": 0,
|
||||||
|
"retryDelaySec": null,
|
||||||
|
"totalRetriedCount": 0
|
||||||
|
},
|
||||||
|
"startTime": "2020-05-22T03:18:43Z",
|
||||||
|
"state": "Completed",
|
||||||
|
"transitionTime": "2020-05-22T03:19:33Z"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,74 @@
|
||||||
|
# Copyright (c) Microsoft Corporation
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# MIT License
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
||||||
|
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
|
||||||
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
|
||||||
|
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||||
|
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
|
||||||
|
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
||||||
|
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
import json
|
||||||
|
from io import StringIO
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
# Directory containing this test module; the fixtures sit next to it.
PACKAGE_DIRECTORY_COM = os.path.dirname(os.path.abspath(__file__))

# The modules under test are not installed as a package, so their source
# directories are added to the import path before importing from them.
# pylint: disable=wrong-import-position
sys.path.extend(
    os.path.join(PACKAGE_DIRECTORY_COM, source_dir)
    for source_dir in ("../src", "../src/init.d"))
from framework_parser import generate_runtime_env
from common.utils import init_logger
# pylint: enable=wrong-import-position

init_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class TestParser(unittest.TestCase):
    """Tests for the runtime environment generated by ``framework_parser``."""

    def setUp(self):
        """Switch to the test directory so fixtures resolve by relative path."""
        try:
            os.chdir(PACKAGE_DIRECTORY_COM)
        except Exception:  #pylint: disable=broad-except
            # Best effort: if the directory cannot be entered the test itself
            # will fail when it tries to open its fixture.
            pass

    @staticmethod
    def _restore_env(saved):
        """Put back the environment variables recorded in ``saved``.

        ``saved`` maps a variable name to its previous value, or to ``None``
        when the variable was not set before the test changed it.
        """
        for name, value in saved.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value

    def test_generate_runtime_env(self):
        """Check the port exports printed for task 0 of ``taskrole``."""
        overrides = {"FC_TASK_INDEX": "0", "FC_TASKROLE_NAME": "taskrole"}
        saved_env = {name: os.environ.get(name) for name in overrides}
        # Registered before the variables are changed so they are restored
        # even when an assertion below fails.
        self.addCleanup(self._restore_env, saved_env)
        os.environ.update(overrides)

        test_file = "framework.json"
        # NOTE(review): the port numbers are presumably the values derived
        # from the pod UIDs stored in the fixture -- regenerate them if the
        # fixture or the port generation scheme changes.
        expect_lines = [
            "export PAI_PORT_LIST_taskrole_0_tcp='29877,22353,29076'",
            "export PAI_CONTAINER_HOST_PORT_LIST='tcp:29877,22353,29076;udp:31903,33486,35953;ssh:39080;http:30643;'",
            "export PAI_taskrole1_0_mpi_PORT='20966,21891'",
            "export PAI_CONTAINER_HOST_http_PORT_LIST='30643'",
            "export PAI_PORT_LIST_taskrole_0_udp='31903,33486,35953'",
            "export PAI_CONTAINER_SSH_PORT='39080'"
        ]
        with open(test_file, "r") as f:
            framework = json.load(f)

        # Capture what the generator prints. The stream that was active is
        # restored afterwards (not ``sys.__stdout__``) so that test runners
        # which replace stdout keep working, and it is restored even when
        # the generator raises.
        temp_stdout = StringIO()
        previous_stdout = sys.stdout
        sys.stdout = temp_stdout
        try:
            generate_runtime_env(framework)
        finally:
            sys.stdout = previous_stdout

        runtime_env = temp_stdout.getvalue().splitlines()

        for expect in expect_lines:
            self.assertIn(expect, runtime_env)
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly as a script.
if "__main__" == __name__:
    unittest.main()
|
Загрузка…
Ссылка в новой задаче