Use hash func to generate port number (#11)
Refer to issue: microsoft/pai#4384. Use the hash function (int(md5(podUid + taskPortName + taskPortIndex)[0:12], 16) + int(md5(podUid + taskPortName + taskPortIndex)[12:24], 16) + int(md5(podUid + taskPortName + taskPortIndex)[24:32], 16)) % (globalPortEnd - globalPortStart) + globalPortStart to generate the port number. If a port conflict happens, the task will fail. A retried task will have a different podUid, and the new task will be given different port numbers.
This commit is contained in:
Родитель
0fed34d82e
Коммит
388bd4f945
4
src/init
4
src/init
|
@ -153,12 +153,12 @@ cp ${PAI_CONFIG_DIR}/runtime-exit-spec.yaml ${PAI_RUNTIME_DIR}
|
|||
# generate runtime env variables
|
||||
# priority=10
|
||||
CHILD_PROCESS="ENV_GENERATOR"
|
||||
python ${PAI_INIT_DIR}/parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh
|
||||
python ${PAI_INIT_DIR}/framework_parser.py genenv framework.json > ${PAI_RUNTIME_DIR}/runtime_env.sh
|
||||
|
||||
# generate jobconfig
|
||||
# priority=11
|
||||
CHILD_PROCESS="CONFIG_GENERATOR"
|
||||
python ${PAI_INIT_DIR}/parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml
|
||||
python ${PAI_INIT_DIR}/framework_parser.py genconf framework.json > ${PAI_RUNTIME_DIR}/job_config.yaml
|
||||
|
||||
# Init plugins
|
||||
# priority=12
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
import argparse
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import gzip
|
||||
import json
|
||||
|
@ -42,6 +43,31 @@ def decompress_field(field):
|
|||
return obj
|
||||
|
||||
|
||||
def generate_seq_ports_num(port_start, port_count, task_index):
    """Return the sequential port numbers assigned to one task.

    Every task owns a contiguous slice of ``port_count`` ports; the slice for
    ``task_index`` begins at ``port_start + port_count * task_index``.

    Args:
        port_start: First port of the range (int, or a string holding one).
        port_count: Number of ports per task (int, or a string holding one).
        task_index: Index of the task (int, or a string holding one).

    Returns:
        A list of ``port_count`` consecutive port numbers, each as a string.

    Raises:
        ValueError: If a string argument does not hold a valid integer.
    """
    # Coerce explicitly so string-typed numbers (e.g. "8000") behave exactly
    # like ints instead of raising TypeError on ``str + int``.
    port_start = int(port_start)
    port_count = int(port_count)
    task_index = int(task_index)

    base = port_start + port_count * task_index
    return [str(port_num) for port_num in range(base, base + port_count)]
|
||||
|
||||
|
||||
def generate_hashed_ports_num(pod_uid, port_name, port_count, port_start,
                              port_end):
    """Derive port numbers from an MD5 hash of the pod and port identity.

    For every index ``i`` in ``range(port_count)`` the string
    ``"[<pod_uid>][<port_name>][<i>]"`` is hashed with MD5. The 32-character
    hex digest is cut into three chunks (``[0:12]``, ``[12:24]``,
    ``[24:32]``), the chunks are summed as hexadecimal integers, and the sum
    is mapped into the port range with::

        total % (port_end - port_start) + port_start

    Returns:
        A list with one port number (as a string) per index. The same inputs
        always yield the same ports; different indexes are not guaranteed to
        yield distinct ports.
    """

    def _hashed_port(index):
        # Identity string that seeds the hash for this particular port.
        seed = "[{}][{}][{}]".format(pod_uid, port_name, str(index))
        digest = hashlib.md5(seed.encode("utf8")).hexdigest()
        chunks = (digest[:12], digest[12:24], digest[24:])
        total = sum(int(chunk, 16) for chunk in chunks)
        return str(total % (port_end - port_start) + port_start)

    return [_hashed_port(index) for index in range(port_count)]
|
||||
|
||||
|
||||
def generate_runtime_env(framework): #pylint: disable=too-many-locals
|
||||
"""Generate runtime env variables for tasks.
|
||||
|
||||
|
@ -96,41 +122,50 @@ def generate_runtime_env(framework): #pylint: disable=too-many-locals
|
|||
for task in taskrole["taskStatuses"]:
|
||||
index = task["index"]
|
||||
current_ip = task["attemptStatus"]["podHostIP"]
|
||||
pod_uid = task["attemptStatus"]["podUID"]
|
||||
task_ports = {}
|
||||
|
||||
taskrole_instances.append("{}:{}".format(name, index))
|
||||
|
||||
get_port_base = lambda port_name, p=ports, i=index: int(p[
|
||||
port_name]["start"]) + int(p[port_name]["count"]) * int(i)
|
||||
use_port_hash = True
|
||||
if "ports" in ports and "schedulePortStart" in ports and "schedulePortEnd" in ports:
|
||||
port_start = ports["schedulePortStart"]
|
||||
port_end = ports["schedulePortEnd"]
|
||||
port_list = ports["ports"]
|
||||
else:
|
||||
# for backward compatibility
|
||||
use_port_hash = False
|
||||
port_list = ports
|
||||
|
||||
# export ip/port for task role, current ip maybe None for non-gang-allocation
|
||||
if current_ip:
|
||||
export("PAI_HOST_IP_{}_{}".format(name, index), current_ip)
|
||||
host_list.append("{}:{}".format(current_ip,
|
||||
get_port_base("http")))
|
||||
|
||||
for port in ports.keys():
|
||||
start, count = get_port_base(port), int(ports[port]["count"])
|
||||
current_port_str = ",".join(
|
||||
str(x) for x in range(start, start + count))
|
||||
for port in port_list.keys():
|
||||
count = int(port_list[port]["count"])
|
||||
task_ports[port] = generate_hashed_ports_num(
|
||||
pod_uid, port, count, port_start,
|
||||
port_end) if use_port_hash else generate_seq_ports_num(
|
||||
port_list[port]["start"], count, index)
|
||||
current_port_str = ",".join(task_ports[port])
|
||||
export("PAI_PORT_LIST_{}_{}_{}".format(name, index, port),
|
||||
current_port_str)
|
||||
export("PAI_{}_{}_{}_PORT".format(name, index, port),
|
||||
current_port_str)
|
||||
|
||||
# export ip/port for task role, current ip maybe None for non-gang-allocation
|
||||
if current_ip:
|
||||
export("PAI_HOST_IP_{}_{}".format(name, index), current_ip)
|
||||
host_list.append("{}:{}".format(current_ip,
|
||||
task_ports["http"][0]))
|
||||
|
||||
# export ip/port for current container
|
||||
if (current_taskrole_name == name
|
||||
and current_task_index == str(index)):
|
||||
export("PAI_CURRENT_CONTAINER_IP", current_ip)
|
||||
export("PAI_CURRENT_CONTAINER_PORT", get_port_base("http"))
|
||||
export("PAI_CURRENT_CONTAINER_PORT", task_ports["http"][0])
|
||||
export("PAI_CONTAINER_HOST_IP", current_ip)
|
||||
export("PAI_CONTAINER_HOST_PORT", get_port_base("http"))
|
||||
export("PAI_CONTAINER_SSH_PORT", get_port_base("ssh"))
|
||||
export("PAI_CONTAINER_HOST_PORT", task_ports["http"][0])
|
||||
export("PAI_CONTAINER_SSH_PORT", task_ports["ssh"][0])
|
||||
port_str = ""
|
||||
for port in ports.keys():
|
||||
start, count = get_port_base(port), int(
|
||||
ports[port]["count"])
|
||||
current_port_str = ",".join(
|
||||
str(x) for x in range(start, start + count))
|
||||
for port in port_list.keys():
|
||||
current_port_str = ",".join(task_ports[port])
|
||||
export("PAI_CONTAINER_HOST_{}_PORT_LIST".format(port),
|
||||
current_port_str)
|
||||
port_str += "{}:{};".format(port, current_port_str)
|
|
@ -49,8 +49,13 @@ def check_port(portno):
|
|||
|
||||
|
||||
def check_port_list_env(port_list_env):
    """Check every port number found in a port-list string.

    The string is split on ``:``, ``;`` and ``,``; tokens that are not made
    up solely of digits are ignored. Each numeric token is handed to
    ``check_port``. If the same port number shows up a second time, an error
    is logged and the process exits with status 10.
    """
    seen_ports = set()
    numeric_tokens = (token for token in re.split(":|;|,", port_list_env)
                      if token.isdigit())
    for token in numeric_tokens:
        if token in seen_ports:
            LOGGER.error("Port %s has conflict.", token)
            sys.exit(10)
        seen_ports.add(token)
        check_port(int(token))
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,893 @@
|
|||
{
|
||||
"apiVersion": "frameworkcontroller.microsoft.com/v1",
|
||||
"kind": "Framework",
|
||||
"metadata": {
|
||||
"annotations": {
|
||||
"config": "protocolVersion: 2\nname: test\ntype: job\njobRetryCount: 0\nprerequisites:\n - type: dockerimage\n uri: 'openpai/standard:python_3.6-pytorch_1.2.0-gpu'\n name: docker_image_0\ntaskRoles:\n taskrole:\n instances: 1\n completion:\n minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n tcp: 3\n udp: 3\n commands:\n - printenv\n taskrole_1:\n instances: 1\n completion:\n minFailedInstances: 1\n minSucceededInstances: -1\n taskRetryCount: 0\n dockerImage: docker_image_0\n resourcePerInstance:\n gpu: 1\n cpu: 4\n memoryMB: 8192\n ports:\n mpi: 2\n tensorflow: 1\n commands:\n - printenv\ndefaults:\n virtualCluster: default\nextras:\n com.microsoft.pai.runtimeplugin:\n - plugin: ssh\n parameters:\n jobssh: true\n",
|
||||
"jobName": "test",
|
||||
"logPathInfix": "51b333b433467483e9e16fcff34ceeda",
|
||||
"totalGpuNumber": "2"
|
||||
},
|
||||
"creationTimestamp": "2020-05-22T03:18:43Z",
|
||||
"generation": 19,
|
||||
"labels": {
|
||||
"userName": "test_user",
|
||||
"virtualCluster": "default"
|
||||
},
|
||||
"name": "51b333b433467483e9e16fcff34ceeda",
|
||||
"namespace": "default",
|
||||
"resourceVersion": "48376615",
|
||||
"selfLink": "/apis/frameworkcontroller.microsoft.com/v1/namespaces/default/frameworks/51b333b433467483e9e16fcff34ceeda",
|
||||
"uid": "f0f0e75a-9bda-11ea-830b-000d3ab25bb6"
|
||||
},
|
||||
"spec": {
|
||||
"description": "",
|
||||
"executionType": "Start",
|
||||
"retryPolicy": {
|
||||
"fancyRetryPolicy": true,
|
||||
"maxRetryCount": 0
|
||||
},
|
||||
"taskRoles": [
|
||||
{
|
||||
"frameworkAttemptCompletionPolicy": {
|
||||
"minFailedTaskCount": 1,
|
||||
"minSucceededTaskCount": -1
|
||||
},
|
||||
"name": "taskrole",
|
||||
"task": {
|
||||
"pod": {
|
||||
"metadata": {
|
||||
"annotations": {
|
||||
"container.apparmor.security.beta.kubernetes.io/app": "unconfined",
|
||||
"rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"tcp\":{\"count\":3},\"udp\":{\"count\":3},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}"
|
||||
},
|
||||
"creationTimestamp": null,
|
||||
"labels": {
|
||||
"type": "kube-launcher-task",
|
||||
"userName": "test_user",
|
||||
"virtualCluster": "default"
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"affinity": {
|
||||
"nodeAffinity": {
|
||||
"requiredDuringSchedulingIgnoredDuringExecution": {
|
||||
"nodeSelectorTerms": [
|
||||
{
|
||||
"matchExpressions": [
|
||||
{
|
||||
"key": "pai-worker",
|
||||
"operator": "In",
|
||||
"values": ["true"]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"containers": [
|
||||
{
|
||||
"command": ["/usr/local/pai/runtime"],
|
||||
"env": [
|
||||
{
|
||||
"name": "PAI_FRAMEWORK_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USER_NAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_DEFAULT_FS_URI"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole_1",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USERNAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASKS_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLES_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||
"value": "taskrole"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||
"valueFrom": {
|
||||
"fieldRef": {
|
||||
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_INDEX",
|
||||
"valueFrom": {
|
||||
"fieldRef": {
|
||||
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu",
|
||||
"imagePullPolicy": "Always",
|
||||
"name": "app",
|
||||
"resources": {
|
||||
"limits": {
|
||||
"cpu": "4",
|
||||
"github.com/fuse": "1",
|
||||
"memory": "8Gi",
|
||||
"nvidia.com/gpu": "1"
|
||||
}
|
||||
},
|
||||
"securityContext": {
|
||||
"capabilities": {
|
||||
"add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"],
|
||||
"drop": ["MKNOD"]
|
||||
}
|
||||
},
|
||||
"terminationMessagePath": "/tmp/pai-termination-log",
|
||||
"volumeMounts": [
|
||||
{
|
||||
"mountPath": "/dev/shm",
|
||||
"name": "dshm"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai",
|
||||
"name": "pai-vol"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai/logs",
|
||||
"name": "host-log",
|
||||
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai/ssh-secret",
|
||||
"name": "job-ssh-secret-volume",
|
||||
"readOnly": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"hostNetwork": true,
|
||||
"imagePullSecrets": [
|
||||
{
|
||||
"name": "pai-secret"
|
||||
}
|
||||
],
|
||||
"initContainers": [
|
||||
{
|
||||
"env": [
|
||||
{
|
||||
"name": "USER_CMD",
|
||||
"value": "printenv"
|
||||
},
|
||||
{
|
||||
"name": "KUBE_APISERVER_ADDRESS",
|
||||
"value": "http://10.151.40.4:8080"
|
||||
},
|
||||
{
|
||||
"name": "GANG_ALLOCATION",
|
||||
"value": "true"
|
||||
},
|
||||
{
|
||||
"name": "PAI_FRAMEWORK_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USER_NAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_DEFAULT_FS_URI"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole_1",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USERNAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASKS_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLES_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||
"value": "taskrole"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||
"valueFrom": {
|
||||
"fieldRef": {
|
||||
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"image": "openpai/openpai-runtime:test_user",
|
||||
"imagePullPolicy": "Always",
|
||||
"name": "init",
|
||||
"resources": {},
|
||||
"volumeMounts": [
|
||||
{
|
||||
"mountPath": "/usr/local/pai",
|
||||
"name": "pai-vol"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai/logs",
|
||||
"name": "host-log",
|
||||
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai-config",
|
||||
"name": "job-exit-spec"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority",
|
||||
"restartPolicy": "Never",
|
||||
"serviceAccountName": "runtime-account",
|
||||
"volumes": [
|
||||
{
|
||||
"emptyDir": {
|
||||
"medium": "Memory",
|
||||
"sizeLimit": "512Mi"
|
||||
},
|
||||
"name": "dshm"
|
||||
},
|
||||
{
|
||||
"emptyDir": {},
|
||||
"name": "pai-vol"
|
||||
},
|
||||
{
|
||||
"hostPath": {
|
||||
"path": "/var/log/pai"
|
||||
},
|
||||
"name": "host-log"
|
||||
},
|
||||
{
|
||||
"name": "job-ssh-secret-volume",
|
||||
"secret": {
|
||||
"secretName": "job-ssh-secret"
|
||||
}
|
||||
},
|
||||
{
|
||||
"configMap": {
|
||||
"name": "runtime-exit-spec-configuration"
|
||||
},
|
||||
"name": "job-exit-spec"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"podGracefulDeletionTimeoutSec": 1800,
|
||||
"retryPolicy": {
|
||||
"fancyRetryPolicy": false,
|
||||
"maxRetryCount": 0
|
||||
}
|
||||
},
|
||||
"taskNumber": 1
|
||||
},
|
||||
{
|
||||
"frameworkAttemptCompletionPolicy": {
|
||||
"minFailedTaskCount": 1,
|
||||
"minSucceededTaskCount": -1
|
||||
},
|
||||
"name": "taskrole1",
|
||||
"task": {
|
||||
"pod": {
|
||||
"metadata": {
|
||||
"annotations": {
|
||||
"container.apparmor.security.beta.kubernetes.io/app": "unconfined",
|
||||
"rest-server/port-scheduling-spec": "{\"schedulePortStart\":20000,\"schedulePortEnd\":40000,\"ports\":{\"mpi\":{\"count\":2},\"tensorflow\":{\"count\":1},\"ssh\":{\"count\":1},\"http\":{\"count\":1}}}"
|
||||
},
|
||||
"creationTimestamp": null,
|
||||
"labels": {
|
||||
"type": "kube-launcher-task",
|
||||
"userName": "test_user",
|
||||
"virtualCluster": "default"
|
||||
}
|
||||
},
|
||||
"spec": {
|
||||
"affinity": {
|
||||
"nodeAffinity": {
|
||||
"requiredDuringSchedulingIgnoredDuringExecution": {
|
||||
"nodeSelectorTerms": [
|
||||
{
|
||||
"matchExpressions": [
|
||||
{
|
||||
"key": "pai-worker",
|
||||
"operator": "In",
|
||||
"values": ["true"]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"containers": [
|
||||
{
|
||||
"command": ["/usr/local/pai/runtime"],
|
||||
"env": [
|
||||
{
|
||||
"name": "PAI_FRAMEWORK_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USER_NAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_DEFAULT_FS_URI"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole_1",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USERNAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASKS_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLES_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||
"value": "taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||
"valueFrom": {
|
||||
"fieldRef": {
|
||||
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_INDEX",
|
||||
"valueFrom": {
|
||||
"fieldRef": {
|
||||
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"image": "openpai/standard:python_3.6-pytorch_1.2.0-gpu",
|
||||
"imagePullPolicy": "Always",
|
||||
"name": "app",
|
||||
"resources": {
|
||||
"limits": {
|
||||
"cpu": "4",
|
||||
"github.com/fuse": "1",
|
||||
"memory": "8Gi",
|
||||
"nvidia.com/gpu": "1"
|
||||
}
|
||||
},
|
||||
"securityContext": {
|
||||
"capabilities": {
|
||||
"add": ["SYS_ADMIN", "IPC_LOCK", "DAC_READ_SEARCH"],
|
||||
"drop": ["MKNOD"]
|
||||
}
|
||||
},
|
||||
"terminationMessagePath": "/tmp/pai-termination-log",
|
||||
"volumeMounts": [
|
||||
{
|
||||
"mountPath": "/dev/shm",
|
||||
"name": "dshm"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai",
|
||||
"name": "pai-vol"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai/logs",
|
||||
"name": "host-log",
|
||||
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai/ssh-secret",
|
||||
"name": "job-ssh-secret-volume",
|
||||
"readOnly": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"hostNetwork": true,
|
||||
"imagePullSecrets": [
|
||||
{
|
||||
"name": "pai-secret"
|
||||
}
|
||||
],
|
||||
"initContainers": [
|
||||
{
|
||||
"env": [
|
||||
{
|
||||
"name": "USER_CMD",
|
||||
"value": "printenv"
|
||||
},
|
||||
{
|
||||
"name": "KUBE_APISERVER_ADDRESS",
|
||||
"value": "http://10.151.40.4:8080"
|
||||
},
|
||||
{
|
||||
"name": "GANG_ALLOCATION",
|
||||
"value": "true"
|
||||
},
|
||||
{
|
||||
"name": "PAI_FRAMEWORK_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_NAME",
|
||||
"value": "test_user~test"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USER_NAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_DEFAULT_FS_URI"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLE_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_RESOURCE_taskrole_1",
|
||||
"value": "1,4,8192,0"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_FAILED_TASK_COUNT_taskrole_1",
|
||||
"value": "1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole_1",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_USERNAME",
|
||||
"value": "test_user"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASKS_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_TASK_ROLES_NUM",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_COUNT",
|
||||
"value": "2"
|
||||
},
|
||||
{
|
||||
"name": "PAI_JOB_TASK_ROLE_LIST",
|
||||
"value": "taskrole,taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_NAME",
|
||||
"value": "taskrole_1"
|
||||
},
|
||||
{
|
||||
"name": "PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX",
|
||||
"valueFrom": {
|
||||
"fieldRef": {
|
||||
"fieldPath": "metadata.annotations['FC_TASK_INDEX']"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"image": "openpai/openpai-runtime:test_user",
|
||||
"imagePullPolicy": "Always",
|
||||
"name": "init",
|
||||
"resources": {},
|
||||
"volumeMounts": [
|
||||
{
|
||||
"mountPath": "/usr/local/pai",
|
||||
"name": "pai-vol"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai/logs",
|
||||
"name": "host-log",
|
||||
"subPath": "test_user/51b333b433467483e9e16fcff34ceeda/taskrole1"
|
||||
},
|
||||
{
|
||||
"mountPath": "/usr/local/pai-config",
|
||||
"name": "job-exit-spec"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"priorityClassName": "51b333b433467483e9e16fcff34ceeda-priority",
|
||||
"restartPolicy": "Never",
|
||||
"serviceAccountName": "runtime-account",
|
||||
"volumes": [
|
||||
{
|
||||
"emptyDir": {
|
||||
"medium": "Memory",
|
||||
"sizeLimit": "512Mi"
|
||||
},
|
||||
"name": "dshm"
|
||||
},
|
||||
{
|
||||
"emptyDir": {},
|
||||
"name": "pai-vol"
|
||||
},
|
||||
{
|
||||
"hostPath": {
|
||||
"path": "/var/log/pai"
|
||||
},
|
||||
"name": "host-log"
|
||||
},
|
||||
{
|
||||
"name": "job-ssh-secret-volume",
|
||||
"secret": {
|
||||
"secretName": "job-ssh-secret"
|
||||
}
|
||||
},
|
||||
{
|
||||
"configMap": {
|
||||
"name": "runtime-exit-spec-configuration"
|
||||
},
|
||||
"name": "job-exit-spec"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"podGracefulDeletionTimeoutSec": 1800,
|
||||
"retryPolicy": {
|
||||
"fancyRetryPolicy": false,
|
||||
"maxRetryCount": 0
|
||||
}
|
||||
},
|
||||
"taskNumber": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"attemptStatus": {
|
||||
"completionStatus": {
|
||||
"code": 0,
|
||||
"diagnostics": "Pod succeeded",
|
||||
"phrase": "Succeeded",
|
||||
"trigger": {
|
||||
"message": "All Tasks are completed and no user specified conditions in FrameworkAttemptCompletionPolicy have ever been triggered: TotalTaskCount: 2, FailedTaskCount: 0",
|
||||
"taskIndex": 0,
|
||||
"taskRoleName": "taskrole1"
|
||||
},
|
||||
"type": {
|
||||
"attributes": [],
|
||||
"name": "Succeeded"
|
||||
}
|
||||
},
|
||||
"completionTime": "2020-05-22T03:19:33Z",
|
||||
"configMapName": "51b333b433467483e9e16fcff34ceeda-attempt",
|
||||
"configMapUID": "f0f47860-9bda-11ea-830b-000d3ab25bb6",
|
||||
"id": 0,
|
||||
"instanceUID": "0_f0f47860-9bda-11ea-830b-000d3ab25bb6",
|
||||
"runTime": "2020-05-22T03:19:26Z",
|
||||
"startTime": "2020-05-22T03:18:43Z",
|
||||
"taskRoleStatuses": [
|
||||
{
|
||||
"name": "taskrole",
|
||||
"taskStatuses": [
|
||||
{
|
||||
"attemptStatus": {
|
||||
"completionStatus": {
|
||||
"code": 0,
|
||||
"diagnostics": "Pod succeeded",
|
||||
"phrase": "Succeeded",
|
||||
"pod": {
|
||||
"containers": [
|
||||
{
|
||||
"code": 0,
|
||||
"name": "init",
|
||||
"reason": "Completed"
|
||||
},
|
||||
{
|
||||
"code": 0,
|
||||
"name": "app",
|
||||
"reason": "Completed"
|
||||
}
|
||||
]
|
||||
},
|
||||
"type": {
|
||||
"attributes": [],
|
||||
"name": "Succeeded"
|
||||
}
|
||||
},
|
||||
"completionTime": "2020-05-22T03:19:32Z",
|
||||
"id": 0,
|
||||
"instanceUID": "0_f1020521-9bda-11ea-830b-000d3ab25bb6",
|
||||
"podHostIP": "10.151.41.8",
|
||||
"podIP": "10.151.41.8",
|
||||
"podName": "51b333b433467483e9e16fcff34ceeda-taskrole-0",
|
||||
"podNodeName": "10.151.41.8",
|
||||
"podUID": "f1020521-9bda-11ea-830b-000d3ab25bb6",
|
||||
"runTime": "2020-05-22T03:19:26Z",
|
||||
"startTime": "2020-05-22T03:18:43Z"
|
||||
},
|
||||
"completionTime": "2020-05-22T03:19:32Z",
|
||||
"index": 0,
|
||||
"retryPolicyStatus": {
|
||||
"accountableRetriedCount": 0,
|
||||
"retryDelaySec": null,
|
||||
"totalRetriedCount": 0
|
||||
},
|
||||
"startTime": "2020-05-22T03:18:43Z",
|
||||
"state": "Completed",
|
||||
"transitionTime": "2020-05-22T03:19:32Z"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "taskrole1",
|
||||
"taskStatuses": [
|
||||
{
|
||||
"attemptStatus": {
|
||||
"completionStatus": {
|
||||
"code": 0,
|
||||
"diagnostics": "Pod succeeded",
|
||||
"phrase": "Succeeded",
|
||||
"pod": {
|
||||
"containers": [
|
||||
{
|
||||
"code": 0,
|
||||
"name": "init",
|
||||
"reason": "Completed"
|
||||
},
|
||||
{
|
||||
"code": 0,
|
||||
"name": "app",
|
||||
"reason": "Completed"
|
||||
}
|
||||
]
|
||||
},
|
||||
"type": {
|
||||
"attributes": [],
|
||||
"name": "Succeeded"
|
||||
}
|
||||
},
|
||||
"completionTime": "2020-05-22T03:19:32Z",
|
||||
"id": 0,
|
||||
"instanceUID": "0_f102a4a7-9bda-11ea-830b-000d3ab25bb6",
|
||||
"podHostIP": "10.151.41.9",
|
||||
"podIP": "10.151.41.9",
|
||||
"podName": "51b333b433467483e9e16fcff34ceeda-taskrole1-0",
|
||||
"podNodeName": "10.151.41.9",
|
||||
"podUID": "f102a4a7-9bda-11ea-830b-000d3ab25bb6",
|
||||
"runTime": "2020-05-22T03:19:27Z",
|
||||
"startTime": "2020-05-22T03:18:43Z"
|
||||
},
|
||||
"completionTime": "2020-05-22T03:19:32Z",
|
||||
"index": 0,
|
||||
"retryPolicyStatus": {
|
||||
"accountableRetriedCount": 0,
|
||||
"retryDelaySec": null,
|
||||
"totalRetriedCount": 0
|
||||
},
|
||||
"startTime": "2020-05-22T03:18:43Z",
|
||||
"state": "Completed",
|
||||
"transitionTime": "2020-05-22T03:19:32Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"completionTime": "2020-05-22T03:19:33Z",
|
||||
"retryPolicyStatus": {
|
||||
"accountableRetriedCount": 0,
|
||||
"retryDelaySec": null,
|
||||
"totalRetriedCount": 0
|
||||
},
|
||||
"startTime": "2020-05-22T03:18:43Z",
|
||||
"state": "Completed",
|
||||
"transitionTime": "2020-05-22T03:19:33Z"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
# Copyright (c) Microsoft Corporation
|
||||
# All rights reserved.
|
||||
#
|
||||
# MIT License
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
|
||||
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
|
||||
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
|
||||
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
||||
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import json
|
||||
from io import StringIO
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
# pylint: disable=wrong-import-position
|
||||
sys.path.append(
|
||||
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src"))
|
||||
sys.path.append(
|
||||
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src/init.d"))
|
||||
from framework_parser import generate_runtime_env
|
||||
from common.utils import init_logger
|
||||
# pylint: enable=wrong-import-position
|
||||
|
||||
PACKAGE_DIRECTORY_COM = os.path.dirname(os.path.abspath(__file__))
|
||||
init_logger()
|
||||
|
||||
|
||||
class TestParser(unittest.TestCase):
    """Checks the environment exports printed by ``generate_runtime_env``."""

    def setUp(self):
        # Best effort only: the test opens its fixture through an absolute
        # path, so a failed chdir must not abort the run.
        try:
            os.chdir(PACKAGE_DIRECTORY_COM)
        except OSError:
            pass

    def test_generate_runtime_env(self):
        """Hashed port exports must match the values expected for the fixture."""
        # Local import keeps this block self-contained.
        from contextlib import redirect_stdout  #pylint: disable=import-outside-toplevel

        # Register cleanup right away so the variables are removed even when
        # an assertion or the parser itself fails.
        for key, value in (("FC_TASK_INDEX", "0"),
                           ("FC_TASKROLE_NAME", "taskrole")):
            os.environ[key] = value
            self.addCleanup(os.environ.pop, key, None)

        test_file = os.path.join(PACKAGE_DIRECTORY_COM, "framework.json")
        expect_lines = [
            "export PAI_PORT_LIST_taskrole_0_tcp='29877,22353,29076'",
            "export PAI_CONTAINER_HOST_PORT_LIST='tcp:29877,22353,29076;udp:31903,33486,35953;ssh:39080;http:30643;'",
            "export PAI_taskrole1_0_mpi_PORT='20966,21891'",
            "export PAI_CONTAINER_HOST_http_PORT_LIST='30643'",
            "export PAI_PORT_LIST_taskrole_0_udp='31903,33486,35953'",
            "export PAI_CONTAINER_SSH_PORT='39080'"
        ]
        with open(test_file, "r") as f:
            framework = json.load(f)

        # redirect_stdout restores the previous sys.stdout on exit, including
        # on exceptions, instead of forcing it back to sys.__stdout__.
        captured = StringIO()
        with redirect_stdout(captured):
            generate_runtime_env(framework)
        runtime_env = captured.getvalue().splitlines()

        for expect in expect_lines:
            self.assertIn(expect, runtime_env)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
Загрузка…
Ссылка в новой задаче