зеркало из https://github.com/microsoft/pai.git
[Hived]: Per-VC queuing to avoid cross-VC starvation (#4041)
This commit is contained in:
Родитель
317d0c4dfe
Коммит
44f27bb666
|
@ -15,6 +15,7 @@
|
|||
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import yaml
|
||||
|
||||
class Hivedscheduler:
|
||||
def __init__(self, cluster_conf, service_conf, default_service_conf):
|
||||
|
@ -26,10 +27,12 @@ class Hivedscheduler:
|
|||
return False, 'webservice-port is missing in hivedscheduler service configuration'
|
||||
if 'config' not in self.service_conf:
|
||||
self.service_conf['config'] = ''
|
||||
# return False, 'hived scheduler config is missing'
|
||||
return True, None
|
||||
|
||||
def run(self):
|
||||
self.service_conf['structured-config'] = {}
|
||||
if self.service_conf['config'] != '':
|
||||
self.service_conf['structured-config'] = yaml.load(self.service_conf['config'], yaml.SafeLoader)
|
||||
machine_list = self.cluster_conf['machine-list']
|
||||
master_ip = [host['hostip'] for host in machine_list if host.get('pai-master') == 'true'][0]
|
||||
self.service_conf['webservice'] = 'http://{}:{}'.format(master_ip, self.service_conf['webservice-port'])
|
||||
|
|
|
@ -21,27 +21,13 @@ metadata:
|
|||
name: hivedscheduler-config
|
||||
namespace: default
|
||||
data:
|
||||
config.yaml: |
|
||||
apiVersion: kubescheduler.config.k8s.io/v1alpha1
|
||||
kind: KubeSchedulerConfiguration
|
||||
schedulerName: hivedscheduler
|
||||
disablePreemption: false
|
||||
algorithmSource:
|
||||
policy:
|
||||
configMap:
|
||||
name: hivedscheduler-config
|
||||
namespace: default
|
||||
leaderElection:
|
||||
leaderElect: false
|
||||
lockObjectName: hivedscheduler
|
||||
lockObjectNamespace: default
|
||||
policy.cfg : |
|
||||
{
|
||||
"kind": "Policy",
|
||||
"apiVersion": "v1",
|
||||
"extenders": [
|
||||
{
|
||||
"urlPrefix": "http://localhost:30096/v1/extender",
|
||||
"urlPrefix": "{{ cluster_cfg['hivedscheduler']['webservice'] }}/v1/extender",
|
||||
"filterVerb": "filter",
|
||||
"preemptVerb": "preempt",
|
||||
"bindVerb": "bind",
|
||||
|
|
|
@ -21,7 +21,7 @@ metadata:
|
|||
name: hivedscheduler-service
|
||||
spec:
|
||||
selector:
|
||||
app: hivedscheduler
|
||||
app: hivedscheduler-hs
|
||||
type: NodePort
|
||||
ports:
|
||||
- protocol: TCP
|
||||
|
|
|
@ -18,35 +18,23 @@
|
|||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: hivedscheduler-sts
|
||||
name: hivedscheduler-hs
|
||||
namespace: default
|
||||
spec:
|
||||
serviceName: hivedscheduler
|
||||
serviceName: hivedscheduler-hs
|
||||
selector:
|
||||
matchLabels:
|
||||
app: hivedscheduler
|
||||
app: hivedscheduler-hs
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: hivedscheduler
|
||||
app: hivedscheduler-hs
|
||||
spec:
|
||||
nodeSelector:
|
||||
pai-master: "true"
|
||||
serviceAccountName: hivedscheduler-account
|
||||
containers:
|
||||
- name: defaultscheduler
|
||||
image: gcr.io/google_containers/kube-scheduler:v1.14.2
|
||||
command: [
|
||||
"/usr/local/bin/kube-scheduler",
|
||||
{%- if cluster_cfg['cluster']['common']['k8s-rbac'] != 'true' %}
|
||||
"--master={{ cluster_cfg['layout']['kubernetes']['api-servers-url'] }}",
|
||||
{%- endif %}
|
||||
"--config=/hivedscheduler-config/config.yaml",
|
||||
"--feature-gates=PodPriority=true",
|
||||
"--leader-elect=false",
|
||||
"--v=4"]
|
||||
volumeMounts:
|
||||
- name: hivedscheduler-config
|
||||
mountPath: /hivedscheduler-config
|
||||
- name: hivedscheduler
|
||||
image: hivedscheduler/hivedscheduler:v0.2.5
|
||||
command: [
|
||||
|
@ -59,9 +47,56 @@ spec:
|
|||
value: "{{ cluster_cfg['layout']['kubernetes']['api-servers-url'] }}"
|
||||
{%- endif %}
|
||||
volumeMounts:
|
||||
- name: hivedscheduler-config
|
||||
mountPath: /hivedscheduler-config
|
||||
- name: hivedscheduler-config
|
||||
mountPath: /hivedscheduler-config
|
||||
volumes:
|
||||
- name: hivedscheduler-config
|
||||
configMap:
|
||||
name: hivedscheduler-config
|
||||
|
||||
{%- for vc in cluster_cfg['hivedscheduler']['structured-config']['virtualClusters'] %}
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: hivedscheduler-ds-{{ vc }}
|
||||
namespace: default
|
||||
spec:
|
||||
serviceName: hivedscheduler-ds-{{ vc }}
|
||||
selector:
|
||||
matchLabels:
|
||||
app: hivedscheduler-ds-{{ vc }}
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: hivedscheduler-ds-{{ vc }}
|
||||
spec:
|
||||
nodeSelector:
|
||||
pai-master: "true"
|
||||
serviceAccountName: hivedscheduler-account
|
||||
containers:
|
||||
- name: defaultscheduler
|
||||
image: gcr.io/google_containers/kube-scheduler:v1.14.2
|
||||
command: [
|
||||
"sh", "-c",
|
||||
"echo \"apiVersion: kubescheduler.config.k8s.io/v1alpha1\" >> config.yaml &&
|
||||
echo \"kind: KubeSchedulerConfiguration\" >> config.yaml &&
|
||||
echo \"schedulerName: hivedscheduler-ds-{{ vc }}\" >> config.yaml &&
|
||||
echo \"disablePreemption: false\" >> config.yaml &&
|
||||
echo \"algorithmSource:\" >> config.yaml &&
|
||||
echo \" policy:\" >> config.yaml &&
|
||||
echo \" configMap:\" >> config.yaml &&
|
||||
echo \" name: hivedscheduler-config\" >> config.yaml &&
|
||||
echo \" namespace: default\" >> config.yaml &&
|
||||
echo \"leaderElection:\" >> config.yaml &&
|
||||
echo \" leaderElect: false\" >> config.yaml &&
|
||||
/usr/local/bin/kube-scheduler
|
||||
{%- if cluster_cfg['cluster']['common']['k8s-rbac'] != 'true' %}
|
||||
--master={{ cluster_cfg['layout']['kubernetes']['api-servers-url'] }}
|
||||
{%- endif %}
|
||||
--config=config.yaml
|
||||
--feature-gates=PodPriority=true
|
||||
--leader-elect=false
|
||||
--v=4"]
|
||||
{%- endfor %}
|
|
@ -35,6 +35,3 @@ stop-script: stop.sh
|
|||
delete-script: delete.sh
|
||||
refresh-script: refresh.sh
|
||||
upgraded-script: upgraded.sh
|
||||
|
||||
deploy-rules:
|
||||
- in: pai-master
|
|
@ -28,7 +28,7 @@ kubectl apply --overwrite=true -f hivedscheduler.yaml || exit $?
|
|||
|
||||
sleep 10
|
||||
# Wait until the service is ready.
|
||||
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.monitorTool.check_pod_ready_status -w -k app -v hivedscheduler || exit $?
|
||||
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.monitorTool.check_pod_ready_status -w -k app -v hivedscheduler-hs || exit $?
|
||||
|
||||
{% endif %}
|
||||
|
||||
|
|
|
@ -21,8 +21,13 @@ pushd $(dirname "$0") > /dev/null
|
|||
|
||||
{% if cluster_cfg['hivedscheduler']['config']|length > 1 %}
|
||||
|
||||
{% for vc in cluster_cfg['hivedscheduler']['structured-config']['virtualClusters'] %}
|
||||
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
|
||||
--operation delete --resource statefulset --name hivedscheduler-sts
|
||||
--operation delete --resource statefulset --name hivedscheduler-ds-{{ vc }}
|
||||
{% endfor %}
|
||||
|
||||
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
|
||||
--operation delete --resource statefulset --name hivedscheduler-hs
|
||||
|
||||
if kubectl get service | grep -q "hivedscheduler-service"; then
|
||||
kubectl delete service hivedscheduler-service || exit $?
|
||||
|
|
|
@ -559,7 +559,7 @@ const generateTaskRole = (frameworkName, taskRole, labels, config, storageConfig
|
|||
};
|
||||
// hived spec
|
||||
if (launcherConfig.enabledHived) {
|
||||
frameworkTaskRole.task.pod.spec.schedulerName = launcherConfig.scheduler;
|
||||
frameworkTaskRole.task.pod.spec.schedulerName = `${launcherConfig.scheduler}-ds-${config.taskRoles[taskRole].hivedPodSpec.virtualCluster}`;
|
||||
|
||||
delete frameworkTaskRole.task.pod.spec.containers[0].resources.limits['nvidia.com/gpu'];
|
||||
frameworkTaskRole.task.pod.spec.containers[0]
|
||||
|
|
Загрузка…
Ссылка в новой задаче