[Hived]: Per VC queuing to avoid cross VC starvation (#4041)

This commit is contained in:
Yuqi Wang 2019-12-20 12:22:14 +08:00 коммит произвёл GitHub
Родитель 317d0c4dfe
Коммит 44f27bb666
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 69 добавлений и 43 удалений

Просмотреть файл

@@ -15,6 +15,7 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import yaml
class Hivedscheduler:
def __init__(self, cluster_conf, service_conf, default_service_conf):
@@ -26,10 +27,12 @@ class Hivedscheduler:
return False, 'webservice-port is missing in hivedscheduler service configuration'
if 'config' not in self.service_conf:
self.service_conf['config'] = ''
# return False, 'hived scheduler config is missing'
return True, None
def run(self):
self.service_conf['structured-config'] = {}
if self.service_conf['config'] != '':
self.service_conf['structured-config'] = yaml.load(self.service_conf['config'], yaml.SafeLoader)
machine_list = self.cluster_conf['machine-list']
master_ip = [host['hostip'] for host in machine_list if host.get('pai-master') == 'true'][0]
self.service_conf['webservice'] = 'http://{}:{}'.format(master_ip, self.service_conf['webservice-port'])

Просмотреть файл

@@ -21,27 +21,13 @@ metadata:
name: hivedscheduler-config
namespace: default
data:
config.yaml: |
apiVersion: kubescheduler.config.k8s.io/v1alpha1
kind: KubeSchedulerConfiguration
schedulerName: hivedscheduler
disablePreemption: false
algorithmSource:
policy:
configMap:
name: hivedscheduler-config
namespace: default
leaderElection:
leaderElect: false
lockObjectName: hivedscheduler
lockObjectNamespace: default
policy.cfg : |
{
"kind": "Policy",
"apiVersion": "v1",
"extenders": [
{
"urlPrefix": "http://localhost:30096/v1/extender",
"urlPrefix": "{{ cluster_cfg['hivedscheduler']['webservice'] }}/v1/extender",
"filterVerb": "filter",
"preemptVerb": "preempt",
"bindVerb": "bind",

Просмотреть файл

@@ -21,7 +21,7 @@ metadata:
name: hivedscheduler-service
spec:
selector:
app: hivedscheduler
app: hivedscheduler-hs
type: NodePort
ports:
- protocol: TCP

Просмотреть файл

@@ -18,35 +18,23 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: hivedscheduler-sts
name: hivedscheduler-hs
namespace: default
spec:
serviceName: hivedscheduler
serviceName: hivedscheduler-hs
selector:
matchLabels:
app: hivedscheduler
app: hivedscheduler-hs
replicas: 1
template:
metadata:
labels:
app: hivedscheduler
app: hivedscheduler-hs
spec:
nodeSelector:
pai-master: "true"
serviceAccountName: hivedscheduler-account
containers:
- name: defaultscheduler
image: gcr.io/google_containers/kube-scheduler:v1.14.2
command: [
"/usr/local/bin/kube-scheduler",
{%- if cluster_cfg['cluster']['common']['k8s-rbac'] != 'true' %}
"--master={{ cluster_cfg['layout']['kubernetes']['api-servers-url'] }}",
{%- endif %}
"--config=/hivedscheduler-config/config.yaml",
"--feature-gates=PodPriority=true",
"--leader-elect=false",
"--v=4"]
volumeMounts:
- name: hivedscheduler-config
mountPath: /hivedscheduler-config
- name: hivedscheduler
image: hivedscheduler/hivedscheduler:v0.2.5
command: [
@@ -59,9 +47,56 @@ spec:
value: "{{ cluster_cfg['layout']['kubernetes']['api-servers-url'] }}"
{%- endif %}
volumeMounts:
- name: hivedscheduler-config
mountPath: /hivedscheduler-config
- name: hivedscheduler-config
mountPath: /hivedscheduler-config
volumes:
- name: hivedscheduler-config
configMap:
name: hivedscheduler-config
{%- for vc in cluster_cfg['hivedscheduler']['structured-config']['virtualClusters'] %}
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: hivedscheduler-ds-{{ vc }}
namespace: default
spec:
serviceName: hivedscheduler-ds-{{ vc }}
selector:
matchLabels:
app: hivedscheduler-ds-{{ vc }}
replicas: 1
template:
metadata:
labels:
app: hivedscheduler-ds-{{ vc }}
spec:
nodeSelector:
pai-master: "true"
serviceAccountName: hivedscheduler-account
containers:
- name: defaultscheduler
image: gcr.io/google_containers/kube-scheduler:v1.14.2
command: [
"sh", "-c",
"echo \"apiVersion: kubescheduler.config.k8s.io/v1alpha1\" >> config.yaml &&
echo \"kind: KubeSchedulerConfiguration\" >> config.yaml &&
echo \"schedulerName: hivedscheduler-ds-{{ vc }}\" >> config.yaml &&
echo \"disablePreemption: false\" >> config.yaml &&
echo \"algorithmSource:\" >> config.yaml &&
echo \" policy:\" >> config.yaml &&
echo \" configMap:\" >> config.yaml &&
echo \" name: hivedscheduler-config\" >> config.yaml &&
echo \" namespace: default\" >> config.yaml &&
echo \"leaderElection:\" >> config.yaml &&
echo \" leaderElect: false\" >> config.yaml &&
/usr/local/bin/kube-scheduler
{%- if cluster_cfg['cluster']['common']['k8s-rbac'] != 'true' %}
--master={{ cluster_cfg['layout']['kubernetes']['api-servers-url'] }}
{%- endif %}
--config=config.yaml
--feature-gates=PodPriority=true
--leader-elect=false
--v=4"]
{%- endfor %}

Просмотреть файл

@@ -35,6 +35,3 @@ stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
upgraded-script: upgraded.sh
deploy-rules:
- in: pai-master

Просмотреть файл

@@ -28,7 +28,7 @@ kubectl apply --overwrite=true -f hivedscheduler.yaml || exit $?
sleep 10
# Wait until the service is ready.
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.monitorTool.check_pod_ready_status -w -k app -v hivedscheduler || exit $?
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.monitorTool.check_pod_ready_status -w -k app -v hivedscheduler-hs || exit $?
{% endif %}

Просмотреть файл

@@ -21,8 +21,13 @@ pushd $(dirname "$0") > /dev/null
{% if cluster_cfg['hivedscheduler']['config']|length > 1 %}
{% for vc in cluster_cfg['hivedscheduler']['structured-config']['virtualClusters'] %}
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
--operation delete --resource statefulset --name hivedscheduler-sts
--operation delete --resource statefulset --name hivedscheduler-ds-{{ vc }}
{% endfor %}
PYTHONPATH="../../../deployment" python -m k8sPaiLibrary.maintaintool.update_resource \
--operation delete --resource statefulset --name hivedscheduler-hs
if kubectl get service | grep -q "hivedscheduler-service"; then
kubectl delete service hivedscheduler-service || exit $?

Просмотреть файл

@@ -559,7 +559,7 @@ const generateTaskRole = (frameworkName, taskRole, labels, config, storageConfig
};
// hived spec
if (launcherConfig.enabledHived) {
frameworkTaskRole.task.pod.spec.schedulerName = launcherConfig.scheduler;
frameworkTaskRole.task.pod.spec.schedulerName = `${launcherConfig.scheduler}-ds-${config.taskRoles[taskRole].hivedPodSpec.virtualCluster}`;
delete frameworkTaskRole.task.pod.spec.containers[0].resources.limits['nvidia.com/gpu'];
frameworkTaskRole.task.pod.spec.containers[0]