Mirror of https://github.com/Azure/acs-engine.git
[WIP] Add NVIDIA drivers for k8s linux agents (#989)
* [k8s] add NVIDIA drivers for linux agents
* update k8s+GPU documentation
* Allows different drivers install script for different skus
* add unit test for GPU script
* remove templates.go
* Add kubernetes GPU example template
Parent: 58626a9793
Commit: 456001c321
@@ -1,65 +1,75 @@

# Microsoft Azure Container Service Engine - Using GPUs with Kubernetes

If you created a Kubernetes cluster with one or more agent pools whose VM size is `Standard_NC*` or `Standard_NV*`, you can schedule GPU workloads on your cluster.
The NVIDIA drivers are automatically installed on every GPU agent in your cluster, so you don't need to install them manually, unless you require a specific version of the drivers. Currently, the installed driver is version 378.13.

To make sure everything is fine, run `kubectl describe node <name-of-a-gpu-node>`. You should see the correct number of GPUs reported (this example shows 2 GPUs for an NC12 VM):
```
[...]
Capacity:
 alpha.kubernetes.io/nvidia-gpu:  2
 cpu:                             12
 memory:                          115505744Ki
 pods:                            110
[...]
```
If `alpha.kubernetes.io/nvidia-gpu` is `0` and you just created the cluster, you might have to wait a little: the driver installation takes about 12 minutes, and the node might join the cluster before the installation is completed. After a few minutes the node should restart and report the correct number of GPUs.
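To poll for the capacity without reading the full `describe` output, a one-liner like this can be used (a sketch relying on standard `kubectl` custom-columns syntax, nothing acs-engine specific; the `\.` escapes are needed because the resource name itself contains dots):

```
kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.capacity.alpha\.kubernetes\.io/nvidia-gpu'
```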
## Running a GPU-enabled container
When running a GPU container, you will need to specify how many GPUs you want to use. If you don't specify a GPU count, Kubernetes will assume you don't require any, and will not map the device into the container.
You will also need to mount the drivers from the host (the Kubernetes agent) into the container.

On the host, the drivers are installed under `/usr/lib/nvidia-378`.

Here is an example template running TensorFlow:
```yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    app: tensorflow
  name: tensorflow
spec:
  template:
    metadata:
      labels:
        app: tensorflow
    spec:
      containers:
      - name: tensorflow
        image: tensorflow/tensorflow:latest-gpu
        command: ["python", "main.py"]
        imagePullPolicy: IfNotPresent
        env:
        - name: LD_LIBRARY_PATH
          value: /usr/lib/nvidia:/usr/lib/x86_64-linux-gnu
        resources:
          requests:
            alpha.kubernetes.io/nvidia-gpu: 2
        volumeMounts:
        - mountPath: /usr/local/nvidia/bin
          name: bin
        - mountPath: /usr/lib/nvidia
          name: lib
        - mountPath: /usr/lib/x86_64-linux-gnu/libcuda.so.1
          name: libcuda
      # The drivers are mounted from the host, where they live under /usr/lib/nvidia-378.
      volumes:
      - name: bin
        hostPath:
          path: /usr/lib/nvidia-378/bin
      - name: lib
        hostPath:
          path: /usr/lib/nvidia-378
      - name: libcuda
        hostPath:
          path: /usr/lib/x86_64-linux-gnu/libcuda.so.1
```
We specify `alpha.kubernetes.io/nvidia-gpu: 2` in the resource requests, matching the example above, and we mount the drivers from the host into the container.
Note that we also modify the `LD_LIBRARY_PATH` environment variable so that Python knows where to find the driver's libraries.

Some libraries, such as `libcuda.so`, are installed under `/usr/lib/x86_64-linux-gnu` on the host; depending on your needs, you might have to mount them separately, as shown above.
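To quickly check that a container can actually see the device, a throwaway pod along the following lines can help. This is a hedged sketch, not part of this commit: the pod name is made up, and it reuses the same hostPath mounts as the deployment above so that `nvidia-smi` and its libraries resolve.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-smi-test   # hypothetical name, used only for this check
spec:
  restartPolicy: Never
  containers:
  - name: nvidia-smi
    image: ubuntu:16.04
    # Print the GPUs visible inside the container, then exit.
    command: ["/usr/local/nvidia/bin/nvidia-smi"]
    env:
    - name: LD_LIBRARY_PATH
      value: /usr/lib/nvidia
    resources:
      requests:
        alpha.kubernetes.io/nvidia-gpu: 1
    volumeMounts:
    - mountPath: /usr/local/nvidia/bin
      name: bin
    - mountPath: /usr/lib/nvidia
      name: lib
  volumes:
  - name: bin
    hostPath:
      path: /usr/lib/nvidia-378/bin
  - name: lib
    hostPath:
      path: /usr/lib/nvidia-378
```

`kubectl logs nvidia-smi-test` should then report the same GPU count as the node's capacity.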
@@ -0,0 +1,35 @@

```
{
  "apiVersion": "vlabs",
  "properties": {
    "orchestratorProfile": {
      "orchestratorType": "Kubernetes"
    },
    "masterProfile": {
      "count": 1,
      "dnsPrefix": "",
      "vmSize": "Standard_D2_v2"
    },
    "agentPoolProfiles": [
      {
        "name": "agentpool1",
        "count": 3,
        "vmSize": "Standard_NC6",
        "availabilityProfile": "AvailabilitySet"
      }
    ],
    "linuxProfile": {
      "adminUsername": "azureuser",
      "ssh": {
        "publicKeys": [
          {
            "keyData": ""
          }
        ]
      }
    },
    "servicePrincipalProfile": {
      "clientId": "",
      "secret": ""
    }
  }
}
```
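To try this template, the usual acs-engine flow should apply (a sketch: `kubernetes-gpu.json` is a hypothetical filename, and `dnsPrefix`, `keyData`, and the service principal fields must be filled in first):

```
acs-engine generate kubernetes-gpu.json
az group deployment create \
    --resource-group <your-resource-group> \
    --template-file _output/<dnsPrefix>/azuredeploy.json \
    --parameters _output/<dnsPrefix>/azuredeploy.parameters.json
```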
@@ -176,6 +176,7 @@ runcmd:

```
- systemctl restart docker
- mkdir -p /etc/kubernetes/manifests
- usermod -aG docker {{WrapAsVariable "username"}}
{{GetGPUDriversInstallScript .}}
- echo `date`,`hostname`, PRE-APT-SYSTEMD-DAILY>>/opt/m
- /usr/lib/apt/apt.systemd.daily
- echo `date`,`hostname`, POST-APT-SYSTEMD-DAILY>>/opt/m
```
@@ -1069,6 +1069,9 @@ func (t *TemplateGenerator) getTemplateFuncMap(cs *api.ContainerService) templat

```
			}
			return false
		},
		"GetGPUDriversInstallScript": func(profile *api.AgentPoolProfile) string {
			return getGPUDriversInstallScript(profile)
		},
		"HasLinuxSecrets": func() bool {
			return cs.Properties.LinuxProfile.HasSecrets()
		},
```
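The FuncMap entry above is what makes the `{{GetGPUDriversInstallScript .}}` call in the cloud-init template resolve. As a minimal, self-contained illustration of that mechanism (plain `text/template` with a made-up function, not acs-engine code):

```go
package main

import (
	"os"
	"text/template"
)

func main() {
	// A custom function registered in the FuncMap becomes callable by name
	// from the template body; acs-engine resolves GetGPUDriversInstallScript
	// in its agent customdata template the same way.
	funcs := template.FuncMap{
		"Greet": func(name string) string { return "hello, " + name },
	}
	t := template.Must(template.New("demo").Funcs(funcs).Parse(`{{Greet "gpu-agent"}}`))
	_ = t.Execute(os.Stdout, nil) // prints: hello, gpu-agent
}
```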
@@ -1297,6 +1300,58 @@ func getPackageGUID(orchestratorType string, orchestratorVersion string, masterC

```
	return ""
}

func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {

	// Latest version of the drivers. Later this parameter could be bubbled up so that users can choose specific driver versions.
	dv := "384"

	/*
		First we remove the nouveau drivers, which are the open-source drivers for NVIDIA cards. Nouveau is installed on NV-series VMs by default.
		Then we add the graphics-drivers PPA repository and get the proprietary drivers from there.
	*/
	ppaScript := fmt.Sprintf(`- rmmod nouveau
- sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf"
- update-initramfs -u
- sudo add-apt-repository -y ppa:graphics-drivers
- sudo apt-get update
- sudo apt-get install -y nvidia-%s`, dv)

	// We don't have an agreement in place with NVIDIA to provide the drivers on every sku. For these VMs we simply log a warning message.
	na := getGPUDriversNotInstalledWarningMessage(profile.VMSize)

	/*
		If a new GPU sku becomes available, add a key to this map, but only provide an installation script if you have confirmation
		that we have an agreement with NVIDIA for this specific GPU. Otherwise use the warning message.
	*/
	dm := map[string]string{
		"Standard_NC6":      ppaScript,
		"Standard_NC12":     ppaScript,
		"Standard_NC24":     ppaScript,
		"Standard_NC24r":    ppaScript,
		"Standard_NV6":      ppaScript,
		"Standard_NV12":     ppaScript,
		"Standard_NV24":     ppaScript,
		"Standard_NV24r":    ppaScript,
		"Standard_NC6_v2":   na,
		"Standard_NC12_v2":  na,
		"Standard_NC24_v2":  na,
		"Standard_NC24r_v2": na,
		"Standard_ND6":      na,
		"Standard_ND12":     na,
		"Standard_ND24":     na,
		"Standard_ND24r":    na,
	}
	if _, ok := dm[profile.VMSize]; ok {
		return dm[profile.VMSize]
	}

	// The VM is not part of the GPU skus, no extra steps.
	return ""
}

func getGPUDriversNotInstalledWarningMessage(VMSize string) string {
	return fmt.Sprintf("echo 'Warning: NVIDIA Drivers for this VM SKU (%v) are not automatically installed'", VMSize)
}

func getDCOSCustomDataPublicIPStr(orchestratorType string, masterCount int) string {
	if orchestratorType == api.DCOS {
		var buf bytes.Buffer
```
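For any of the skus mapped to `ppaScript` (say `Standard_NC6`), the lines injected into the agent's cloud-init `runcmd` are just the formatted string above, with `%s` replaced by the driver version:

```
- rmmod nouveau
- sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf"
- update-initramfs -u
- sudo add-apt-repository -y ppa:graphics-drivers
- sudo apt-get update
- sudo apt-get install -y nvidia-384
```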
@@ -317,3 +317,50 @@ func TestTemplateOutputPresence(t *testing.T) {

```
		}
	}
}

func TestGetGPUDriversInstallScript(t *testing.T) {

	// VM sizes with a GPU and an NVIDIA agreement for driver distribution
	validSkus := []string{
		"Standard_NC6",
		"Standard_NC12",
		"Standard_NC24",
		"Standard_NC24r",
		"Standard_NV6",
		"Standard_NV12",
		"Standard_NV24",
		"Standard_NV24r",
	}

	// VM sizes with a GPU but NO NVIDIA agreement for driver distribution
	noLicenceSkus := []string{
		"Standard_NC6_v2",
		"Standard_NC12_v2",
		"Standard_NC24_v2",
		"Standard_NC24r_v2",
		"Standard_ND6",
		"Standard_ND12",
		"Standard_ND24",
		"Standard_ND24r",
	}

	for _, sku := range validSkus {
		s := getGPUDriversInstallScript(&api.AgentPoolProfile{VMSize: sku})
		if s == "" || s == getGPUDriversNotInstalledWarningMessage(sku) {
			t.Fatalf("Expected NVIDIA driver install script for sku %v", sku)
		}
	}

	for _, sku := range noLicenceSkus {
		s := getGPUDriversInstallScript(&api.AgentPoolProfile{VMSize: sku})
		if s != getGPUDriversNotInstalledWarningMessage(sku) {
			t.Fatalf("NVIDIA driver install script was provided for a VM sku (%v) that does not meet the NVIDIA agreement.", sku)
		}
	}

	// VM size without a GPU
	s := getGPUDriversInstallScript(&api.AgentPoolProfile{VMSize: "Standard_D2_v2"})
	if s != "" {
		t.Fatalf("VM size without GPU should not receive a script; expected empty string, received %v", s)
	}
}
```
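Assuming the usual Go workflow, and that these functions live in the engine package (`pkg/acsengine` in this repo's layout; verify the path in your checkout), the new test can be run in isolation:

```
go test ./pkg/acsengine -run TestGetGPUDriversInstallScript -v
```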