Simplify config using Azure config VM, add ACS deployment document

Sanjeev Mehrotra 2017-09-18 12:12:46 -07:00
Parent 9011474725
Commit 6574f17822
5 changed files with 588 additions and 420 deletions

View File

@@ -0,0 +1,63 @@
# Deploy DL workspace cluster on Azure Container Service (ACS)
This document describes the procedure to deploy a DL workspace cluster on ACS. We are still improving the deployment procedure on ACS; please contact the authors if you encounter any issues.
0. Follow [this document](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) to install the Azure CLI and log in to your Azure subscription on your dev machine.
1. Clone this repo.
2. Go to the src/ClusterBootstrap directory inside the repo.
3. Create a configuration file named "config.yaml" (a filled-in example is shown at the end of this document):
```
cluster_name: [your cluster name]
cluster_location : [your cluster location - e.g., northcentralus]
worker_node_num : [number of agent nodes for the ACS cluster]
master_node_num : [number of master nodes for the ACS cluster]
acsagentsize : [size of VM for agent nodes - e.g., Standard_NC12]
azstoragesku: [sku for Azure storage account - e.g., Standard_LRS]
azfilesharequota: [quota for fileshare in GB - e.g., 2048]
```
4. To start and deploy the cluster, run:
```
./deploy.py acs
```
The deployment script executes the following commands (you do not need to run them individually if you run step 4 directly):
1. Set up a basic Kubernetes (K8s) cluster on ACS:
```
./deploy.py acs deploy
```
2. Label nodes and deploy services:
```
./deploy.py acs postdeploy
```
3. Mount storage on nodes:
```
./deploy.py acs storagemount
```
4. Install GPU drivers on nodes (if needed):
```
./deploy.py acs gpudrivers
```
5. Install the network plugin:
```
./deploy.py acs freeflow
```
6. Build the needed Docker images and configuration files for restfulapi, jobmanager, and webui:
```
./deploy.py acs bldwebui
```
7. Start the DLWorkspace pods:
```
./deploy.py acs restartwebui
```
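
For reference, a filled-in "config.yaml" might look like the following. The cluster name and node counts are illustrative and should be replaced with your own values; the location, VM size, storage SKU, and quota reuse the example values from step 3:
```
cluster_name: mydlws
cluster_location: northcentralus
worker_node_num: 2
master_node_num: 1
acsagentsize: Standard_NC12
azstoragesku: Standard_LRS
azfilesharequota: 2048
```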

380
src/ClusterBootstrap/acs_tools.py Executable file
View File

@@ -0,0 +1,380 @@
#!/usr/bin/python
# Tools to build ACS cluster
import sys
import os
import subprocess
import yaml
import re
import numbers
sys.path.append("../utils")
import utils
import az_tools
# AZ ACS commands
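# az_cmd runs an "az" CLI command and returns its output parsed via yaml.load; az_sys just runs it without capturing output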
def az_cmd(cmd):
if verbose:
print "az "+cmd
output = subprocess.check_output("az "+cmd, shell=True)
return yaml.load(output)
def az_sys(cmd):
if verbose:
print "az "+cmd
os.system("az "+cmd)
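# Retry an az command until stopFn() returns True (used below when installing kubectl and fetching credentials)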
def az_tryuntil(cmd, stopFn, waitPeriod=5):
return utils.tryuntil(lambda : az_sys(cmd), stopFn, lambda : (), waitPeriod)
# Create the SQL server, open its firewall, and create the database
def az_create_sql_server():
# escape the password in case it has characters such as "$"
pwd = utils.shellquote(config["sqlserver-password"])
cmd = "sql server create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --location=%s" % config["cluster_location"]
cmd += " --name=%s" % config["azure-sqlservername"]
cmd += " --admin-user=%s" % config["sqlserver-username"]
cmd += " --admin-password=%s" % pwd
az_sys(cmd)
# now open firewall
cmd = "sql server firewall-rule create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --server=%s" % config["azure-sqlservername"]
# first open all IPs
cmd2 = cmd + " --name=All --start-ip-address=0.0.0.0 --end-ip-address=255.255.255.255"
az_sys(cmd2)
# now allow Azure services (the 0.0.0.0 - 0.0.0.0 range is Azure's convention for this)
cmd2 = cmd + " --name=Azure --start-ip-address=0.0.0.0 --end-ip-address=0.0.0.0"
az_sys(cmd2)
def az_create_sql_database(dbname):
cmd = "sql db create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --server=%s" % config["azure-sqlservername"]
cmd += " --name=%s" % dbname
az_sys(cmd)
def az_create_sql():
az_create_sql_server()
az_create_sql_database(config["sqlserver-database"])
def az_grp_exist(grpname):
resgrp = az_cmd("group show --name=%s" % grpname)
return not resgrp is None
# Overwrite resource group with location where machines are located
# If no machines are found, that may be because they are not created, so leave it as it is
def acs_set_resource_grp(exitIfNotFound):
config["acs_resource_group"] = config["resource_group"] # where container service resides
bFoundMachines = False
if (az_grp_exist(config["resource_group"])):
machines = az_cmd("vm list --resource-group=%s" % config["resource_group"])
if (len(machines) > 0):
bFoundMachines = True
if not bFoundMachines:
# try child resource group
tryGroup = "%s_%s_%s" % (config["resource_group"], config["cluster_name"], config["cluster_location"])
print "Grp %s has no matchines trying %s" % (config["resource_group"], tryGroup)
if (az_grp_exist(tryGroup)):
machines = az_cmd("vm list --resource-group=%s" % tryGroup)
if (len(machines) > 0):
# overwrite with group where machines are located
config["resource_group"] = tryGroup
bFoundMachines = True
if not bFoundMachines and exitIfNotFound:
print "No machines found -- quitting"
exit()
print "Resource group = %s" % config["resource_group"]
def acs_get_id(elem):
elemFullName = elem["id"]
reMatch = re.match('(.*)/(.*)', elemFullName)
return reMatch.group(2)
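# Look up the IP address of a named public-ip resource in the current resource group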
def acs_get_ip(ipaddrName):
ipInfo = az_cmd("network public-ip show --resource-group="+config["resource_group"]+" --name="+ipaddrName)
return ipInfo["ipAddress"]
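# Attach a DNS label to a node's public IP; the label defaults to the node name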
def acs_attach_dns_to_node(node, dnsName=None):
nodeName = config["nodenames_from_ip"][node]
if (dnsName is None):
dnsName = nodeName
ipName = config["acsnodes"][nodeName]["publicipname"]
cmd = "network public-ip update"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --name=%s" % ipName
cmd += " --dns-name=%s" % dnsName
az_sys(cmd)
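# Walk a VM's NICs and IP configurations and return the first one with a public IP attached
# (or the first NIC/ipconfig with no public IP if none is found)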
def acs_get_machineIP(machineName):
print "Machine: "+machineName
nics = az_cmd("vm show --name="+machineName+" --resource-group="+config["resource_group"])
#print nics
nics = nics["networkProfile"]["networkInterfaces"]
i = 0
for nic in nics:
nicName = acs_get_id(nic)
print "Nic Name: "+nicName
if (i==0):
nicDefault = nicName
ipconfigs = az_cmd("network nic show --resource-group="+config["resource_group"]+" --name="+nicName)
ipConfigs = ipconfigs["ipConfigurations"]
j = 0
for ipConfig in ipConfigs:
ipConfigName = acs_get_id(ipConfig)
print "IP Config Name: "+ipConfigName
if ((i==0) and (j==0)):
ipConfigDefault = ipConfigName
configInfo = az_cmd("network nic ip-config show --resource-group="+config["resource_group"]+
" --nic-name="+nicName+" --name="+ipConfigName)
publicIP = configInfo["publicIpAddress"]
if (not (publicIP is None)):
ipName = acs_get_id(publicIP)
print "IP Name: " + ipName
return {"nic" : nicName, "ipconfig" : ipConfigName, "publicipname" : ipName, "publicip" : acs_get_ip(ipName)}
j+=1
i+=1
return {"nic" : nicDefault, "ipconfig": ipConfigDefault, "publicipname" : None, "publicip" : None}
def acs_get_nodes():
binary = os.path.abspath('./deploy/bin/kubectl')
kubeconfig = os.path.abspath('./deploy/'+config["acskubeconfig"])
cmd = binary + ' -o=json --kubeconfig='+kubeconfig+' get nodes'
nodeInfo = utils.subproc_runonce(cmd)
nodes = yaml.load(nodeInfo)
return nodes["items"]
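# Collect public-IP information for every node; with bCreateIP, create and attach a dynamic public IP wherever one is missing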
def acs_get_machinesAndIPs(bCreateIP):
# Public IP on worker nodes
nodes = acs_get_nodes()
ipInfo = {}
#print nodes["items"]
config["nodenames_from_ip"] = {}
for n in nodes:
machineName = n["metadata"]["name"]
ipInfo[machineName] = acs_get_machineIP(machineName)
if bCreateIP and (ipInfo[machineName]["publicip"] is None):
# Create IP
ipName = machineName+"-public-ip-0"
print "Creating public-IP: "+ipName
cmd = "network public-ip create --allocation-method=Dynamic"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --name=%s" % ipName
cmd += " --location=%s" % config["cluster_location"]
az_sys(cmd)
# Add to NIC of machine
cmd = "network nic ip-config update"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --nic-name=%s" % ipInfo[machineName]["nic"]
cmd += " --name=%s" % ipInfo[machineName]["ipconfig"]
cmd += " --public-ip-address=%s" % ipName
az_sys(cmd)
# now update
ipInfo[machineName]["publicipname"] = ipName
ipInfo[machineName]["publicip"] = acs_get_ip(ipName)
config["nodenames_from_ip"][ipInfo[machineName]["publicip"]] = machineName
return ipInfo
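# Fast path: assume public IPs follow the '<nodename>-public-ip-0' naming convention rather than querying each NIC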
def acs_get_machinesAndIPsFast():
nodes = acs_get_nodes()
ipInfo = {}
config["nodenames_from_ip"] = {}
for n in nodes:
machineName = n["metadata"]["name"]
#print "MachineName: "+machineName
ipName = machineName+"-public-ip-0"
if (verbose):
print "PublicIP: "+ipName
ipInfo[machineName] = {}
ipInfo[machineName]["publicipname"] = ipName
ipInfo[machineName]["publicip"] = acs_get_ip(ipName)
config["nodenames_from_ip"][ipInfo[machineName]["publicip"]] = machineName
return ipInfo
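# A rule is considered to cover a port only if it is an inbound allow rule from any source over TCP (or any protocol)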
def acs_is_valid_nsg_rule(rule):
#print "Access: %s D: %s P: %s P: %s" % (rule["access"].lower()=="allow",
#rule["direction"].lower()=="inbound",rule["sourceAddressPrefix"]=='*',
#(rule["protocol"].lower()=="tcp" or rule["protocol"]=='*'))
return (rule["access"].lower()=="allow" and
rule["direction"].lower()=="inbound" and
rule["sourceAddressPrefix"]=='*' and
(rule["protocol"].lower()=="tcp" or rule["protocol"]=='*'))
def acs_add_nsg_rules(ports_to_add):
nsgs = az_cmd("network nsg list --resource-group={0}".format(config["resource_group"]))
nsg_name = acs_get_id(nsgs[0])
cmd = "network nsg show --resource-group="+config["resource_group"]+" --name="+nsg_name
rulesInfo = az_cmd(cmd)
rules = rulesInfo["defaultSecurityRules"] + rulesInfo["securityRules"]
maxThreeDigitRule = 100
for rule in rules:
if acs_is_valid_nsg_rule(rule):
if (rule["priority"] < 1000):
#print "Priority: %d" % rule["priority"]
maxThreeDigitRule = max(maxThreeDigitRule, rule["priority"])
if verbose:
print "Existing max three digit rule for NSG: %s is %d" % (nsg_name, maxThreeDigitRule)
for port_rule in ports_to_add:
port_num = ports_to_add[port_rule]
createRule = True
isNum = isinstance(port_num, numbers.Number)
if (not isNum) and port_num.isdigit():
port_num = int(port_num)
isNum = True
if isNum:
# check for existing rules
found_port = None
for rule in rules:
if acs_is_valid_nsg_rule(rule):
match = re.match('(.*)-(.*)', rule["destinationPortRange"])
if (match is None):
minPort = int(rule["destinationPortRange"])
maxPort = minPort
elif (rule["destinationPortRange"] != "*"):
minPort = int(match.group(1))
maxPort = int(match.group(2))
else:
minPort = -1
maxPort = -1
if (minPort <= port_num) and (port_num <= maxPort):
found_port = rule["name"]
break
if not (found_port is None):
print "Rule for %s : %d -- already satisfied by %s" % (port_rule, port_num, found_port)
createRule = False
if createRule:
maxThreeDigitRule = maxThreeDigitRule + 10
cmd = "network nsg rule create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --nsg-name=%s" % nsg_name
cmd += " --name=%s" % port_rule
cmd += " --access=Allow"
if isNum:
cmd += " --destination-port-range=%d" % port_num
else:
cmd += " --destination-port-range=%s" % port_num
cmd += " --direction=Inbound"
cmd += " --priority=%d" % maxThreeDigitRule
az_cmd(cmd)
def acs_get_config():
# Install kubectl / get credentials
if not (os.path.exists('./deploy/bin/kubectl')):
os.system("mkdir -p ./deploy/bin")
az_tryuntil("acs kubernetes install-cli --install-location ./deploy/bin/kubectl", lambda : os.path.exists('./deploy/bin/kubectl'))
if not (os.path.exists('./deploy/'+config["acskubeconfig"])):
cmd = "acs kubernetes get-credentials"
cmd += " --resource-group=%s" % config["acs_resource_group"]
cmd += " --name=%s" % config["cluster_name"]
cmd += " --file=./deploy/%s" % config["acskubeconfig"]
cmd += " --ssh-key-file=%s" % "./deploy/sshkey/id_rsa"
az_tryuntil(cmd, lambda : os.path.exists("./deploy/%s" % config["acskubeconfig"]))
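# Return the primary key of the cluster's storage account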
def acs_get_storage_key():
cmd = "storage account keys list"
cmd += " --account-name=%s" % config["mountpoints"]["rootshare"]["accountname"]
cmd += " --resource-group=%s" % config["resource_group"]
keys = az_cmd(cmd)
return keys[0]["value"]
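# Create the storage account and the Azure file share used as the cluster's root share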
def acs_create_storage():
# Create storage account
cmd = "storage account create"
cmd += " --name=%s" % config["mountpoints"]["rootshare"]["accountname"]
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --sku=%s" % config["mountpoints"]["rootshare"]["azstoragesku"]
az_sys(cmd)
# Create file share
azureKey = acs_get_storage_key()
config["mountpoints"]["rootshare"]["accesskey"] = azureKey
cmd = "storage share create"
cmd += " --name=%s" % config["mountpoints"]["rootshare"]["filesharename"]
cmd += " --quota=%s" % config["mountpoints"]["rootshare"]["azfilesharequota"]
cmd += " --account-name=%s" % config["mountpoints"]["rootshare"]["accountname"]
cmd += " --account-key=%s" % azureKey
az_sys(cmd)
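# Load the saved Azure cluster config (azure_cluster_config.yaml) if present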
def acs_load_azconfig():
if (os.path.exists(azConfigFile)):
with open(azConfigFile, "r") as f:
return yaml.load(f)
else:
return None
def acs_write_azconfig(configToWrite):
with open(azConfigFile, "w") as f:
yaml.dump(configToWrite, f, default_flow_style=False)
def acs_generate_azconfig():
az_tools.config = az_tools.init_config()
az_tools.config["azure_cluster"]["cluster_name"] = config["cluster_name"]
az_tools.config["azure_cluster"]["azure_location"] = config["cluster_location"]
az_tools.config = az_tools.update_config(az_tools.config, False)
return az_tools.gen_cluster_config("", False)
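# Refresh azure_cluster_config.yaml: with gen_cluster_config, merge newly generated defaults into the saved
# config (without overwriting existing keys); otherwise only create the file when it does not yet exist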
def acs_update_azconfig(gen_cluster_config):
config = acs_load_azconfig()
configNew = acs_generate_azconfig()
if not gen_cluster_config:
if config is None:
config = configNew
acs_write_azconfig(config)
else:
if config is None:
config = {}
utils.mergeDict(config, configNew, False)
acs_write_azconfig(config)
return config
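# End-to-end ACS deployment: create the resource group and ACS Kubernetes cluster, then set up storage, SQL,
# the kubeconfig, public IPs, and NSG rules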
def acs_deploy():
generate_key = not os.path.exists("./deploy/sshkey")
cmd = "group create"
cmd += " --location=%s" % config["cluster_location"]
cmd += " --name=%s" % config["resource_group"]
az_sys(cmd)
cmd = "acs create --orchestrator-type=kubernetes"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --name=%s" % config["cluster_name"]
cmd += " --agent-count=%d" % config["worker_node_num"]
cmd += " --master-count=%d" % config["master_node_num"]
cmd += " --location=%s" % config["cluster_location"]
cmd += " --agent-vm-size=%s" % config["acsagentsize"]
cmd += " --admin-username=%s" % config["admin_username"]
cmd += " --ssh-key-value=%s" % "./deploy/sshkey/id_rsa.pub"
if (generate_key):
os.system("rm -r ./deploy/sshkey || true")
cmd += " --generate-ssh-keys"
az_sys(cmd)
acs_set_resource_grp(True) # overwrite resource group if machines are elsewhere
acs_create_storage()
az_create_sql()
acs_update_azconfig(True)
acs_get_config()
# Get/create public IP addresses for all machines
Nodes = acs_get_machinesAndIPs(True)
# Add rules for NSG
acs_add_nsg_rules({"HTTPAllow" : 80, "RestfulAPIAllow" : 5000, "AllowKubernetesServicePorts" : "30000-32767"})
return Nodes
# Main / Globals
azConfigFile = "azure_cluster_config.yaml"
if __name__ == '__main__':
# nothing for now
verbose = False
config = {}

View File

@@ -64,7 +64,7 @@ def merge_config( config1, config2 ):
else:
config1[entry] = config2[entry]
def update_config(config):
def update_config(config, genSSH=True):
config["azure_cluster"]["resource_group_name"] = config["azure_cluster"]["cluster_name"]+"ResGrp"
config["azure_cluster"]["vnet_name"] = config["azure_cluster"]["cluster_name"]+"-VNet"
config["azure_cluster"]["storage_account_name"] = config["azure_cluster"]["cluster_name"]+"storage"
@@ -78,17 +78,19 @@ def update_config(config):
if "sql_admin_password" not in config["azure_cluster"]:
config["azure_cluster"]["sql_admin_password"] = uuid.uuid4().hex+"12!AB"
if (os.path.exists('./deploy/sshkey/id_rsa.pub')):
f = open('./deploy/sshkey/id_rsa.pub')
config["azure_cluster"]["sshkey"] = f.read()
f.close()
else:
os.system("mkdir -p ./deploy/sshkey")
if not os.path.exists("./deploy/sshkey/azure_id_rsa"):
os.system("ssh-keygen -t rsa -b 4096 -f ./deploy/sshkey/azure_id_rsa -P ''")
f = open('./deploy/sshkey/azure_id_rsa.pub')
config["azure_cluster"]["sshkey"] = f.read()
f.close()
if (genSSH):
if (os.path.exists('./deploy/sshkey/id_rsa.pub')):
f = open('./deploy/sshkey/id_rsa.pub')
config["azure_cluster"]["sshkey"] = f.read()
f.close()
else:
os.system("mkdir -p ./deploy/sshkey")
if not os.path.exists("./deploy/sshkey/azure_id_rsa"):
os.system("ssh-keygen -t rsa -b 4096 -f ./deploy/sshkey/azure_id_rsa -P ''")
f = open('./deploy/sshkey/azure_id_rsa.pub')
config["azure_cluster"]["sshkey"] = f.read()
f.close()
return config
@@ -282,7 +284,7 @@ def create_cluster():
print "creating VM %s..." % vmname
create_vm(vmname, True)
def gen_cluster_config(output_file_name):
def gen_cluster_config(output_file_name, output_file=True):
cmd = """
az storage account show-connection-string \
@@ -337,9 +339,12 @@ def gen_cluster_config(output_file_name):
if file_share_key is not None:
cc["mountpoints"]["rootshare"]["accesskey"] = file_share_key
print yaml.dump(cc, default_flow_style=False)
with open(output_file_name, 'w') as outfile:
yaml.dump(cc, outfile, default_flow_style=False)
if output_file:
print yaml.dump(cc, default_flow_style=False)
with open(output_file_name, 'w') as outfile:
yaml.dump(cc, outfile, default_flow_style=False)
return cc
def run_command( args, command, nargs, parser ):
if command =="create":

View File

@@ -39,6 +39,8 @@ from config import config as k8sconfig
sys.path.append("../docker-images/glusterfs")
import launch_glusterfs
import az_tools
import acs_tools
capacityMatch = re.compile("\d+[M|G]B")
digitsMatch = re.compile("\d+")
@@ -810,15 +812,35 @@ def create_cluster_id():
config["clusterId"] = utils.get_cluster_ID_from_file()
print "Cluster ID is " + config["clusterId"]
def add_acs_config():
if (os.path.exists("./deploy/"+config["acskubeconfig"])):
def add_acs_config(command):
if (command=="acs" or os.path.exists("./deploy/"+config["acskubeconfig"])):
config["isacs"] = True
create_cluster_id()
#print "Config:{0}".format(config)
#print "Dockerprefix:{0}".format(config["dockerprefix"])
# Set ACS params to match
acs_tools.config = config
acs_tools.verbose = verbose
# Use az tools to generate default config params and overwrite if they don't exist
configAzure = acs_tools.acs_update_azconfig(False)
if verbose:
print "AzureConfig:\n{0}".format(configAzure)
utils.mergeDict(config, configAzure, True) # overwrites defaults with Azure defaults
if verbose:
print "Config:\n{0}".format(config)
config["master_dns_name"] = config["cluster_name"]
config["resource_group"] = az_tools.config["azure_cluster"]["resource_group_name"]
config["platform-scripts"] = "acs"
config["WinbindServers"] = []
config["etcd_node_num"] = config["master_node_num"]
config["kube_addons"] = [] # no addons
config["mountpoints"]["rootshare"]["azstoragesku"] = config["azstoragesku"]
config["mountpoints"]["rootshare"]["azfilesharequota"] = config["azfilesharequota"]
config["freeflow"] = True
if ("azure-sqlservername" in config) and (not "sqlserver-hostname" in config):
config["sqlserver-hostname"] = ("tcp:%s.database.windows.net" % config["azure-sqlservername"])
@@ -828,7 +850,7 @@ def add_acs_config():
config["azure-sqlservername"] = match.group(1)
# Some locations put VMs in child resource groups
acs_set_resource_grp()
acs_tools.acs_set_resource_grp(False)
# check for GPU sku
match = re.match('.*\_N.*', config["acsagentsize"])
@@ -854,7 +876,10 @@ def add_acs_config():
config["mountpoints"]["rootshare"]["accesskey"] = azureKey
except:
()
if verbose:
print "Config:{0}".format(config)
# Render scripts for kubenete nodes
def add_kubelet_config():
renderfiles = []
@@ -1065,7 +1090,7 @@ def get_ETCD_master_nodes_from_config(clusterId):
def get_nodes_from_acs(tomatch=""):
bFindNodes = True
if not ("acsnodes" in config):
machines = acs_get_machinesAndIPsFast()
machines = acs_tools.acs_get_machinesAndIPsFast()
config["acsnodes"] = machines
else:
bFindNodes = not (tomatch == "" or tomatch == "master" or tomatch == "agent")
@@ -1838,63 +1863,12 @@ def pick_server( nodelists, curNode ):
return curNode
# simple utils
class ValClass:
def __init__(self, initVal):
self.val = initVal
def set(self, newVal):
self.val = newVal
def shellquote(s):
return "'" + s.replace("'", "'\\''") + "'"
def exec_rmt_cmd(node, cmd):
utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], node, cmd)
def rmt_cp(node, source, target):
utils.sudo_scp(config["ssh_cert"], source, target, config["admin_username"], node)
def tryuntil(cmdLambda, stopFn, updateFn, waitPeriod=5):
while not stopFn():
try:
output = cmdLambda() # if exception occurs here, update does not occur
#print "Output: {0}".format(output)
updateFn()
toStop = False
try:
toStop = stopFn()
except Exception as e:
print "Exception {0} -- stopping anyways".format(e)
toStop = True
if toStop:
#print "Returning {0}".format(output)
return output
except Exception as e:
print "Exception in command {0}".format(e)
if not stopFn():
print "Not done yet - Sleep for 5 seconds and continue"
time.sleep(waitPeriod)
# Run until stop condition and success
def subproc_tryuntil(cmd, stopFn, shell=True, waitPeriod=5):
bFirst = ValClass(True)
return tryuntil(lambda : subprocess.check_output(cmd, shell), lambda : not bFirst.val and stopFn(), lambda : bFirst.set(False), waitPeriod)
def subprocrun(cmd, shellArg):
#print "Running Cmd: {0} Shell: {1}".format(cmd, shellArg)
#embed()
return subprocess.check_output(cmd, shell=shellArg)
# Run once until success (no exception)
def subproc_runonce(cmd, shell=True, waitPeriod=5):
bFirst = ValClass(True)
#print "Running cmd:{0} Shell:{1}".format(cmd, shell)
return tryuntil(lambda : subprocrun(cmd, shell), lambda : not bFirst.val, lambda : bFirst.set(False), waitPeriod)
# Run for N success
def subproc_runN(cmd, n, shell=True, waitPeriod=5):
bCnt = ValClass(0)
return tryuntil(lambda : subprocess.check_output(cmd, shell), lambda : (bCnt.val < n), lambda : bCnt.set(bCnt.val+1), waitPeriod)
# copy list of files to a node
def copy_list_of_files(listOfFiles, node):
with open(listOfFiles, "r") as f:
@@ -1943,188 +1917,9 @@ def kube_dpeloy_configchanges():
for configChange in config["kube_configchanges"]:
exec_rmt_cmd(node, "sudo kubectl apply -f "+configChange)
# AZ ACS commands
def az_cmd(cmd):
if verbose:
print "az "+cmd
output = subprocess.check_output("az "+cmd, shell=True)
return yaml.load(output)
def az_sys(cmd):
if verbose:
print "az "+cmd
os.system("az "+cmd)
def az_tryuntil(cmd, stopFn, waitPeriod=5):
return tryuntil(lambda : az_sys(cmd), stopFn, lambda : (), waitPeriod)
# Create SQL database
def az_create_sql_server():
# escape the password in case it has characters such as "$"
pwd = shellquote(config["sqlserver-password"])
cmd = "sql server create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --location=%s" % config["cluster_location"]
cmd += " --name=%s" % config["azure-sqlservername"]
cmd += " --admin-user=%s" % config["sqlserver-username"]
cmd += " --admin-password=%s" % pwd
az_sys(cmd)
# now open firewall
cmd = "sql server firewall-rule create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --server=%s" % config["azure-sqlservername"]
# first open all IPs
cmd2 = cmd + " --name=All --start-ip-address=0.0.0.0 --end-ip-address=255.255.255.255"
az_sys(cmd2)
# now open Azure
cmd2 = cmd + " --name=Azure --start-ip-address=0.0.0.0 --end-ip-address=0.0.0.0"
az_sys(cmd2)
def az_create_sql_database(dbname):
cmd = "sql db create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --server=%s" % config["azure-sqlservername"]
cmd += " --name=%s" % dbname
az_sys(cmd)
def az_create_sql():
az_create_sql_server()
az_create_sql_database(config["sqlserver-database"])
def az_grp_exist(grpname):
resgrp = az_cmd("group show --name=%s" % grpname)
return not resgrp is None
# Overwrite resource group with location where machines are located
# If no machines are found, that may be because they are not created, so leave it as it is
def acs_set_resource_grp():
if (az_grp_exist(config["resource_group"])):
machines = az_cmd("vm list --resource-group=%s" % config["resource_group"])
if (len(machines)==0):
# try child resource group
tryGroup = "%s_%s_%s" % (config["resource_group"], config["cluster_name"], config["cluster_location"])
print "Grp %s has no matchines trying %s" % (config["resource_group"], tryGroup)
if (az_grp_exist(tryGroup)):
machines = az_cmd("vm list --resource-group=%s" % tryGroup)
if (len(machines) > 0):
# overwrite with group where machines are located
config["resource_group"] = tryGroup
print "Resource group = %s" % config["resource_group"]
def acs_get_id(elem):
elemFullName = elem["id"]
reMatch = re.match('(.*)/(.*)', elemFullName)
return reMatch.group(2)
def acs_get_ip(ipaddrName):
ipInfo = az_cmd("network public-ip show --resource-group="+config["resource_group"]+" --name="+ipaddrName)
return ipInfo["ipAddress"]
def acs_attach_dns_to_node(node, dnsName=None):
nodeName = config["nodenames_from_ip"][node]
if (dnsName is None):
dnsName = nodeName
ipName = config["acsnodes"][nodeName]["publicipname"]
cmd = "network public-ip update"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --name=%s" % ipName
cmd += " --dns-name=%s" % dnsName
az_sys(cmd)
def acs_attach_dns_name():
get_nodes_from_acs()
firstMasterNode = config["kubernetes_master_node"][0]
acs_attach_dns_to_node(firstMasterNode, config["master_dns_name"])
for i in range(len(config["kubernetes_master_node"])):
if (i != 0):
acs_attach_dns_to_node(config["kubernetes_master_node"][i])
for node in config["worker_node"]:
acs_attach_dns_to_node(node)
def acs_get_machineIP(machineName):
print "Machine: "+machineName
nics = az_cmd("vm show --name="+machineName+" --resource-group="+config["resource_group"])
#print nics
nics = nics["networkProfile"]["networkInterfaces"]
i = 0
for nic in nics:
nicName = acs_get_id(nic)
print "Nic Name: "+nicName
if (i==0):
nicDefault = nicName
ipconfigs = az_cmd("network nic show --resource-group="+config["resource_group"]+" --name="+nicName)
ipConfigs = ipconfigs["ipConfigurations"]
j = 0
for ipConfig in ipConfigs:
ipConfigName = acs_get_id(ipConfig)
print "IP Config Name: "+ipConfigName
if ((i==0) and (j==0)):
ipConfigDefault = ipConfigName
configInfo = az_cmd("network nic ip-config show --resource-group="+config["resource_group"]+
" --nic-name="+nicName+" --name="+ipConfigName)
publicIP = configInfo["publicIpAddress"]
if (not (publicIP is None)):
ipName = acs_get_id(publicIP)
print "IP Name: " + ipName
return {"nic" : nicName, "ipconfig" : ipConfigName, "publicipname" : ipName, "publicip" : acs_get_ip(ipName)}
j+=1
i+=1
return {"nic" : nicDefault, "ipconfig": ipConfigDefault, "publicipname" : None, "publicip" : None}
def acs_get_nodes():
binary = os.path.abspath('./deploy/bin/kubectl')
kubeconfig = os.path.abspath('./deploy/'+config["acskubeconfig"])
cmd = binary + ' -o=json --kubeconfig='+kubeconfig+' get nodes'
nodeInfo = subproc_runonce(cmd)
nodes = yaml.load(nodeInfo)
return nodes["items"]
def acs_get_machinesAndIPs(bCreateIP):
# Public IP on worker nodes
nodes = acs_get_nodes()
ipInfo = {}
#print nodes["items"]
config["nodenames_from_ip"] = {}
for n in nodes:
machineName = n["metadata"]["name"]
ipInfo[machineName] = acs_get_machineIP(machineName)
if bCreateIP and (ipInfo[machineName]["publicip"] is None):
# Create IP
ipName = machineName+"-public-ip-0"
print "Creating public-IP: "+ipName
cmd = "network public-ip create --allocation-method=Dynamic"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --name=%s" % ipName
cmd += " --location=%s" % config["cluster_location"]
az_sys(cmd)
# Add to NIC of machine
cmd = "network nic ip-config update"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --nic-name=%s" % ipInfo[machineName]["nic"]
cmd += " --name=%s" % ipInfo[machineName]["ipconfig"]
cmd += " --public-ip-address=%s" % ipName
az_sys(cmd)
# now update
ipInfo[machineName]["publicipname"] = ipName
ipInfo[machineName]["publicip"] = acs_get_ip(ipName)
config["nodenames_from_ip"][ipInfo[machineName]["publicip"]] = machineName
return ipInfo
def acs_get_machinesAndIPsFast():
nodes = acs_get_nodes()
ipInfo = {}
config["nodenames_from_ip"] = {}
for n in nodes:
machineName = n["metadata"]["name"]
#print "MachineName: "+machineName
ipName = machineName+"-public-ip-0"
if (verbose):
print "PublicIP: "+ipName
ipInfo[machineName] = {}
ipInfo[machineName]["publicipname"] = ipName
ipInfo[machineName]["publicip"] = acs_get_ip(ipName)
config["nodenames_from_ip"][ipInfo[machineName]["publicip"]] = machineName
return ipInfo
def acs_deploy_addons():
kube_dpeloy_configchanges()
kube_deploy_addons()
def acs_label_webui():
for n in config["kubernetes_master_node"]:
@@ -2133,96 +1928,18 @@ def acs_label_webui():
print "Label node: "+nodeName
label_webUI(nodeName)
def acs_is_valid_nsg_rule(rule):
#print "Access: %s D: %s P: %s P: %s" % (rule["access"].lower()=="allow",
#rule["direction"].lower()=="inbound",rule["sourceAddressPrefix"]=='*',
#(rule["protocol"].lower()=="tcp" or rule["protocol"]=='*'))
return (rule["access"].lower()=="allow" and
rule["direction"].lower()=="inbound" and
rule["sourceAddressPrefix"]=='*' and
(rule["protocol"].lower()=="tcp" or rule["protocol"]=='*'))
def acs_add_nsg_rules(ports_to_add):
Nodes = get_nodes_from_acs("")
#print "Nodes: %s" % Nodes
match = re.match('(.*)-0', config["nodenames_from_ip"][config["kubernetes_master_node"][0]])
nsg_name = match.group(1)+"-nsg"
rulesInfo = az_cmd("network nsg show --resource-group="+config["resource_group"]+" --name="+nsg_name)
rules = rulesInfo["defaultSecurityRules"] + rulesInfo["securityRules"]
maxThreeDigitRule = 100
for rule in rules:
if acs_is_valid_nsg_rule(rule):
if (rule["priority"] < 1000):
#print "Priority: %d" % rule["priority"]
maxThreeDigitRule = max(maxThreeDigitRule, rule["priority"])
if verbose:
print "Existing max three digit rule for NSG: %s is %d" % (nsg_name, maxThreeDigitRule)
for port_rule in ports_to_add:
port_num = ports_to_add[port_rule]
createRule = True
isNum = isinstance(port_num, numbers.Number)
if (not isNum) and port_num.isdigit():
port_num = int(port_num)
isNum = True
if isNum:
# check for existing rules
found_port = None
for rule in rules:
if acs_is_valid_nsg_rule(rule):
match = re.match('(.*)-(.*)', rule["destinationPortRange"])
if (match is None):
minPort = int(rule["destinationPortRange"])
maxPort = minPort
elif (rule["destinationPortRange"] != "*"):
minPort = int(match.group(1))
maxPort = int(match.group(2))
else:
minPort = -1
maxPort = -1
if (minPort <= port_num) and (port_num <= maxPort):
found_port = rule["name"]
break
if not (found_port is None):
print "Rule for %s : %d -- already satisfied by %s" % (port_rule, port_num, found_port)
createRule = False
if createRule:
maxThreeDigitRule = maxThreeDigitRule + 10
cmd = "network nsg rule create"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --nsg-name=%s" % nsg_name
cmd += " --name=%s" % port_rule
cmd += " --access=Allow"
if isNum:
cmd += " --destination-port-range=%d" % port_num
else:
cmd += " --destination-port-range=%s" % port_num
cmd += " --direction=Inbound"
cmd += " --priority=%d" % maxThreeDigitRule
az_cmd(cmd)
def acs_get_config():
# Install kubectl / get credentials
if not (os.path.exists('./deploy/bin/kubectl')):
os.system("mkdir -p ./deploy/bin")
az_tryuntil("acs kubernetes install-cli --install-location ./deploy/bin/kubectl", lambda : os.path.exists('./deploy/bin/kubectl'))
if not (os.path.exists('./deploy/'+config["acskubeconfig"])):
cmd = "acs kubernetes get-credentials"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --name=%s" % config["cluster_name"]
cmd += " --file=./deploy/%s" % config["acskubeconfig"]
cmd += " --ssh-key-file=%s" % "./deploy/sshkey/id_rsa"
az_tryuntil(cmd, lambda : os.path.exists("./deploy/%s" % config["acskubeconfig"]))
def acs_deploy_addons():
kube_dpeloy_configchanges()
kube_deploy_addons()
# other config post deploy -- ACS cluster is complete
# Run prescript, copyfiles, postscript
def acs_post_deploy():
# Attach DNS name to nodes
acs_attach_dns_name()
# Label nodes
ip = get_nodes_from_acs("")
acs_label_webui()
kubernetes_label_nodes("active", [], args.yes )
# Copy files, etc.
get_nodes_from_acs()
gen_configs()
utils.render_template_directory("./template/kubelet", "./deploy/kubelet", config)
@@ -2238,77 +1955,15 @@ def acs_post_deploy():
deploy_on_nodes(config["worker_predeploy"], config["worker_filesdeploy"], config["worker_postdeploy"],
config["worker_node"])
def acs_deploy():
config["isacs"] = True
create_cluster_id()
generate_key = not os.path.exists("./deploy/sshkey")
cmd = "group create"
cmd += " --location=%s" % config["cluster_location"]
cmd += " --name=%s" % config["resource_group"]
az_sys(cmd)
acs_create_storage()
az_create_sql()
cmd = "acs create --orchestrator-type=kubernetes"
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --name=%s" % config["cluster_name"]
cmd += " --agent-count=%d" % config["worker_node_num"]
cmd += " --master-count=%d" % config["master_node_num"]
cmd += " --location=%s" % config["cluster_location"]
cmd += " --agent-vm-size=%s" % config["acsagentsize"]
cmd += " --admin-username=%s" % config["admin_username"]
cmd += " --ssh-key-value=%s" % "./deploy/sshkey/id_rsa.pub"
if (generate_key):
os.system("rm -r ./deploy/sshkey || true")
cmd += " --generate-ssh-keys"
az_sys(cmd)
acs_set_resource_grp() # overwrite resource group if machines are elsewhere
acs_get_config()
# Get/create public IP addresses for all machines
Nodes = acs_get_machinesAndIPs(True)
# Label nodes
ip = get_nodes_from_acs("")
acs_label_webui()
kubernetes_label_nodes("active", [], args.yes )
# Add rules for NSG
acs_add_nsg_rules({"HTTPAllow" : 80, "RestfulAPIAllow" : 5000, "AllowKubernetesServicePorts" : "30000-32767"})
# Attach DNS name to master
acs_attach_dns_name()
return Nodes
def acs_get_storage_key():
cmd = "storage account keys list"
cmd += " --account-name=%s" % config["mountpoints"]["rootshare"]["accountname"]
cmd += " --resource-group=%s" % config["resource_group"]
keys = az_cmd(cmd)
return keys[0]["value"]
def acs_create_storage():
# Create storage account
cmd = "storage account create"
cmd += " --name=%s" % config["mountpoints"]["rootshare"]["accountname"]
cmd += " --resource-group=%s" % config["resource_group"]
cmd += " --sku=%s" % config["mountpoints"]["rootshare"]["azstoragesku"]
az_sys(cmd)
# Create file share
azureKey = acs_get_storage_key()
config["mountpoints"]["rootshare"]["accesskey"] = azureKey
cmd = "storage share create"
cmd += " --name=%s" % config["mountpoints"]["rootshare"]["filesharename"]
cmd += " --quota=%s" % config["mountpoints"]["rootshare"]["filesharequota"]
cmd += " --account-name=%s" % config["mountpoints"]["rootshare"]["accountname"]
cmd += " --account-key=%s" % azureKey
az_sys(cmd)
def acs_attach_dns_name():
get_nodes_from_acs()
firstMasterNode = config["kubernetes_master_node"][0]
acs_tools.acs_attach_dns_to_node(firstMasterNode, config["master_dns_name"])
for i in range(len(config["kubernetes_master_node"])):
if (i != 0):
acs_tools.acs_attach_dns_to_node(config["kubernetes_master_node"][i])
for node in config["worker_node"]:
acs_tools.acs_attach_dns_to_node(node)
def acs_install_gpu():
nodes = get_worker_nodes(config["clusterId"])
@@ -3634,7 +3289,7 @@ def run_command( args, command, nargs, parser ):
if "clusterId" in tmp:
config["clusterId"] = tmp["clusterId"]
add_acs_config()
add_acs_config(command)
if verbose and config["isacs"]:
print "Using Azure Container Services"
@@ -3968,28 +3623,28 @@ def run_command( args, command, nargs, parser ):
run_script_blocks(scriptblocks["acs"])
elif (len(nargs) >= 1):
if nargs[0]=="deploy":
acs_deploy()
acs_tools.acs_deploy() # Core K8s cluster deployment
elif nargs[0]=="getconfig":
acs_get_config()
acs_tools.acs_get_config()
elif nargs[0]=="getip":
ip = acs_get_machinesAndIPsFast()
ip = acs_tools.acs_get_machinesAndIPsFast()
print ip
elif nargs[0]=="createip":
ip = acs_get_machinesAndIPs(True)
ip = acs_tools.acs_get_machinesAndIPs(True)
print ip
elif nargs[0]=="label":
ip = get_nodes_from_acs("")
acs_label_webui()
elif nargs[0]=="openports":
acs_add_nsg_rules({"HTTPAllow" : 80, "RestfulAPIAllow" : 5000, "AllowKubernetesServicePorts" : "30000-32767"})
acs_tools.acs_add_nsg_rules({"HTTPAllow" : 80, "RestfulAPIAllow" : 5000, "AllowKubernetesServicePorts" : "30000-32767"})
elif nargs[0]=="restartwebui":
run_script_blocks(scriptblocks["restartwebui"])
elif nargs[0]=="getserviceaddr":
print "Address: =" + json.dumps(k8sUtils.GetServiceAddress(nargs[1]))
elif nargs[0]=="storage":
acs_create_storage()
acs_tools.acs_create_storage()
elif nargs[0]=="storagemount":
acs_create_storage()
acs_tools.acs_create_storage()
fileshare_install()
allmountpoints = mount_fileshares_by_service(True)
del_fileshare_links()

View File

@@ -366,3 +366,68 @@ def addressInNetwork(ip,net):
ret = False
return ret
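# Small mutable holder so the retry lambdas below can update a flag or counter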
class ValClass:
def __init__(self, initVal):
self.val = initVal
def set(self, newVal):
self.val = newVal
def shellquote(s):
return "'" + s.replace("'", "'\\''") + "'"
def tryuntil(cmdLambda, stopFn, updateFn, waitPeriod=5):
while not stopFn():
try:
output = cmdLambda() # if exception occurs here, update does not occur
#print "Output: {0}".format(output)
updateFn()
toStop = False
try:
toStop = stopFn()
except Exception as e:
print "Exception {0} -- stopping anyways".format(e)
toStop = True
if toStop:
#print "Returning {0}".format(output)
return output
except Exception as e:
print "Exception in command {0}".format(e)
if not stopFn():
print "Not done yet - Sleep for 5 seconds and continue"
time.sleep(waitPeriod)
# Run until stop condition and success
def subproc_tryuntil(cmd, stopFn, shell=True, waitPeriod=5):
bFirst = ValClass(True)
return tryuntil(lambda : subprocess.check_output(cmd, shell=shell), lambda : not bFirst.val and stopFn(), lambda : bFirst.set(False), waitPeriod)
def subprocrun(cmd, shellArg):
#print "Running Cmd: {0} Shell: {1}".format(cmd, shellArg)
#embed()
return subprocess.check_output(cmd, shell=shellArg)
# Run once until success (no exception)
def subproc_runonce(cmd, shell=True, waitPeriod=5):
bFirst = ValClass(True)
#print "Running cmd:{0} Shell:{1}".format(cmd, shell)
return tryuntil(lambda : subprocrun(cmd, shell), lambda : not bFirst.val, lambda : bFirst.set(False), waitPeriod)
# Run for N success
def subproc_runN(cmd, n, shell=True, waitPeriod=5):
bCnt = ValClass(0)
return tryuntil(lambda : subprocess.check_output(cmd, shell=shell), lambda : (bCnt.val >= n), lambda : bCnt.set(bCnt.val+1), waitPeriod)
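# Recursively merge configSrc into configDst; with bOverwrite existing keys are replaced,
# otherwise only missing/None/"null" entries are filled in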
def mergeDict(configDst, configSrc, bOverwrite):
for entry in configSrc:
# if not isinstance(configSrc[entry], dict):
# print "key:{0} val:{1}".format(entry, configSrc[entry])
if bOverwrite:
configDst.pop(entry, None)
if (not entry in configDst) or (configDst[entry] is None) or \
(isinstance(configDst[entry], basestring) and configDst[entry].lower() == "null"):
if isinstance(configSrc[entry], dict):
configDst[entry] = {}
mergeDict(configDst[entry], configSrc[entry], bOverwrite)
else:
#print "settingkey:{0} val:{1}".format(entry, configSrc[entry])
configDst[entry] = configSrc[entry]