Network validation checks during provision (#2196)

* Add DNS + HTTPS checks, capture DNS packets

* ARM doesn’t like ‘{‘

* standardizing retrycmd_if_failure usage patterns

* Adding DNS pre-check for aptdocker.azureedge.net

* tracking time for each retried provision event

* standardizing to 3 masters api model for e2e tests

* retain e2e resources for debugging

* getting metrics logs from all cluster hosts

* improved master/agent host retrieval

* lint

* lint

* Adding “agent” substring to e2e api model pools

* invalid agent pool name

* revert agent forwarding ssh config

* restore cleanup

* add agent dns validation

* 5 seconds between etcddisk mount retries
This commit is contained in:
Jack Francis 2018-02-05 12:10:20 -08:00 коммит произвёл GitHub
Родитель 7923b9600b
Коммит fe3f715a4f
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
9 изменённых файлов: 135 добавлений и 38 удалений

Просмотреть файл

@ -8,7 +8,7 @@
}
},
"masterProfile": {
"count": 1,
"count": 3,
"dnsPrefix": "",
"vmSize": "Standard_D2_v2",
"OSDiskSizeGB": 200,
@ -18,7 +18,7 @@
},
"agentPoolProfiles": [
{
"name": "md",
"name": "agentmd",
"count": 3,
"vmSize": "Standard_D2_v2",
"OSDiskSizeGB": 200,
@ -28,7 +28,7 @@
"vnetSubnetId": "/subscriptions/SUB_ID/resourceGroups/RG_NAME/providers/Microsoft.Network/virtualNetworks/VNET_NAME/subnets/SUBNET_NAME"
},
{
"name": "sa",
"name": "agentsa",
"count": 3,
"vmSize": "Standard_D2_v2",
"OSDiskSizeGB": 200,

Просмотреть файл

@ -165,8 +165,10 @@ coreos:
ExecStart=/opt/azure/containers/provision-setup.sh
{{else}}
runcmd:
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
- echo `date`,`hostname`, startruncmd>>/opt/m
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
- retrycmd_if_failure nc -zw1 $(grep nameserver /etc/resolv.conf | cut -d \ -f 2) 53
- retrycmd_if_failure nc -zw1 azure.com 443
- apt-mark hold walinuxagent{{GetKubernetesAgentPreprovisionYaml .}}
- echo `date`,`hostname`, preaptupdate>>/opt/m
- retrycmd_if_failure apt-get update
@ -178,7 +180,7 @@ runcmd:
- systemctl start rpcbind
- systemctl start rpc-statd
- echo `date`,`hostname`, predockerinstall>>/opt/m
- retrycmd_if_failure curl --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 -fsSL https://aptdocker.azureedge.net/gpg | apt-key add -
- curl --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 -fsSL https://aptdocker.azureedge.net/gpg | apt-key add -
- echo "deb {{WrapAsVariable "dockerEngineDownloadRepo"}} ubuntu-xenial main" | sudo tee /etc/apt/sources.list.d/docker.list
- "echo \"Package: docker-engine\nPin: version {{WrapAsVariable "dockerEngineVersion"}}\nPin-Priority: 550\n\" > /etc/apt/preferences.d/docker.pref"
- retrycmd_if_failure apt-get update

Просмотреть файл

@ -292,7 +292,7 @@ MASTER_ARTIFACTS_CONFIG_PLACEHOLDER
content: |
#!/bin/bash
set -x
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
ETCD_VER=v{{WrapAsVariable "etcdVersion"}}
DOWNLOAD_URL={{WrapAsVariable "etcdDownloadURLBase"}}
mkdir -p /tmp/etcd-download
@ -314,7 +314,7 @@ MASTER_ARTIFACTS_CONFIG_PLACEHOLDER
owner: "root"
content: |
#!/bin/bash
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
/bin/echo DAEMON_ARGS=--name "{{WrapAsVerbatim "variables('masterVMNames')[copyIndex(variables('masterOffset'))]"}}" --initial-advertise-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --advertise-client-urls "{{WrapAsVerbatim "variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-client-urls "{{WrapAsVerbatim "concat(variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))], ',http://127.0.0.1:', variables('masterEtcdClientPort'))"}}" --initial-cluster-token "k8s-etcd-cluster" --initial-cluster "{{WrapAsVerbatim "variables('masterEtcdClusterStates')[div(variables('masterCount'), 2)]"}} --data-dir "/var/lib/etcddisk"" --initial-cluster-state "new" | tee -a /etc/default/etcd
sudo /bin/chown -R etcd:etcd /var/lib/etcd/default
/opt/azure/containers/mountetcd.sh
@ -347,7 +347,9 @@ coreos:
ExecStart=/opt/azure/containers/provision-setup.sh
{{else}}
runcmd:
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done ; }
- retrycmd_if_failure() { for i in $(seq 1 36); do $@; [ $? -eq 0 ] && break || sleep 5; done; echo Executed \"$@\" $i times; }
- retrycmd_if_failure nc -zw1 $(grep nameserver /etc/resolv.conf | cut -d \ -f 2) 53
- retrycmd_if_failure nc -zw1 azure.com 443
- /opt/azure/containers/setup-etcd.sh > /opt/azure/containers/setup-etcd.log 2>&1
- apt-mark hold walinuxagent {{GetKubernetesMasterPreprovisionYaml}}
- /bin/echo DAEMON_ARGS=--name "{{WrapAsVerbatim "variables('masterVMNames')[copyIndex(variables('masterOffset'))]"}}" --peer-client-cert-auth --peer-trusted-ca-file={{WrapAsVariable "etcdCaFilepath"}} --peer-cert-file={{WrapAsVerbatim "variables('etcdPeerCertFilepath')[copyIndex(variables('masterOffset'))]"}} --peer-key-file={{WrapAsVerbatim "variables('etcdPeerKeyFilepath')[copyIndex(variables('masterOffset'))]"}} --initial-advertise-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-peer-urls "{{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}}" --client-cert-auth --trusted-ca-file={{WrapAsVariable "etcdCaFilepath"}} --cert-file={{WrapAsVariable "etcdServerCertFilepath"}} --key-file={{WrapAsVariable "etcdServerKeyFilepath"}} --advertise-client-urls "{{WrapAsVerbatim "variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))]"}}" --listen-client-urls "{{WrapAsVerbatim "concat(variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))], ',https://127.0.0.1:', variables('masterEtcdClientPort'))"}}" --initial-cluster-token "k8s-etcd-cluster" --initial-cluster "{{WrapAsVerbatim "variables('masterEtcdClusterStates')[div(variables('masterCount'), 2)]"}} --data-dir "/var/lib/etcddisk"" --initial-cluster-state "new" | tee -a /etc/default/etcd
@ -363,6 +365,7 @@ runcmd:
- retrycmd_if_failure curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 "{{WrapAsVerbatim "variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))]"}}"/v2/machines
- retrycmd_if_failure apt-get update
- retrycmd_if_failure apt-get install -y apt-transport-https ca-certificates
- retrycmd_if_failure nc -zw1 aptdocker.azureedge.net 443
- curl --retry 5 --retry-delay 10 --retry-max-time 30 --max-time 60 -fsSL https://aptdocker.azureedge.net/gpg | apt-key add -
- echo "deb {{WrapAsVariable "dockerEngineDownloadRepo"}} ubuntu-xenial main" | sudo tee /etc/apt/sources.list.d/docker.list
- "echo \"Package: docker-engine\nPin: version {{WrapAsVariable "dockerEngineVersion"}}\nPin-Priority: 550\n\" > /etc/apt/preferences.d/docker.pref"

Просмотреть файл

@ -23,6 +23,13 @@
# KUBECONFIG_KEY ETCD_SERVER_CERTIFICATE ETCD_SERVER_PRIVATE_KEY ETCD_CLIENT_CERTIFICATE ETCD_CLIENT_PRIVATE_KEY
# ETCD_PEER_CERTIFICATES ETCD_PEER_PRIVATE_KEYS ADMINUSER MASTER_INDEX
# Capture DNS traffic seen during provisioning.
# packetCaptureProvision starts tcpdump in the background and records UDP port 53
# packets on eth0 into /var/log/azure/dnsdump.pcap.
packetCaptureProvision() {
# -G 600 -W 1 : rotate the dump file every 600 seconds but allow only one file, so tcpdump exits after ~10 minutes
# -n          : do not resolve addresses to names
# -vv         : more verbose packet decoding
# -Z root     : keep running as root instead of dropping privileges
# Output is discarded and the trailing '&' backgrounds the capture so the script continues immediately.
tcpdump -G 600 -W 1 -n -vv -w /var/log/azure/dnsdump.pcap -Z root -i eth0 udp port 53 > /dev/null 2>&1 &
}
# Kick off the capture right away, before the remaining steps of this script run.
packetCaptureProvision
# Find distro name via ID value in releases files and upcase
OS=$(cat /etc/*-release | grep ^ID= | tr -d 'ID="' | awk '{print toupper($0)}')
UBUNTU_OS_NAME="UBUNTU"
@ -49,7 +56,7 @@ ensureRunCommandCompleted()
echo "waiting for runcmd to finish"
for i in {1..900}; do
if [ -e /opt/azure/containers/runcmd.complete ]; then
echo "runcmd finished"
echo "runcmd finished, took $i seconds"
break
fi
sleep 1
@ -185,6 +192,7 @@ function ensureKubectl() {
if [ -e $KUBECTL ]
then
kubectlfound=0
echo "kubectl installed successfully, took $i seconds"
break
fi
sleep 1
@ -203,6 +211,7 @@ function downloadUrl () {
# Wrapper around curl to download blobs more reliably.
# Workaround the --retry issues with a for loop and set a max timeout.
for i in 1 2 3 4 5; do curl --max-time 60 -fsSL ${1}; [ $? -eq 0 ] && break || sleep 10; done
echo Executed curl for \"${1}\" $i times
}
function setMaxPods () {
@ -466,6 +475,7 @@ function systemctlEnableAndCheck() {
systemctl is-enabled $1
enabled=$?
else
echo "$1 took $i seconds to be enabled by systemctl"
break
fi
sleep 1
@ -489,7 +499,7 @@ function ensureDocker() {
echo "status $?"
/bin/systemctl restart docker
else
echo "docker started"
echo "docker started, took $i seconds"
dockerStarted=0
break
fi
@ -543,7 +553,7 @@ function ensureApiserver() {
$KUBECTL cluster-info
if [ "$?" = "0" ]
then
echo "kubernetes started"
echo "kubernetes started, took $i seconds"
kubernetesStarted=0
break
fi
@ -551,7 +561,7 @@ function ensureApiserver() {
/usr/bin/docker ps | grep apiserver
if [ "$?" = "0" ]
then
echo "kubernetes started"
echo "kubernetes started, took $i seconds"
kubernetesStarted=0
break
fi
@ -570,10 +580,10 @@ function ensureEtcd() {
curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key --max-time 60 https://127.0.0.1:2379/v2/machines;
if [ $? -eq 0 ]
then
echo "Etcd setup successfully"
echo "Etcd setup successfully, took $i seconds"
break
fi
sleep 5
sleep 1
done
}
@ -585,14 +595,16 @@ function ensureEtcdDataDir() {
return
else
echo "/var/lib/etcddisk was not found at /dev/sdc1. Trying to mount all devices."
# Seconds to sleep between mount retries; also used to compute the elapsed time that is reported.
s=5
for i in {1..60}; do
sudo mount -a && mount | grep /dev/sdc1 | grep /var/lib/etcddisk;
if [ "$?" = "0" ]
then
echo "/var/lib/etcddisk mounted at: /dev/sdc1"
(( t = ${i} * ${s} ))
echo "/var/lib/etcddisk mounted at: /dev/sdc1, took $t seconds"
return
fi
sleep 5
sleep $s
done
fi

Просмотреть файл

@ -20,7 +20,7 @@ openssl genrsa -out $PROXY_CLIENT_KEY 2048
openssl req -new -key $PROXY_CLIENT_KEY -out $PROXY_CLIENT_CSR -subj '/CN=aggregator/O=system:masters'
openssl x509 -req -days 730 -in $PROXY_CLIENT_CSR -CA $PROXY_CRT -CAkey $PROXY_CA_KEY -set_serial 02 -out $PROXY_CLIENT_CRT
retrycmd_if_failure() { for i in 1 2 3 4 5 6 7 8 9 10; do $@; [ $? -eq 0 ] && break || sleep 30; done ; }
retrycmd_if_failure() { for i in $(seq 1 10); do $@; [ $? -eq 0 ] && break || sleep 30; done; echo Executed \"$@\" $i times; }
write_certs_to_disk() {
etcdctl get $ETCD_REQUESTHEADER_CLIENT_CA > $K8S_PROXY_CA_CRT_FILEPATH

Просмотреть файл

@ -28,6 +28,11 @@ type ResourceGroup struct {
Location string
}
// VM represents an azure vm.
// Only the "name" property of the JSON it is decoded from is captured;
// any other properties in the input are ignored.
type VM struct {
Name string `json:"name"`
}
// Deployment represents a deployment of an acs cluster
type Deployment struct {
Name string // Name of the deployment
@ -217,3 +222,27 @@ func (a *Account) UpdateRouteTables(subnet, vnet string) error {
}
return nil
}
// GetHosts returns the VMs in a resource group, as listed by `az vm list`.
//
// name selects the resource group to query; when it is empty the account's
// own ResourceGroup.Name is used. An error is returned when the az command
// fails or when its output cannot be decoded as JSON; in both cases the
// returned slice is nil.
func (a *Account) GetHosts(name string) ([]VM, error) {
	resourceGroup := name
	if resourceGroup == "" {
		resourceGroup = a.ResourceGroup.Name
	}

	cmd := exec.Command("az", "vm", "list", "-g", resourceGroup)
	util.PrintCommand(cmd)
	out, err := cmd.CombinedOutput()
	if err != nil {
		log.Printf("Error while trying to get vm list:%s\n", out)
		return nil, err
	}

	var vms []VM
	if err := json.Unmarshal(out, &vms); err != nil {
		// Report the decode failure to the caller rather than handing back a
		// host list that does not reflect the command output.
		log.Printf("Error unmarshalling vm list json:%s\n", err)
		log.Printf("JSON:%s\n", out)
		return nil, err
	}
	return vms, nil
}

Просмотреть файл

@ -8,6 +8,7 @@ import (
"net"
"os"
"os/exec"
"path/filepath"
"time"
"github.com/Azure/acs-engine/test/e2e/kubernetes/util"
@ -117,6 +118,26 @@ func (c *Connection) Read(path string) ([]byte, error) {
return out, nil
}
// CopyRemote stages a file from another host onto the host this connection targets.
//
// It first runs ssh-add on c.PrivateKeyPath, then opens an ssh session with agent
// forwarding (-A) to c.Host and runs scp there, pulling hostname:path into
// /tmp/<hostname>-<base name of path> on c.Host. Nothing is copied to the local
// machine. When either command fails, its combined output is logged and its error
// is returned.
func (c *Connection) CopyRemote(hostname, path string) error {
	if addOut, addErr := exec.Command("ssh-add", c.PrivateKeyPath).CombinedOutput(); addErr != nil {
		log.Printf("Error output:%s\n", addOut)
		return addErr
	}

	// Command executed on c.Host; the destination name is prefixed with the
	// source hostname so files from different hosts do not collide in /tmp.
	destination := fmt.Sprintf("/tmp/%s-%s", hostname, filepath.Base(path))
	scpCommand := fmt.Sprintf("scp -o StrictHostKeyChecking=no %s:%s %s", hostname, path, destination)

	sshArgs := []string{
		"-A",
		"-i", c.PrivateKeyPath,
		"-o", "ConnectTimeout=30",
		"-o", "StrictHostKeyChecking=no",
		fmt.Sprintf("%s@%s", c.User, c.Host),
		"-p", c.Port,
		scpCommand,
	}
	sshCmd := exec.Command("ssh", sshArgs...)
	util.PrintCommand(sshCmd)
	if sshOut, sshErr := sshCmd.CombinedOutput(); sshErr != nil {
		log.Printf("Error output:%s\n", sshOut)
		return sshErr
	}
	return nil
}
// ExecuteWithRetries will keep retrying a command until it does not return an error or the duration is exceeded
func (c *Connection) ExecuteWithRetries(cmd string, sleep, duration time.Duration) ([]byte, error) {
outCh := make(chan []byte, 1)

Просмотреть файл

@ -2,7 +2,6 @@ package main
import (
"fmt"
"io/ioutil"
"log"
"os"
"os/signal"
@ -132,20 +131,9 @@ func teardown() {
if err != nil {
log.Printf("cliProvisioner.FetchProvisioningMetrics error: %s\n", err)
}
for _, fp := range []string{"/var/log/azure/cluster-provision.log", "/var/log/cloud-init.log",
"/var/log/cloud-init-output.log", "/var/log/syslog", "/var/log/azure/custom-script/handler.log",
"/opt/m", "/opt/azure/containers/kubelet.sh", "/opt/azure/containers/mountetcd.sh",
"/opt/azure/containers/provision.sh", "/opt/azure/containers/setup-etcd.sh",
"/opt/azure/provision-ps.log"} {
data, err := cliProvisioner.FetchProvisioningMetrics(fp)
if err != nil {
log.Printf("cliProvisioner.FetchProvisioningMetrics error: %s\n", err)
}
target := filepath.Join(logsPath, filepath.Base(fp))
err = ioutil.WriteFile(target, data, 0777)
if err != nil {
log.Printf("ioutil.WriteFile error: %s\n", err)
}
err = cliProvisioner.FetchProvisioningMetrics(logsPath, cfg, acct)
if err != nil {
log.Printf("cliProvisioner.FetchProvisioningMetrics error: %s\n", err)
}
}
if cfg.CleanUpOnExit {

Просмотреть файл

@ -8,6 +8,7 @@ import (
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/Azure/acs-engine/test/e2e/azure"
@ -199,16 +200,57 @@ func (cli *CLIProvisioner) waitForNodes() error {
return nil
}
// FetchProvisioningMetrics gets a file from the master
func (cli *CLIProvisioner) FetchProvisioningMetrics(path string) ([]byte, error) {
// FetchProvisioningMetrics gets provisioning files from all hosts in a cluster
func (cli *CLIProvisioner) FetchProvisioningMetrics(path string, cfg *config.Config, acct *azure.Account) error {
var masters, agents []string
hosts, err := acct.GetHosts("")
if err != nil {
return err
}
for _, host := range hosts {
if strings.Contains(host.Name, "master") {
masters = append(masters, host.Name)
} else if strings.Contains(host.Name, "agent") {
agents = append(agents, host.Name)
}
}
agentFiles := []string{"/var/log/azure/cluster-provision.log", "/var/log/cloud-init.log",
"/var/log/cloud-init-output.log", "/var/log/syslog", "/var/log/azure/custom-script/handler.log",
"/opt/m", "/opt/azure/containers/kubelet.sh", "/opt/azure/containers/provision.sh",
"/opt/azure/provision-ps.log", "/var/log/azure/dnsdump.pcap"}
masterFiles := agentFiles
masterFiles = append(masterFiles, "/opt/azure/containers/mountetcd.sh", "/opt/azure/containers/setup-etcd.sh")
hostname := fmt.Sprintf("%s.%s.cloudapp.azure.com", cli.Config.Name, cli.Config.Location)
conn, err := remote.NewConnection(hostname, "22", cli.Engine.ClusterDefinition.Properties.LinuxProfile.AdminUsername, cli.Config.GetSSHKeyPath())
if err != nil {
return nil, err
return err
}
data, err := conn.Read(path)
for _, master := range masters {
for _, fp := range masterFiles {
err := conn.CopyRemote(master, fp)
if err != nil {
return fmt.Errorf("Error reading file from path (%s):%s", path, err)
}
}
}
for _, agent := range agents {
for _, fp := range agentFiles {
err := conn.CopyRemote(agent, fp)
if err != nil {
return fmt.Errorf("Error reading file from path (%s):%s", path, err)
}
}
}
connectString := fmt.Sprintf("%s@%s:/tmp/k8s-*", conn.User, hostname)
logsPath := filepath.Join(cfg.CurrentWorkingDir, "_logs", hostname)
cmd := exec.Command("scp", "-i", conn.PrivateKeyPath, "-o", "ConnectTimeout=30", "-o", "StrictHostKeyChecking=no", connectString, logsPath)
util.PrintCommand(cmd)
out, err := cmd.CombinedOutput()
if err != nil {
return nil, fmt.Errorf("Error reading file from path (%s):%s", path, err)
log.Printf("Error output:%s\n", out)
return err
}
return data, nil
return nil
}