diff --git a/test/e2e/azure/cli.go b/test/e2e/azure/cli.go
index bec875b87..44e844d66 100644
--- a/test/e2e/azure/cli.go
+++ b/test/e2e/azure/cli.go
@@ -6,7 +6,6 @@ package azure
 import (
 	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"log"
 	"os"
@@ -19,6 +18,7 @@ import (
 
 	"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-08-01/network"
 	"github.com/kelseyhightower/envconfig"
+	"github.com/pkg/errors"
 )
 
 // Storage provides access to StorageAccount objects
@@ -122,6 +122,55 @@ func (a *Account) Login() error {
 	return nil
 }
 
+// retryUntilTimeout invokes op repeatedly, pausing for sleep after each failed attempt, until
+// op returns nil or timeout elapses. On timeout it returns the most recent error from op wrapped
+// with name, or a plain timeout error if no attempt completed before the deadline.
+func retryUntilTimeout(name string, op func() error, sleep, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	ch := make(chan error)
+	go func() {
+		for {
+			// Run the attempt outside of the select: a send expression used directly as a select
+			// case is evaluated on every entry, which would invoke op again after cancellation.
+			err := op()
+			select {
+			case ch <- err:
+			case <-ctx.Done():
+				return
+			}
+			if err == nil {
+				return
+			}
+			select {
+			case <-time.After(sleep):
+			case <-ctx.Done():
+				return
+			}
+		}
+	}()
+	var mostRecentError error
+	for {
+		select {
+		case mostRecentError = <-ch:
+			if mostRecentError == nil {
+				return nil
+			}
+		case <-ctx.Done():
+			if mostRecentError == nil {
+				return errors.Errorf("%s timed out before any attempt completed", name)
+			}
+			return errors.Wrapf(mostRecentError, "%s timed out", name)
+		}
+	}
+}
+
+// LoginWithRetry invokes Login, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent Login error is returned wrapped.
+func (a *Account) LoginWithRetry(sleep, timeout time.Duration) error {
+	return retryUntilTimeout("LoginWithRetry", a.Login, sleep, timeout)
+}
+
 // SetSubscription will call az account set --subscription for the given Account
 func (a *Account) SetSubscription() error {
 	var cmd *exec.Cmd
@@ -140,6 +189,12 @@ func (a *Account) SetSubscription() error {
 	return nil
 }
 
+// SetSubscriptionWithRetry invokes SetSubscription, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent SetSubscription error is returned wrapped.
+func (a *Account) SetSubscriptionWithRetry(sleep, timeout time.Duration) error {
+	return retryUntilTimeout("SetSubscriptionWithRetry", a.SetSubscription, sleep, timeout)
+}
+
 // CreateGroup will create a resource group in a given location
 //--tags "type=${RESOURCE_GROUP_TAG_TYPE:-}" "now=$(date +%s)" "job=${JOB_BASE_NAME:-}" "buildno=${BUILD_NUM:-}"
 func (a *Account) CreateGroup(name, location string) error {
@@ -168,6 +223,12 @@ func (a *Account) CreateGroup(name, location string) error {
 	return nil
 }
 
+// CreateGroupWithRetry invokes CreateGroup, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent CreateGroup error is returned wrapped.
+func (a *Account) CreateGroupWithRetry(name, location string, sleep, timeout time.Duration) error {
+	return retryUntilTimeout("CreateGroupWithRetry", func() error { return a.CreateGroup(name, location) }, sleep, timeout)
+}
+
 // DeleteGroup deletes a given resource group by name
 func (a *Account) DeleteGroup(name string, wait bool) error {
 	var cmd *exec.Cmd
@@ -189,6 +250,12 @@ func (a *Account) DeleteGroup(name string, wait bool) error {
 	return nil
 }
 
+// DeleteGroupWithRetry invokes DeleteGroup, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent DeleteGroup error is returned wrapped.
+func (a *Account) DeleteGroupWithRetry(name string, wait bool, sleep, timeout time.Duration) error {
+	return retryUntilTimeout("DeleteGroupWithRetry", func() error { return a.DeleteGroup(name, wait) }, sleep, timeout)
+}
+
 // CreateDeployment will deploy a cluster to a given resource group using the template and parameters on disk
 func (a *Account) CreateDeployment(name string, e *engine.Engine) error {
 	d := Deployment{
@@ -228,6 +295,12 @@ func (a *Account) CreateDeployment(name string, e *engine.Engine) error {
 	return nil
 }
 
+// CreateDeploymentWithRetry invokes CreateDeployment, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent CreateDeployment error is returned wrapped.
+func (a *Account) CreateDeploymentWithRetry(name string, e *engine.Engine, sleep, timeout time.Duration) error {
+	return retryUntilTimeout("CreateDeploymentWithRetry", func() error { return a.CreateDeployment(name, e) }, sleep, timeout)
+}
+
 // CreateVnet will create a vnet in a resource group
 func (a *Account) CreateVnet(vnet, addressPrefixes string) error {
 	var cmd *exec.Cmd
@@ -247,6 +320,12 @@ func (a *Account) CreateVnet(vnet, addressPrefixes string) error {
 	return nil
 }
 
+// CreateVnetWithRetry invokes CreateVnet, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent CreateVnet error is returned wrapped.
+func (a *Account) CreateVnetWithRetry(vnet, addressPrefixes string, sleep, timeout time.Duration) error {
+	return retryUntilTimeout("CreateVnetWithRetry", func() error { return a.CreateVnet(vnet, addressPrefixes) }, sleep, timeout)
+}
+
 // ListRGRouteTableResult defines a struct for making a multi-value channel result type
 type ListRGRouteTableResult struct {
 	routeTables []network.RouteTable
@@ -356,6 +435,12 @@ func (a *Account) CreateSubnet(vnet, subnetName, subnetPrefix string) error {
 	return nil
 }
 
+// CreateSubnetWithRetry invokes CreateSubnet, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent CreateSubnet error is returned wrapped.
+func (a *Account) CreateSubnetWithRetry(vnet, subnetName, subnetPrefix string, sleep, timeout time.Duration) error {
+	return retryUntilTimeout("CreateSubnetWithRetry", func() error { return a.CreateSubnet(vnet, subnetName, subnetPrefix) }, sleep, timeout)
+}
+
 // RouteTable holds information from running az network route-table list
 type RouteTable struct {
 	ID                string `json:"id"`
@@ -365,36 +450,19 @@ type RouteTable struct {
 	ResourceGroup     string `json:"resourceGroup"`
 }
 
-// UpdateRouteTables is used to updated a vnet with the appropriate route tables
-func (a *Account) UpdateRouteTables(subnet, vnet string) error {
-	var cmd *exec.Cmd
-	if a.TimeoutCommands {
-		cmd = exec.Command("timeout", "60", "az", "network", "route-table", "list", "-g", a.ResourceGroup.Name)
-	} else {
-		cmd = exec.Command("az", "network", "route-table", "list", "-g", a.ResourceGroup.Name)
-	}
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		log.Printf("Error while trying to get route table list!\n Output:%s\n", out)
-		return err
-	}
-	rts := []RouteTable{}
-	json.Unmarshal(out, &rts)
+// GetHostsResult is a return struct for GetHostsAsync
+type GetHostsResult struct {
+	Vms []VM
+	Err error
+}
 
-	if a.TimeoutCommands {
-		cmd = exec.Command("timeout", "60", "az", "network", "vnet", "subnet", "update",
-			"-n", subnet, "-g", a.ResourceGroup.Name, "--vnet-name", vnet, "--route-table", rts[0].Name)
-	} else {
-		cmd = exec.Command("az", "network", "vnet", "subnet", "update",
-			"-n", subnet, "-g", a.ResourceGroup.Name, "--vnet-name", vnet, "--route-table", rts[0].Name)
+// GetHostsAsync wraps GetHosts with a struct response for goroutine + channel usage
+func (a *Account) GetHostsAsync(name string) GetHostsResult {
+	vms, err := a.GetHosts(name)
+	return GetHostsResult{
+		Vms: vms,
+		Err: err,
 	}
-	util.PrintCommand(cmd)
-	out, err = cmd.CombinedOutput()
-	if err != nil {
-		log.Printf("Error while trying to update vnet route tables:%s\n", out)
-		return err
-	}
-	return nil
 }
 
 // GetHosts will get a list of vms in the resource group
@@ -427,6 +495,24 @@ func (a *Account) GetHosts(name string) ([]VM, error) {
 	return v, nil
 }
 
+// GetHostsWithRetry invokes GetHostsAsync, retrying every sleep interval until it succeeds
+// or timeout elapses; it returns the VMs from the first successful attempt.
+func (a *Account) GetHostsWithRetry(name string, sleep, timeout time.Duration) ([]VM, error) {
+	var vms []VM
+	err := retryUntilTimeout("GetHostsWithRetry", func() error {
+		result := a.GetHostsAsync(name)
+		if result.Err == nil {
+			// Assigned before the nil error is handed back, so the read below is ordered after it
+			vms = result.Vms
+		}
+		return result.Err
+	}, sleep, timeout)
+	if err != nil {
+		return nil, err
+	}
+	return vms, nil
+}
+
 // SetResourceGroup will set the account resource group
 func (a *Account) SetResourceGroup(name string) error {
 	if a.ResourceGroup.Name != "" {
@@ -458,6 +544,12 @@ func (a *Account) SetResourceGroup(name string) error {
 	return nil
 }
 
+// SetResourceGroupWithRetry invokes SetResourceGroup, retrying every sleep interval until it succeeds
+// or timeout elapses, in which case the most recent SetResourceGroup error is returned wrapped.
+func (a *Account) SetResourceGroupWithRetry(name string, sleep, timeout time.Duration) error {
+	return retryUntilTimeout("SetResourceGroupWithRetry", func() error { return a.SetResourceGroup(name) }, sleep, timeout)
+}
+
 // IsClusterExpired will return true if a deployment was created more than t nanoseconds ago, or if timestamp is not found
 func (a *Account) IsClusterExpired(d time.Duration) bool {
 	tag, err := strconv.ParseInt(a.ResourceGroup.Tags["now"], 10, 64)
diff --git a/test/e2e/runner.go b/test/e2e/runner.go
index 5cd6525f1..7e9d2f5e1 100644
--- a/test/e2e/runner.go
+++ b/test/e2e/runner.go
@@ -59,12 +59,12 @@ func main() {
 		log.Fatalf("Error while trying to setup azure account: %s\n", err)
 	}
 
-	err := acct.Login()
+	err := acct.LoginWithRetry(3*time.Second, cfg.Timeout)
 	if err != nil {
 		log.Fatalf("Error while trying to login to azure account! %s\n", err)
 	}
 
-	err = acct.SetSubscription()
+	err = acct.SetSubscriptionWithRetry(3*time.Second, cfg.Timeout)
 	if err != nil {
-		log.Fatal("Error while trying to set azure subscription!")
+		log.Fatalf("Error while trying to set azure subscription! %s\n", err)
 	}
@@ -95,7 +95,7 @@ func main() {
 		}
 		provision := true
 		rg := cfg.SoakClusterName
-		err = acct.SetResourceGroup(rg)
+		err = acct.SetResourceGroupWithRetry(rg, 3*time.Second, cfg.Timeout)
 		if err != nil {
 			log.Printf("Error while trying to set RG:%s\n", err)
 		} else {
@@ -110,7 +110,9 @@ func main() {
 		if provision || cfg.ForceDeploy {
 			log.Printf("Soak cluster %s does not exist or has expired\n", rg)
 			log.Printf("Deleting Resource Group:%s\n", rg)
-			acct.DeleteGroup(rg, true)
+			if err := acct.DeleteGroupWithRetry(rg, true, 3*time.Second, cfg.Timeout); err != nil {
+				log.Printf("Error while trying to delete RG %s: %s\n", rg, err)
+			}
 			log.Printf("Deleting Storage files:%s\n", rg)
 			sa.DeleteFiles(cfg.SoakClusterName)
 			cfg.Name = ""
@@ -120,7 +122,9 @@ func main() {
 			if err != nil {
 				log.Printf("Error while trying to download _output dir: %s, will provision a new cluster.\n", err)
 				log.Printf("Deleting Resource Group:%s\n", rg)
-				acct.DeleteGroup(rg, true)
+				if err := acct.DeleteGroupWithRetry(rg, true, 3*time.Second, cfg.Timeout); err != nil {
+					log.Printf("Error while trying to delete RG %s: %s\n", rg, err)
+				}
 				log.Printf("Deleting Storage files:%s\n", rg)
 				sa.DeleteFiles(cfg.SoakClusterName)
 				cfg.Name = ""
@@ -250,7 +254,9 @@ func teardown() {
 	if cfg.CleanUpOnExit {
 		for _, rg := range rgs {
 			log.Printf("Deleting Group:%s\n", rg)
-			acct.DeleteGroup(rg, false)
+			if err := acct.DeleteGroupWithRetry(rg, false, 3*time.Second, cfg.Timeout); err != nil {
+				log.Printf("Error while trying to delete RG %s: %s\n", rg, err)
+			}
 		}
 	}
 }
diff --git a/test/e2e/runner/cli_provisioner.go b/test/e2e/runner/cli_provisioner.go
index 36ca32480..c1682b7cd 100644
--- a/test/e2e/runner/cli_provisioner.go
+++ b/test/e2e/runner/cli_provisioner.go
@@ -118,7 +118,7 @@ func (cli *CLIProvisioner) provision() error {
 
 	os.Setenv("DNS_PREFIX", cli.Config.Name)
 
-	err := cli.Account.CreateGroup(cli.Config.Name, cli.Config.Location)
+	err := cli.Account.CreateGroupWithRetry(cli.Config.Name, cli.Config.Location, 3*time.Second, cli.Config.Timeout)
 	if err != nil {
 		return errors.Wrap(err, "Error while trying to create resource group")
 	}
@@ -142,16 +142,16 @@ func (cli *CLIProvisioner) provision() error {
 	if cli.CreateVNET {
 		if cli.MasterVMSS {
 			agentSubnetName := fmt.Sprintf("%sCustomSubnetAgent", cli.Config.Name)
-			err = cli.Account.CreateVnet(vnetName, "10.239.0.0/16")
+			err = cli.Account.CreateVnetWithRetry(vnetName, "10.239.0.0/16", 3*time.Second, cli.Config.Timeout)
 			if err != nil {
 				return errors.Errorf("Error trying to create vnet:%s", err.Error())
 			}
-			err = cli.Account.CreateSubnet(vnetName, masterSubnetName, "10.239.0.0/17")
+			err = cli.Account.CreateSubnetWithRetry(vnetName, masterSubnetName, "10.239.0.0/17", 3*time.Second, cli.Config.Timeout)
 			if err != nil {
 				return errors.Errorf("Error trying to create subnet:%s", err.Error())
 			}
 			subnets = append(subnets, masterSubnetName)
-			err = cli.Account.CreateSubnet(vnetName, agentSubnetName, "10.239.128.0/17")
+			err = cli.Account.CreateSubnetWithRetry(vnetName, agentSubnetName, "10.239.128.0/17", 3*time.Second, cli.Config.Timeout)
 			if err != nil {
 				return errors.Errorf("Error trying to create subnet in subnet:%s", err.Error())
 			}
@@ -159,18 +159,18 @@ func (cli *CLIProvisioner) provision() error {
 
 			agentSubnetID = fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s", cli.Account.SubscriptionID, cli.Account.ResourceGroup.Name, vnetName, agentSubnetName)
 		} else {
-			err = cli.Account.CreateVnet(vnetName, "10.239.0.0/16")
+			err = cli.Account.CreateVnetWithRetry(vnetName, "10.239.0.0/16", 3*time.Second, cli.Config.Timeout)
 			if err != nil {
 				return errors.Errorf("Error trying to create vnet:%s", err.Error())
 			}
-			err = cli.Account.CreateSubnet(vnetName, masterSubnetName, "10.239.255.0/24")
+			err = cli.Account.CreateSubnetWithRetry(vnetName, masterSubnetName, "10.239.255.0/24", 3*time.Second, cli.Config.Timeout)
 			if err != nil {
 				return errors.Errorf("Error trying to create subnet:%s", err.Error())
 			}
 			subnets = append(subnets, masterSubnetName)
 			for i, pool := range cs.ContainerService.Properties.AgentPoolProfiles {
 				subnetName := fmt.Sprintf("%sCustomSubnet", pool.Name)
-				err = cli.Account.CreateSubnet(vnetName, subnetName, fmt.Sprintf("10.239.%d.0/22", i*4))
+				err = cli.Account.CreateSubnetWithRetry(vnetName, subnetName, fmt.Sprintf("10.239.%d.0/22", i*4), 3*time.Second, cli.Config.Timeout)
 				if err != nil {
 					return errors.Errorf("Error trying to create subnet:%s", err.Error())
 				}
@@ -220,7 +220,7 @@ func (cli *CLIProvisioner) provision() error {
 
 	if cli.Config.IsKubernetes() {
 		// Store the hosts for future introspection
-		hosts, err := cli.Account.GetHosts(cli.Config.Name)
+		hosts, err := cli.Account.GetHostsWithRetry(cli.Config.Name, 3*time.Second, cli.Config.Timeout)
 		if err != nil {
-			return errors.Wrap(err, "GetHosts:%s")
+			return errors.Wrap(err, "GetHostsWithRetry")
 		}
@@ -277,7 +277,7 @@ func (cli *CLIProvisioner) generateAndDeploy() error {
 
 	//if we use Generate, then we need to call CreateDeployment
 	if !cli.Config.UseDeployCommand {
-		err = cli.Account.CreateDeployment(cli.Config.Name, cli.Engine)
+		err = cli.Account.CreateDeploymentWithRetry(cli.Config.Name, cli.Engine, 30*time.Second, 60*time.Minute)
 		if err != nil {
 			return errors.Wrap(err, "Error while trying to create deployment")
 		}