diff --git a/e2e/vmss.go b/e2e/vmss.go index 4138a0659d1..3b1b9202d45 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -274,9 +274,22 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) delay := 5 * time.Second retryOn := func(err error) bool { var respErr *azcore.ResponseError + // only retry on Azure API errors with specific error codes + if !errors.As(err, &respErr) { + return false + } // AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps // It's not a quota issue - return errors.As(err, &respErr) && respErr.StatusCode == 200 && respErr.ErrorCode == "AllocationFailed" + if respErr.StatusCode == 200 && respErr.ErrorCode == "AllocationFailed" { + return true + } + // GalleryImageNotFound can happen transiently after image replication completes + // due to Azure eventual consistency - the gallery API reports success but the + // compute fabric in the target region hasn't fully propagated the image yet + if respErr.StatusCode == 404 && respErr.ErrorCode == "GalleryImageNotFound" { + return true + } + return false } maxAttempts := 10