{CI} Split automation full test and automatic schedule test modules (#21976)

* test speed up * test speed up * Update automation_test2.yml * update * Update automation_test2.yml * Update automation_test2.yml * update * update * update * Update automation_full_test.py * update * update * update * Update automation_full_test.py * Update automation_full_test.py * Update automation_full_test.py * Update automation_full_test.py * test speed up * update * Delete loa1.pdf * update * update * update * instance * add example * add example * update example * update * update * update * Update azure-pipelines-full-tests.yml
2022-04-20 19:43:10 +08:00 · 2022-04-20 19:43:10 +08:00 · 9bf6fcfc71
--- a/.azure-pipelines/templates/automation_test.yml
+++ b/.azure-pipelines/templates/automation_test.yml
@ -7,6 +7,14 @@ parameters:
 - name: profile
  type: string
  default: latest
+- name: instance_cnt
+  type: string
+  default: '1'
+- name: instance_idx
+  type: string
+  default: '1'
+  # instance_cnt = 8, instance_idx = 3: there are 8 instances in total, and this run schedules the modules assigned to the third instance
+  # instance_cnt = 1, instance_idx = 1: there is only 1 instance, so all modules run on it and no scheduling is needed
 - name: fullTest
  displayName: Run full test?
  type: boolean
@ -56,7 +64,7 @@ steps:
        azdev test --no-exitfirst --repo=./ --src=HEAD --tgt=origin/$(System.PullRequest.TargetBranch) --cli-ci --profile ${{ parameters.profile }} --verbose --series --pytest-args "--durations=0"
      else
        echo "Running full test"
-        python scripts/ci/automation_full_test.py "${{ parameters.profile }}" "$serial_modules"
+        python scripts/ci/automation_full_test.py "${{ parameters.instance_cnt }}" "${{ parameters.instance_idx }}" "${{ parameters.profile }}" "$serial_modules"
      fi
    displayName: "azdev test"
    env:
--- a/azure-pipelines-full-tests.yml
+++ b/azure-pipelines-full-tests.yml
@ -13,26 +13,6 @@ pr:
      - '*'

 jobs:
- job: AutomationTest
-  displayName: Automation Test (Profile Latest)
-  timeoutInMinutes: 120
-  pool:
-    vmImage: 'ubuntu-20.04'
-  strategy:
-    matrix:
-      Python36:
-        python.version: '3.6'
-      Python38:
-        python.version: '3.8'
-      Python310:
-        python.version: '3.10'
-  steps:
-    - template: .azure-pipelines/templates/automation_test.yml
-      parameters:
-        pythonVersion: '$(python.version)'
-        profile: 'latest'
-        fullTest: true
-
 - job: AutomationTest20200901
  displayName: Automation Test (Profile 2020-09-01)
  timeoutInMinutes: 120
@ -92,3 +72,102 @@ jobs:
        pythonVersion: '$(python.version)'
        profile: '2018-03-01-hybrid'
        fullTest: true
+
+- job: AutomationFullTestPython36ProfileLatest
+  displayName: Automation Full Test Python36 Profile Latest
+  timeoutInMinutes: 9999
+  strategy:
+    maxParallel: 8
+    matrix:
+      instance1:
+        Instance_idx: 1
+      instance2:
+        Instance_idx: 2
+      instance3:
+        Instance_idx: 3
+      instance4:
+        Instance_idx: 4
+      instance5:
+        Instance_idx: 5
+      instance6:
+        Instance_idx: 6
+      instance7:
+        Instance_idx: 7
+      instance8:
+        Instance_idx: 8
+  pool:
+    vmImage: 'ubuntu-20.04'
+  steps:
+    - template: .azure-pipelines/templates/automation_test.yml
+      parameters:
+        pythonVersion: '3.6'
+        profile: 'latest'
+        instance_cnt: '8'
+        instance_idx: '$(Instance_idx)'
+        fullTest: true
+
+- job: AutomationFullTestPython38ProfileLatest
+  displayName: Automation Full Test Python38 Profile Latest
+  timeoutInMinutes: 9999
+  strategy:
+    maxParallel: 8
+    matrix:
+      instance1:
+        Instance_idx: 1
+      instance2:
+        Instance_idx: 2
+      instance3:
+        Instance_idx: 3
+      instance4:
+        Instance_idx: 4
+      instance5:
+        Instance_idx: 5
+      instance6:
+        Instance_idx: 6
+      instance7:
+        Instance_idx: 7
+      instance8:
+        Instance_idx: 8
+  pool:
+    vmImage: 'ubuntu-20.04'
+  steps:
+    - template: .azure-pipelines/templates/automation_test.yml
+      parameters:
+        pythonVersion: '3.8'
+        profile: 'latest'
+        instance_cnt: '8'
+        instance_idx: '$(Instance_idx)'
+        fullTest: true
+
+- job: AutomationFullTestPython310ProfileLatest
+  displayName: Automation Full Test Python310 Profile Latest
+  timeoutInMinutes: 9999
+  strategy:
+    maxParallel: 8
+    matrix:
+      instance1:
+        Instance_idx: 1
+      instance2:
+        Instance_idx: 2
+      instance3:
+        Instance_idx: 3
+      instance4:
+        Instance_idx: 4
+      instance5:
+        Instance_idx: 5
+      instance6:
+        Instance_idx: 6
+      instance7:
+        Instance_idx: 7
+      instance8:
+        Instance_idx: 8
+  pool:
+    vmImage: 'ubuntu-20.04'
+  steps:
+    - template: .azure-pipelines/templates/automation_test.yml
+      parameters:
+        pythonVersion: '3.10'
+        profile: 'latest'
+        instance_cnt: '8'
+        instance_idx: '$(Instance_idx)'
+        fullTest: true
--- a/scripts/ci/automation_full_test.py
+++ b/scripts/ci/automation_full_test.py
@ -15,15 +15,120 @@ logger.setLevel(logging.DEBUG)
 ch = logging.StreamHandler()
 ch.setLevel(logging.DEBUG)
 logger.addHandler(ch)
-profile = sys.argv[1]
-serial_modules = sys.argv[2].split()
+
+# sys.argv (instance_cnt, instance_idx, profile, serial_modules) is passed by .azure-pipelines/templates/automation_test.yml in the `Running full test` branch
+instance_cnt = int(sys.argv[1])
+instance_idx = int(sys.argv[2])
+profile = sys.argv[3]
+serial_modules = sys.argv[4].split()
+jobs = {
+            'acr': 45,
+            'acs': 62,
+            'advisor': 18,
+            'ams': 136,
+            'apim': 30,
+            'appconfig': 41,
+            'appservice': 150,  # series
+            # 'appservice': 157,  # parallel
+            'aro': 33,
+            'backup': 76,
+            'batch': 21,
+            'batchai': 24,
+            'billing': 21,
+            'botservice': 25,  # series
+            # 'botservice': 28,  # parallel
+            'cdn': 36,
+            'cloud': 18,  # series
+            # 'cloud': 22,  # parallel
+            'cognitiveservices': 24,
+            'config': 21,
+            'configure': 17,
+            'consumption': 21,
+            'container': 19,
+            'cosmosdb': 45,
+            'databoxedge': 25,
+            'deploymentmanager': 18,
+            'dla': 19,
+            'dls': 22,
+            'dms': 22,
+            'eventgrid': 24,
+            'eventhubs': 24,
+            'extension': 0,
+            'feedback': 31,
+            'find': 22,
+            'hdinsight': 34,
+            'identity': 18,
+            'interactive': 18,
+            'iot': 57,
+            'keyvault': 39,
+            'kusto': 23,
+            'lab': 19,
+            'managedservices': 18,
+            'maps': 19,
+            'marketplaceordering': 18,
+            'monitor': 66,
+            'natgateway': 22,
+            'netappfiles': 48,
+            'network': 364,  # series
+            # 'network': 182,  # parallel
+            'policyinsights': 20,
+            'privatedns': 29,
+            'profile': 20,
+            'rdbms': 89,
+            'redis': 31,
+            'relay': 22,
+            'reservations': 20,
+            'resource': 101,
+            'role': 38,
+            'search': 34,
+            'security': 23,
+            'servicebus': 24,
+            'serviceconnector': 56,
+            'servicefabric': 49,
+            'signalr': 20,
+            'sql': 117,
+            'sqlvm': 31,
+            'storage': 108,
+            'synapse': 45,
+            'util': 18,
+            'vm': 313,
+            'azure-cli': 16,
+            'azure-cli-core': 26,
+            'azure-cli-telemetry': 18,
+            'azure-cli-testsdk': 20,
+        }


 class AutomaticScheduling(object):

    def __init__(self):
+        """
+        self.jobs: (module, test time) pairs, sorted by test time in descending order
+        self.modules: All modules and core, ignoring extensions
+        self.serial_modules: All modules which need to be executed in serial mode
+        self.works: One dict per instance, recording which modules that instance needs to test
+        self.instance_cnt:
+        The total number of concurrent automation full test pipeline instances for a specific python version
+        Because we share the vm pool with the azure-sdk team, we can't set the concurrency arbitrarily
+        Best practice is to keep the number of concurrent tasks below 50
+        Setting a larger concurrency will leave many instances in the waiting state
+        The network module has the largest number of test cases and can only be tested serially for now, so instance_cnt = 8 is sufficient
+        Total concurrency: AutomationTest20200901 * 3 + AutomationTest20190301 * 3 + AutomationTest20180301 * 3 + AutomationFullTest * 8 * 3 (python versions) = 33
+        self.instance_idx:
+        The 1-based index of this automation full test pipeline instance for a specific python version
+        For example:
+        instance_cnt = 8, instance_idx = 3: there are 8 instances in total, and this run schedules the modules assigned to the third instance
+        instance_cnt = 1, instance_idx = 1: there is only 1 instance, so all modules run on it and no scheduling is needed
+        """
+        self.jobs = []
        self.modules = {}
        self.serial_modules = serial_modules
+        self.works = []
+        self.instance_cnt = instance_cnt
+        self.instance_idx = instance_idx
+        for i in range(self.instance_cnt):
+            worker = {}
+            self.works.append(worker)
        self.profile = profile

    def get_all_modules(self):
@ -31,12 +136,46 @@ class AutomaticScheduling(object):
        # only get modules and core, ignore extensions
        self.modules = {**result['mod'], **result['core']}

-    def run_modules(self):
-        # divide all modules into parallel or serial execution
+    def append_new_modules(self):
+        # A module with no recorded test time (e.g. a newly added module) is assigned the average test time
+        avg_cost = int(sum(jobs.values()) / len(jobs.values()))
+        for module in self.modules:
+            if module not in jobs.keys():
+                jobs[module] = avg_cost
+        # sort jobs by time cost (desc)
+        self.jobs = sorted(jobs.items(), key=lambda item: -item[1])
+
+    def get_worker(self):
+        """
+        Use a greedy algorithm to distribute jobs across the workers
+        Each job is assigned to the worker with the smallest total test time so far (the first such worker on ties)
+        :return: index of the selected worker
+        """
+        for idx, worker in enumerate(self.works):
+            tmp_time = sum(worker.values()) if sum(worker.values()) else 0
+            if idx == 0:
+                worker_time = tmp_time
+                worker_num = idx
+            if tmp_time < worker_time:
+                worker_time = tmp_time
+                worker_num = idx
+        return worker_num
+
+    def get_instance_modules(self):
+        # distribute all jobs across the workers, then return the modules assigned to this pipeline instance
+        for k, v in self.jobs:
+            idx = self.get_worker()
+            self.works[idx][k] = v
+        # instance_idx is 1-based (1~n), while python list indices are 0-based (0~n-1)
+        self.instance_idx -= 1
+        return self.works[self.instance_idx]
+
+    def run_instance_modules(self, instance_modules):
+        # divide the modules that the current instance needs to execute into parallel or serial execution
        error_flag = False
        serial_tests = []
        parallel_tests = []
-        for k, v in self.modules.items():
+        for k, v in instance_modules.items():
            if k in self.serial_modules:
                serial_tests.append(k)
            else:
@ -62,9 +201,11 @@ class AutomaticScheduling(object):

 def main():
    logger.info("Start automation full test ...\n")
-    autoschduling = AutomaticScheduling()
-    autoschduling.get_all_modules()
-    sys.exit(1) if autoschduling.run_modules() else sys.exit(0)
+    autoscheduling = AutomaticScheduling()
+    autoscheduling.get_all_modules()
+    autoscheduling.append_new_modules()
+    instance_modules = autoscheduling.get_instance_modules()
+    sys.exit(1) if autoscheduling.run_instance_modules(instance_modules) else sys.exit(0)


 if __name__ == '__main__':