Introducing a flexible test tagging system:

Each test is now tagged, so we can control what runs and when. Tests tagged with 'bvt-*' are run on every check-in job in the build lab. Tests tagged with 'nightly-*' are run every night. The predicate mechanism embedded in the new tagging system allows a test to be tagged conditionally, by specifying a Python expression as the predicate. That allows us to selectively reduce the BVT test matrix and speed up execution of BVT runs in the lab.
2015-09-03 14:33:29 -07:00 · 2015-09-03 14:33:29 -07:00 · 7f0460ef5f
--- a/Tests/ParallelTraining/NoQuantization/DoublePrecision/testcases.yml
+++ b/Tests/ParallelTraining/NoQuantization/DoublePrecision/testcases.yml
@ -1,4 +1,10 @@
 dataDir: ../../Data
+tags:
+     # running on every BVT job in 'P' (Parallel) leg in Debug-GPU and Release-CPU configurations:
+     # Also skipping Release-CPU on linux to save time (for now: for an unknown reason it runs much longer there than on windows)
+     - bvt-p  ((flavor=='debug') ^ (device=='cpu')) and not (os=='linux' and device=='cpu')
+     # running unconditionally on every Nightly job in 'P' leg
+     - nightly-p

 testCases:
  Must train epochs in exactly same order and parameters for each MPI Rank:
--- a/Tests/ParallelTraining/NoQuantization/SinglePrecision/testcases.yml
+++ b/Tests/ParallelTraining/NoQuantization/SinglePrecision/testcases.yml
@ -1,5 +1,11 @@
 dataDir: ../../Data

+tags:
+     # running on every BVT job in 'P' (Parallel) leg in Debug-GPU and Release-CPU configurations:
+     - bvt-p  (flavor=='debug') ^ (device=='cpu')
+     # running unconditionally on every Nightly job in 'P' leg
+     - nightly-p
+
 testCases:
  Must train epochs in exactly same order and parameters for each MPI Rank:
    patterns:
--- a/Tests/Speech/LSTM/testcases.yml
+++ b/Tests/Speech/LSTM/testcases.yml
@ -1,4 +1,9 @@
 dataDir: ../Data
+tags:
+     # running on every BVT job in 'L' (LSTM) leg in Debug-GPU and Release-CPU configurations:
+     - bvt-l  (flavor=='debug') ^ (device=='cpu')
+     # running unconditionally on every Nightly job in 'L' leg
+     - nightly-l

 testCases:
  CNTK Run must be completed:
--- a/Tests/Speech/QuickE2E/testcases.yml
+++ b/Tests/Speech/QuickE2E/testcases.yml
@ -1,4 +1,9 @@
 dataDir: ../Data
+tags:
+     # running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
+     - bvt-s  (flavor=='debug') ^ (device=='cpu')
+     # running unconditionally on every Nightly job in 'S' leg
+     - nightly-s

 testCases:
  CNTK Run must be completed:
--- a/Tests/TestDriver.py
+++ b/Tests/TestDriver.py
@ -20,6 +20,10 @@
 #
 # ----- testcases.yml format -------
 # dataDir: <path> #<relative-path-to the data directory
+# tags: # optional tags - see tagging system
+#   - <tag1> <optional-predicate> 
+#   - <tag2> <optional-predicate>
+#   - ....
 #
 # testCases:
 #   <name of the testcase 1>:  
@ -62,7 +66,18 @@
 #   8. baseline.txt
 #        where <flavor> = { debug | release }
 #              <device> = { cpu | gpu }
-# 
+#
+# Baseline files are optional. They are only evaluated if the test defines one or more pattern-driven test cases.
+# If no test cases are defined, then TestDriver uses the exit code of the run-test script as the only criterion
+# for successful completion of the test.
+
+# ----- Tagging system ------
+# Unit tests can be optionally tagged with 1 or many tags
+# CNTK build/test lab uses those tags to understand which tests to run during different flavors of build jobs (nightly, BVT, checkin)
+#
+# A tag can optionally be predicated with a python boolean expression over the 'flavor' (debug/release), 'device' (cpu/gpu) and 'os' (windows/linux) variables.
+# This allows restricting the tagging of a test to specific combinations of those variables.
+#
 # ----- Algorithm ------
 # Baseline verification:
 #   For each testcase 
@ -80,7 +95,7 @@
 # matching against all test-cases/pattern simulteneously
 #

-import sys, os, argparse, traceback, yaml, subprocess, random, re, time
+import sys, os, argparse, traceback, yaml, subprocess, random, re, time, sets

 thisDir = os.path.dirname(os.path.realpath(__file__))
 windows = os.getenv("OS")=="Windows_NT"
@ -97,8 +112,10 @@ class Test:
    self.suite = suite
    self.name = name
    self.fullName = suite + "/" + name
+
    # computing location of test directory (yml file directory)
    self.testDir = os.path.dirname(pathToYmlFile)
+
    # parsing yml file with testcases 
    with open(pathToYmlFile, "r") as f:
      self.rawYamlData = yaml.safe_load(f.read())
@ -109,14 +126,42 @@ class Test:
    else:
      self.dataDir = self.testDir

-    testCasesYaml = self.rawYamlData["testCases"]
+    # parsing test cases
    self.testCases = []
-    for name in testCasesYaml.keys():
-      try:
-        self.testCases.append(TestCase(name, testCasesYaml[name]))
-      except Exception as e:
-        print >>sys.stderr, "ERROR registering test case: " + name
-        raise
+    if "testCases" in self.rawYamlData.keys():
+      testCasesYaml = self.rawYamlData["testCases"]
+      for name in testCasesYaml.keys():
+        try:
+          self.testCases.append(TestCase(name, testCasesYaml[name]))
+        except Exception as e:
+          print >>sys.stderr, "ERROR registering test case: " + name
+          raise 
+
+    # parsing all tags, example input:
+    # tags:
+    # - bvt-l  (flavor=='debug') ^ (device=='cpu')  # tag with a python predicate expression
+    # - nightly-l  #tag without a predicate
+    #
+    # Predicate expressions must produce boolean value and may refer to following variables: flavor, device, os
+    self.tags = {}
+    if self.rawYamlData["tags"]:
+      for tagLine in self.rawYamlData["tags"]:
+        tagLineSplit = tagLine.split(' ', 1) # splitting tag name from predicate expression
+        tagName = tagLineSplit[0].lower().strip()
+
+        # using specified python expression (or 'True' if former isn't provided)
+        pythonExpr = tagLineSplit[1] if len(tagLineSplit)==2 else "True"
+
+        # converting python expression into lambda and doing a smoke test by calling it with dummy parameters
+        predicate = lambda pythonExpr=pythonExpr, **kwargs: eval(pythonExpr, kwargs)
+        try:
+          assert(type(predicate(flavor='foo', device='var', os='foobar')) == bool)
+        except Exception as e:
+          print "Can't parse tag predicate expression in {0} ({1}):\n{2}".format(pathToYmlFile, pythonExpr, e)
+          raise e
+
+        # saving generated lambda into tags dictionary
+        self.tags[tagName] = predicate

  # Populates Tests.allTestsIndexedByFullName by scanning directory tree
  # and finding all testcases.yml files
@ -150,27 +195,30 @@ class Test:
    return result

  def runImpl(self, flavor, device, args):
-    # Locating and reading baseline file
-    baselineFile = self.findBaselineFile(flavor, device)
-    if baselineFile == None:
-      return TestRunResult.fatalError("Baseline file sanity check", "Can't find baseline file")
+    result = TestRunResult()
+    result.succeeded = True

-    with open(baselineFile, "r") as f:
-      baseline = f.read().split("\n")
-      if args.verbose:
-         print "Baseline:", baselineFile
+    # Preparation for pattern-based test cases
+    if len(self.testCases) > 0:
+      # Locating and reading baseline file
+      baselineFile = self.findBaselineFile(flavor, device)
+      if baselineFile == None:
+        return TestRunResult.fatalError("Baseline file sanity check", "Can't find baseline file")
+  
+      with open(baselineFile, "r") as f:
+        baseline = f.read().split("\n")
+        if args.verbose:
+           print "Baseline:", baselineFile

    # Before running the test, pre-creating TestCaseRunResult object for each test case
    # and compute filtered lines from baseline file.
    # Note: some test cases might fail at this time if baseline and/or patterns are inconsistant
-    result = TestRunResult()
-    result.succeeded = True
-    if not args.update_baseline:
-      for testCase in self.testCases:
-        testCaseRunResult = testCase.processBaseline(baseline)
-        if not testCaseRunResult.succeeded:
-           result.succeeded = False
-        result.testCaseRunResults.append(testCaseRunResult)
+      if not args.update_baseline:
+        for testCase in self.testCases:
+          testCaseRunResult = testCase.processBaseline(baseline)
+          if not testCaseRunResult.succeeded:
+             result.succeeded = False
+          result.testCaseRunResults.append(testCaseRunResult)
  
    # preparing run directory
    runDir = os.path.join(args.run_dir, "{0}_{1}@{2}_{3}".format(self.suite, self.name, flavor, device))
@ -236,7 +284,7 @@ class Test:
      if not testCaseRunResult.succeeded:
        result.succeeded = False

-    if args.update_baseline and result.succeeded:
+    if (self.testCases)>0 and args.update_baseline and result.succeeded:
      # When running in --update-baseline mode 
      # verifying that new output is succesfully matching every pattern in the testcases.yml
      # If this is not the case then baseline update will be rejected
@ -273,6 +321,20 @@ class Test:
            return fullPath
    return None

+  # Checks whether this test carries a tag matching the requested one for the given configuration.
+  #   tag                - requested tag; compared case-insensitively (tag names are lower-cased when parsed)
+  #   flavor, device, os - values passed to the tag's predicate as the 'flavor', 'device' and 'os' variables
+  # A stored tag matches when it equals the requested tag or extends it with a '-' suffix,
+  # e.g: 'bvt' matches 'bvt', 'bvt-a', 'bvt-b' but not 'bvtx'
+  # Returns the name of the first matching tag (in sorted order) whose predicate holds, or None if there is no match
+  def matchesTag(self, tag, flavor, device, os):
+    requested = tag.lower() # normalizing the tag for comparison
+    prefix = requested + "-"
+    # iterating in sorted order, so that the result is deterministic when several tags match
+    for candidate in sorted(self.tags):
+      if candidate == requested or candidate.startswith(prefix):
+        # the tag only applies if its predicate accepts this configuration
+        if self.tags[candidate](flavor=flavor, device=device, os=os):
+          return candidate
+    return None
+
 # This class encapsulates one testcase (in testcases.yml file)
 class TestCase:
  def __init__(self, name, yamlNode):
@ -451,8 +513,22 @@ class TestCaseRunResult:

 # Lists all available tests
+# When --tag is given, every test is checked against it for each flavor/device/os combination and
+# the matching tests are grouped by the tag they matched: one "<tag>: <test> <test> ..." line per tag.
+# Without --tag all test names are printed on a single line.
 def listCommand(args):
-  for t in Test.allTestsIndexedByFullName.values():
-    print t.fullName
+  # maps matched tag name ('*' when no --tag filter is given) to the set of full test names
+  testsByTag = {}
+  for test in Test.allTestsIndexedByFullName.values():
+    for flavor in args.flavors:
+      for device in args.devices:
+        for targetOs in args.oses:
+          tag = test.matchesTag(args.tag, flavor, device, targetOs) if args.tag else '*'
+          if tag:
+            # built-in set (instead of the deprecated sets.Set) de-duplicates tests matched in several configurations
+            testsByTag.setdefault(tag, set()).add(test.fullName)
+  for tag in sorted(testsByTag):
+    names = ' '.join(sorted(testsByTag[tag]))
+    if tag=="*":
+      print names
+    else:
+      print tag+":", names

 # Runs given test(s) or all tests
 def runCommand(args):
@ -466,21 +542,9 @@ def runCommand(args):
         return 1
  else:
     testsToRun = Test.allTestsIndexedByFullName.values()
-  devices = ["cpu", "gpu"]
-  if (args.device):
-    args.device = args.device.lower()
-    if not args.device in devices:
-      print >>sys.stderr, "--device must be one of", devices
-      return 1
-    devices = [args.device]

-  flavors = ["debug", "release"]
-  if (args.flavor):
-    args.flavor = args.flavor.lower()
-    if not args.flavor in flavors:
-      print >>sys.stderr, "--flavor must be one of", flavors
-      return 1
-    flavors = [args.flavor]
+  devices = args.devices
+  flavors = args.flavors

  print "CNTK Test Driver is started"
  print "Running tests:  ", " ".join([y.fullName for y in testsToRun])
@ -495,16 +559,24 @@ def runCommand(args):
  for test in testsToRun:
    for flavor in flavors:
      for device in devices:
+        if args.tag and args.tag != '' and not test.matchesTag(args.tag, flavor, device, 'windows' if windows else 'linux'):
+          continue
        totalCount = totalCount + 1
+        if len(test.testCases)==0:
+          # forcing verbose mode (showing all output) for all test which are based on exit code (no pattern-based test cases)
+          args.verbose = True
        # Printing the test which is about to run (without terminating the line)
        sys.stdout.write("Running test {0} ({1} {2}) - ".format(test.fullName, flavor, device));
+        if args.dry_run:
+           print "[SKIPPED] (dry-run)"
+           continue
        # in verbose mode, terminate the line, since there will be a lot of output
        if args.verbose:
          sys.stdout.write("\n");
        sys.stdout.flush()
        # Running the test and collecting a run results
        result = test.run(flavor, device, args)
-      
+
        if args.verbose:
          # writing the test name one more time (after possibly long verbose output)
          sys.stdout.write("Test finished {0} ({1} {2}) - ".format(test.fullName, flavor, device));
@ -549,17 +621,24 @@ runSubparser.add_argument("test", nargs="*",
 defaultBuildLocation=os.path.realpath(os.path.join(thisDir, "..", "x64" if windows else "build"))

 runSubparser.add_argument("-b", "--build-location", default=defaultBuildLocation, help="location of the CNTK build to run")
-runSubparser.add_argument("-d", "--device", help="cpu|gpu - run on a specific device")
-runSubparser.add_argument("-f", "--flavor", help="release|debug - run only a specific flavor")
+# --tag restricts the run to tests carrying a matching tag; --device and --flavor narrow the configurations to run
+runSubparser.add_argument("-t", "--tag", help="runs tests which match the specified tag")
+runSubparser.add_argument("-d", "--device", help="cpu|gpu - run on a specified device")
+runSubparser.add_argument("-f", "--flavor", help="release|debug - run only a specified flavor")
 tmpDir = os.getenv("TEMP") if windows else "/tmp"
 defaultRunDir=os.path.join(tmpDir, "cntk-test-{0}.{1}".format(time.strftime("%Y%m%d%H%M%S"), random.randint(0,1000000)))
 runSubparser.add_argument("-r", "--run-dir", default=defaultRunDir, help="directory where to store test output, default: a random dir within /tmp")
 runSubparser.add_argument("--update-baseline", action='store_true', help="update baseline file(s) instead of matching them")
 runSubparser.add_argument("-v", "--verbose", action='store_true', help="verbose output - dump all output of test script")
+runSubparser.add_argument("-n", "--dry-run", action='store_true', help="do not run the tests, only print test names and configurations to be run")

 runSubparser.set_defaults(func=runCommand)

 listSubparser = subparsers.add_parser("list", help="list available tests")
+# the 'list' sub-command takes the same --tag/--device/--flavor filters as 'run', plus --os
+listSubparser.add_argument("-t", "--tag", help="limits the resulting list to tests matching the specified tag")
+listSubparser.add_argument("-d", "--device", help="cpu|gpu - tests for a specified device")
+listSubparser.add_argument("-f", "--flavor", help="release|debug - tests for a specified flavor")
+listSubparser.add_argument("--os", help="windows|linux - tests for a specified operating system")
+
 listSubparser.set_defaults(func=listCommand)

 if len(sys.argv)==1:
@ -568,6 +647,32 @@ if len(sys.argv)==1:

 args = parser.parse_args(sys.argv[1:])

+# parsing the --device, --flavor and --os options:
+# each args.<option>s list starts with all supported values and is narrowed to a single value
+# only when the corresponding option was given on the command line
+args.devices = ["cpu", "gpu"]
+if (args.device):
+  args.device = args.device.lower()
+  if not args.device in args.devices:
+    print >>sys.stderr, "--device must be one of", args.devices
+    sys.exit(1)
+  args.devices = [args.device]
+
+args.flavors = ["debug", "release"]
+if (args.flavor):
+  args.flavor = args.flavor.lower()
+  if not args.flavor in args.flavors:
+    print >>sys.stderr, "--flavor must be one of", args.flavors
+    sys.exit(1)
+  args.flavors = [args.flavor]
+
+# --os is registered on the 'list' sub-command, so it is only examined for that command
+if args.func == listCommand:
+  args.oses = ["windows", "linux"]
+  if (args.os):
+    args.os = args.os.lower()
+    if not args.os in args.oses:
+      print >>sys.stderr, "--os must be one of", args.oses
+      sys.exit(1)
+    # narrowing only when --os was actually given; otherwise both operating systems stay listed
+    # (doing this unconditionally would turn the list into [None])
+    args.oses = [args.os]
+
 # discover all the tests
 Test.discoverAllTests()