From 7f65e0f0d69f12bfb3bd8223d26e55f98e35dc0b Mon Sep 17 00:00:00 2001
From: Young Kim <youngseongkim93@gmail.com>
Date: Thu, 29 Jun 2017 16:26:20 -0700
Subject: [PATCH] Clean up crlf and mixed line ending issues. (#391)

---
 CMakeSettings.json                            |  728 +-
 docs/DXIL.rst                                 | 5960 ++++++------
 lib/HLSL/DxilOutputColorBecomesConstant.cpp   |  348 +-
 .../clang/test/HLSL/pix/constantcolorMRT.hlsl |   20 +-
 .../test/HLSL/pix/constantcolorOtherSIVs.hlsl |    2 +-
 .../test/HLSL/pix/constantcolorUAVs.hlsl      |   20 +-
 .../clang/test/HLSL/pix/constantcolorint.hlsl |    2 +-
 tools/clang/unittests/HLSL/ExecutionTest.cpp  | 8640 ++++++++---------
 tools/clang/unittests/HLSL/HlslTestUtils.h    |  666 +-
 .../unittests/HLSL/ShaderOpArithTable.xml     | 6840 ++++++-------
 .../clang/unittests/HLSL/clang-hlsl-tests.rc  |    4 +-
 11 files changed, 11615 insertions(+), 11615 deletions(-)

diff --git a/CMakeSettings.json b/CMakeSettings.json
index 0039321bc..a49709d5b 100644
--- a/CMakeSettings.json
+++ b/CMakeSettings.json
@@ -1,365 +1,365 @@
-{
-    // See https://go.microsoft.com//fwlink//?linkid=834763 for more information about this file.
-    "configurations": [
-        {
-        "name": "x64-Debug",
-        "generator": "Visual Studio 15 2017 Win64",
-        "configurationType" : "Debug",
-        "buildRoot": "${projectDir}\\..\\hlsl.bin",
-        "cmakeCommandArgs":  "",
-        "buildCommandArgs": "-m -v:minimal",
-        "variables": [
-          {
-            "name": "DXC_BUILD_ARCH",
-            "value": "x64"
-          },
-          {
-            "name": "CLANG_ENABLE_ARCMT:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "HLSL_INCLUDE_TESTS:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_TARGETS_TO_BUILD:STRING",
-            "value": "None"
-          },
-          {
-            "name": "LLVM_INCLUDE_DOCS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LIBCLANG_BUILD_STATIC:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_REQUIRES_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_APPEND_VC_REV:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
-            "value": "dxil-ms-dx"
-          },
-          {
-            "name": "CLANG_BUILD_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "-DCLANG_CL:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DCMAKE_SYSTEM_VERSION",
-            "value": "10.0.14393.0"
-          }
-        ]
-        },
-        {
-        "name": "x86-Debug",
-        "generator": "Visual Studio 15 2017",
-        "configurationType" : "Debug",
-        "buildRoot": "${projectDir}\\..\\hlsl.bin.${name}",
-        "cmakeCommandArgs":  "",
-        "buildCommandArgs": "-m -v:minimal",
-        "variables": [
-          {
-            "name": "DXC_BUILD_ARCH",
-            "value": "Win32"
-          },
-          {
-            "name": "CLANG_ENABLE_ARCMT:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "HLSL_INCLUDE_TESTS:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_TARGETS_TO_BUILD:STRING",
-            "value": "None"
-          },
-          {
-            "name": "LLVM_INCLUDE_DOCS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LIBCLANG_BUILD_STATIC:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_REQUIRES_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_APPEND_VC_REV:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
-            "value": "dxil-ms-dx"
-          },
-          {
-            "name": "CLANG_BUILD_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "-DCLANG_CL:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DCMAKE_SYSTEM_VERSION",
-            "value": "10.0.14393.0"
-          }
-        ]
-        },
-        {
-        "name": "x86-Release",
-        "generator": "Visual Studio 15 2017",
-        "configurationType" : "Release",
-        "buildRoot": "${projectDir}\\..\\hlsl.bin.${name}",
-        "cmakeCommandArgs":  "",
-        "buildCommandArgs": "-m -v:minimal",
-        "variables": [
-          {
-            "name": "DXC_BUILD_ARCH",
-            "value": "Win32"
-          },
-          {
-            "name": "CLANG_ENABLE_ARCMT:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "HLSL_INCLUDE_TESTS:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_TARGETS_TO_BUILD:STRING",
-            "value": "None"
-          },
-          {
-            "name": "LLVM_INCLUDE_DOCS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LIBCLANG_BUILD_STATIC:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_REQUIRES_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_APPEND_VC_REV:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
-            "value": "dxil-ms-dx"
-          },
-          {
-            "name": "CLANG_BUILD_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "-DCLANG_CL:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DCMAKE_SYSTEM_VERSION",
-            "value": "10.0.14393.0"
-          }
-        ]
-        },
-        {
-        "name": "x64-Release",
-        "generator": "Visual Studio 15 2017 Win64",
-        "configurationType" : "Release",
-        "buildRoot": "${projectDir}\\..\\hlsl.bin.${name}",
-        "cmakeCommandArgs":  "",
-        "buildCommandArgs": "-m -v:minimal",
-        "variables": [
-          {
-            "name": "DXC_BUILD_ARCH",
-            "value": "x64"
-          },
-          {
-            "name": "CLANG_ENABLE_ARCMT:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "CLANG_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_TESTS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "HLSL_INCLUDE_TESTS:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_TARGETS_TO_BUILD:STRING",
-            "value": "None"
-          },
-          {
-            "name": "LLVM_INCLUDE_DOCS:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LIBCLANG_BUILD_STATIC:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "LLVM_REQUIRES_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_APPEND_VC_REV:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_ENABLE_EH:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
-            "value": "dxil-ms-dx"
-          },
-          {
-            "name": "CLANG_BUILD_EXAMPLES:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
-            "value": "ON"
-          },
-          {
-            "name": "-DCLANG_CL:BOOL",
-            "value": "OFF"
-          },
-          {
-            "name": "-DCMAKE_SYSTEM_VERSION",
-            "value": "10.0.14393.0"
-          }
-        ]
-        }
-    ]
+{
+    // See https://go.microsoft.com//fwlink//?linkid=834763 for more information about this file.
+    "configurations": [
+        {
+        "name": "x64-Debug",
+        "generator": "Visual Studio 15 2017 Win64",
+        "configurationType" : "Debug",
+        "buildRoot": "${projectDir}\\..\\hlsl.bin",
+        "cmakeCommandArgs":  "",
+        "buildCommandArgs": "-m -v:minimal",
+        "variables": [
+          {
+            "name": "DXC_BUILD_ARCH",
+            "value": "x64"
+          },
+          {
+            "name": "CLANG_ENABLE_ARCMT:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "HLSL_INCLUDE_TESTS:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_TARGETS_TO_BUILD:STRING",
+            "value": "None"
+          },
+          {
+            "name": "LLVM_INCLUDE_DOCS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LIBCLANG_BUILD_STATIC:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_REQUIRES_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_APPEND_VC_REV:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
+            "value": "dxil-ms-dx"
+          },
+          {
+            "name": "CLANG_BUILD_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "-DCLANG_CL:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DCMAKE_SYSTEM_VERSION",
+            "value": "10.0.14393.0"
+          }
+        ]
+        },
+        {
+        "name": "x86-Debug",
+        "generator": "Visual Studio 15 2017",
+        "configurationType" : "Debug",
+        "buildRoot": "${projectDir}\\..\\hlsl.bin.${name}",
+        "cmakeCommandArgs":  "",
+        "buildCommandArgs": "-m -v:minimal",
+        "variables": [
+          {
+            "name": "DXC_BUILD_ARCH",
+            "value": "Win32"
+          },
+          {
+            "name": "CLANG_ENABLE_ARCMT:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "HLSL_INCLUDE_TESTS:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_TARGETS_TO_BUILD:STRING",
+            "value": "None"
+          },
+          {
+            "name": "LLVM_INCLUDE_DOCS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LIBCLANG_BUILD_STATIC:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_REQUIRES_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_APPEND_VC_REV:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
+            "value": "dxil-ms-dx"
+          },
+          {
+            "name": "CLANG_BUILD_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "-DCLANG_CL:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DCMAKE_SYSTEM_VERSION",
+            "value": "10.0.14393.0"
+          }
+        ]
+        },
+        {
+        "name": "x86-Release",
+        "generator": "Visual Studio 15 2017",
+        "configurationType" : "Release",
+        "buildRoot": "${projectDir}\\..\\hlsl.bin.${name}",
+        "cmakeCommandArgs":  "",
+        "buildCommandArgs": "-m -v:minimal",
+        "variables": [
+          {
+            "name": "DXC_BUILD_ARCH",
+            "value": "Win32"
+          },
+          {
+            "name": "CLANG_ENABLE_ARCMT:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "HLSL_INCLUDE_TESTS:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_TARGETS_TO_BUILD:STRING",
+            "value": "None"
+          },
+          {
+            "name": "LLVM_INCLUDE_DOCS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LIBCLANG_BUILD_STATIC:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_REQUIRES_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_APPEND_VC_REV:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
+            "value": "dxil-ms-dx"
+          },
+          {
+            "name": "CLANG_BUILD_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "-DCLANG_CL:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DCMAKE_SYSTEM_VERSION",
+            "value": "10.0.14393.0"
+          }
+        ]
+        },
+        {
+        "name": "x64-Release",
+        "generator": "Visual Studio 15 2017 Win64",
+        "configurationType" : "Release",
+        "buildRoot": "${projectDir}\\..\\hlsl.bin.${name}",
+        "cmakeCommandArgs":  "",
+        "buildCommandArgs": "-m -v:minimal",
+        "variables": [
+          {
+            "name": "DXC_BUILD_ARCH",
+            "value": "x64"
+          },
+          {
+            "name": "CLANG_ENABLE_ARCMT:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_ENABLE_STATIC_ANALYZER:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "CLANG_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_TESTS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "HLSL_INCLUDE_TESTS:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_TARGETS_TO_BUILD:STRING",
+            "value": "None"
+          },
+          {
+            "name": "LLVM_INCLUDE_DOCS:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_INCLUDE_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LIBCLANG_BUILD_STATIC:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_OPTIMIZED_TABLEGEN:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "LLVM_REQUIRES_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_APPEND_VC_REV:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_ENABLE_EH:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "LLVM_DEFAULT_TARGET_TRIPLE:STRING",
+            "value": "dxil-ms-dx"
+          },
+          {
+            "name": "CLANG_BUILD_EXAMPLES:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DLLVM_REQUIRES_RTTI:BOOL",
+            "value": "ON"
+          },
+          {
+            "name": "-DCLANG_CL:BOOL",
+            "value": "OFF"
+          },
+          {
+            "name": "-DCMAKE_SYSTEM_VERSION",
+            "value": "10.0.14393.0"
+          }
+        ]
+        }
+    ]
 }
\ No newline at end of file
diff --git a/docs/DXIL.rst b/docs/DXIL.rst
index e6fca745c..1641836cd 100644
--- a/docs/DXIL.rst
+++ b/docs/DXIL.rst
@@ -1,2980 +1,2980 @@
-=============================
-DirectX Intermediate Language
-=============================
-
-.. contents::
-   :local:
-   :depth: 2
-
-Introduction
-============
-
-This document presents the design of the DirectX Intermediate Language (DXIL) for GPU shaders. DXIL is intended to support a direct mapping of the HLSL programming language into Low-Level Virtual Machine Intermediate Representation (LLVM IR), suitable for consumption in GPU drivers. This version of the specification is based on LLVM 3.7 in the use of metadata syntax.
-
-We distinguish between DXIL, which is a low-level IR for GPU driver compilers, and DXIR, which is a high-level IR, more suitable for emission by IR producers, such as Clang. DXIR is transformed to DXIL by the optimizer. DXIR accepts high-level constructs, such as user-defined types, multi-dimensional arrays, matrices, and vectors. These, however, are not suitable for fast JIT-ing in the driver compilers, and so are lowered by the optimizer, such that DXIL works on simpler abstractions. Both DXIL and DXIR are derived from LLVM IR. This document does not describe DXIR.
-
-LLVM is quickly becoming a de facto standard in modern compilation technology. The LLVM framework offers several distinct features, such as a vibrant ecosystem, complete compilation framework, modular design, and reasonable documentation. We can leverage these to achieve two important objectives.
-
-First, unification of shader compilation tool chain. DXIL is a contract between IR producers, such as compilers for HLSL and other domain-specific languages, and IR consumers, such as IHV driver JIT compilers or offline XBOX shader compiler. In addition, the design provides for conversion of the current HLSL IL, called DXBC IL in this document, to DXIL.
-
-Second, leveraging the LLVM ecosystem. Microsoft will publicly document DXIL and DXIR to attract domain language implementers and spur innovation. Using LLVM-based IR offers reduced entry costs for small teams, simply because small teams are likely to use LLVM and Clang as their main compilation framework. We will provide a DXIL verifier to check consistency of generated DXIL.
-
-The following diagram shows how some of these components tie together::
-
-  HLSL   Other shading langs  DSL          DXBC IL
-  +      +                    +            +
-  |      |                    |            |
-  v      v                    v            v
-  Clang  Clang                Other Tools  dxbc2dxil
-  +      +                    +            +
-  |      |                    |            |
-  v      v                    v            |
-  +------+--------------------+---------+  |
-  |          High level IR (DXIR)       |  |
-  +-------------------------------------+  |
-                    |                      |
-                    |                      |
-                    v                      |
-                Optimizer <-----+ Linker   |
-                +      ^             +     |
-                |      |             |     |
-                |      |             |     |
-   +------------v------+-------------v-----v-------+
-   |              Low level IR (DXIL)              |
-   +------------+----------------------+-----------+
-                |                      |
-                v                      v
-        Driver Compiler             Verifier
-
-The *dxbc2dxil* element in the diagram is a component that converts existing DXBC shader byte code into DXIL. The *Optimizer* element is a component that consumes DXIR, verifies it is valid, optimizes it, and produces a valid DXIL form. The *Verifier* element is a public component that verifies and signs DXIL. The *Linker* is a component that combines precompiled DXIL libraries with the entry function to produce a valid shader.
-
-DXIL does not support the following HLSL features that were present in prior implementations.
-
-* Shader models 9 and below. Microsoft may implement 10level9 shader models via DXIL capability tiers.
-* Effects.
-* HLSL interfaces.
-* Shader compression/decompression.
-* Partial precision. Half data type should be used instead.
-* min10float type. Half data type should be used instead.
-* HLSL *uniform* parameter qualifier.
-* Current fxc legacy compatibility mode for old shader models (e.g., c-register binding).
-* PDB. Debug Information annotations are used instead.
-* Compute shader model cs_4_0.
-* DXBC label, call, fcall constructs.
-
-The following principles are used to ease reuse with LLVM components and aid extensibility.
-
-* DXIL uses a subset of LLVM IR constructs that makes sense for HLSL.
-* No modifications to the core LLVM IR; i.e., no new instructions or fundamental types.
-* Additional information is conveyed via metadata, LLVM intrinsics or external functions.
-* Name prefixes: 'llvm.dx.', 'llvm.dxil.', 'llvm.dxir.', 'dx.', 'dxil.', and 'dxir.' are reserved.
-
-LLVM IR has three equivalent forms: human-readable, binary (bitcode), and in-memory. DXIL is a binary format and is based on a subset of LLVM IR bitcode format. The document uses only human-readable form to describe DXIL.
-
-Versioning
-==========
-
-There are three versioning mechanisms in DXIL shaders: shader model, DXIL version, and LLVM bitcode version.
-
-At a high-level, the shader model describes the target execution model and environment; DXIL provides a mechanism to express programs (including rules around expressing data types and operations); and LLVM bitcode provides a way to encode a DXIL program.
-
-Shader Model
-------------
-
-The shader model in DXIL is similar to DXBC shader model. A shader model specifies the execution model, the set of capabilities that shader instructions can use and the constraints that a shader program must adhere to.
-
-The shader model is specified as a named metadata in DXIL::
-
-  !dx.shaderModel = !{ !0 }
-  !0 = !{ !"<shaderModelName>", i32 <major>, i32 <minor> }
-
-The following values of <shaderModelName>_<major>_<minor> are supported:
-
-==================== ===================================== ===========
-Target               Legacy Models                         DXIL Models
-==================== ===================================== ===========
-Vertex shader (VS)   vs_4_0, vs_4_1, vs_5_0, vs_5_1        vs_6_0
-Hull shader (HS)     hs_5_0, hs_5_1                        hs_6_0
-Domain shader (DS)   ds_5_0, ds_5_1                        ds_6_0
-Geometry shader (GS) gs_4_0, gs_4_1, gs_5_0, gs_5_1        gs_6_0
-Pixel shader (PS)    ps_4_0, ps_4_1, ps_5_0, ps_5_1        ps_6_0
-Compute shader (CS)  cs_5_0 (cs_4_0 is mapped onto cs_5_0) cs_6_0
-Shader library       no support                            lib_6_1
-==================== ===================================== ===========
-
-The DXIL verifier ensures that DXIL conforms to the specified shader model.
-
-For shader models prior to 6.0, only the rules applicable to the DXIL representation are valid. For example, the limit on the maximum number of resources is honored, but the limits on registers aren't because DXIL does not have a representation for registers.
-
-DXIL version
-------------
-
-The primary mechanism to evolve HLSL capabilities is through shader models. However, DXIL version is reserved for additional flexibility of future extensions. There are two currently defined versions: 1.0 and 1.1.
-
-DXIL version has major and minor versions that are specified as named metadata::
-
-  !dx.version = !{ !0 }
-  !0 = !{ i32 <major>, i32 <minor> }
-
-DXIL version must be declared exactly once per LLVM module (translation unit) and is valid for the entire module.
-
-DXIL will evolve in a manner that retains backward compatibility.
-
-LLVM Bitcode version
---------------------
-
-The current version of DXIL is based on LLVM bitcode v3.7. This encoding is necessarily implied by something outside the DXIL module.
-
-General Issues
-==============
-
-An important goal is to enable HLSL to be closer to a strict subset of C/C++. This has implications for DXIL design and future hardware feature requests outlined below.
-
-Terminology
------------
-Resource refers to one of the following:
-
-* SRV - shader resource view (read-only)
-* UAV - unordered access view (read-write)
-* CBV - constant buffer view (read-only)
-* Sampler
-
-Intrinsics typically refer to operations missing in the core LLVM IR. DXIL represents HLSL built-in functions (also called intrinsics) not as LLVM intrinsics, but rather as external function calls.
-
-
-DXIL abstraction level
-----------------------
-
-DXIL has a level of abstraction similar to a 'scalarized' DXBC. DXIL is a lower-level IR than DXIR emitted by the front-end to be amenable to fast and robust JIT-ing in driver compilers.
-
-In particular, the following passes are performed to lower the HLSL/DXIR abstractions down to DXIL:
-
-* optimize function parameter copies
-* inline functions
-* allocate and transform shader signatures
-* lower matrices, optimizing intermediate storage
-* linearize multi-dimensional arrays and user-defined type accesses
-* scalarize vectors
-
-Scalar IR
----------
-DXIL operations work with scalar quantities. Several scalar quantities may be grouped together in a struct to represent several return values, which is used for memory operations, e.g., load/store, sample, etc., that benefit from access coalescing.
-
-Metadata, resource declarations, and debugging info may contain vectors to more closely convey source code shape to tools and debuggers.
-
-Future versions of IR may contain vectors or grouping hints for less-than-32-bit quantities, such as half and i16.
-
-Memory accesses
----------------
-
-DXIL conceptually aligns with DXBC in how different memory types are accessed. Out-of-bounds behavior and various restrictions are preserved.
-
-Indexable thread-local and groupshared variables are represented as variables and accessed via LLVM C-like pointers.
-
-Swizzled resources, such as textures, have opaque memory layouts from a DXIL point of view. Accesses to these resources are done via intrinsics.
-
-There are two layouts for constant buffer memory: (1) legacy, matching DXBC's layout and (2) linear layout. SM6 DXIL uses intrinsics to read cbuffer for either layout.
-
-Shader signatures require packing and are located in a special type of memory that cannot be viewed as linear. Accesses to signature values are done via special intrinsics in DXIL. If a signature parameter needs to be passed to a function, a copy is created first in threadlocal memory and the copy is passed to the function.
-
-Typed buffers represent memory with in-flight data conversion. Typed buffer load/store/atomics are done via special functions in DXIL with element-granularity indexing.
-
-The following pointer types are supported:
-
-* Non-indexable thread-local variables.
-* Indexable thread-local variables (DXBC x-registers).
-* Groupshared variables (DXBC g-registers).
-* Device memory pointer.
-* Constant-buffer-like memory pointer.
-
-The type of DXIL pointer is differentiated by LLVM addrspace construct. The HLSL compiler will make the best effort to infer the exact pointer addrspace such that a driver compiler can issue the most efficient instruction.
-
-A pointer can come into being in a number of ways:
-
-* Global Variables.
-* AllocaInst.
-* Synthesized as a result of some pointer arithmetic.
-
-DXIL uses 32-bit pointers in its representation.
-
-Out-of-bounds behavior
-----------------------
-
-Indexable thread-local accesses are done via LLVM pointer and have C-like OOB semantics.
-Groupshared accesses are done via LLVM pointer too. The origin of a groupshared pointer must be a single TGSM allocation.
-If a groupshared pointer uses in-bound GEP instruction, it should not OOB. The behavior for an OOB access for in-bound pointer is undefined.
-For groupshared pointer from regular GEP, OOB will have the same behavior as DXBC. Loads return 0 for OOB accesses; OOB stores are silently dropped.
-
-Resource accesses keep the same out-of-bounds behavior as DXBC. Loads return 0 for OOB accesses; OOB stores are silently dropped.
-
-OOB pointer accesses in SM6.0 and later have undefined (C-like) behavior. LLVM memory optimization passes can be used to optimize such accesses. Where out-of-bound behavior is desired, intrinsic functions are used to access memory.
-
-Memory access granularity
--------------------------
-
-Intrinsic and resource accesses may imply a wider access than requested by an instruction. DXIL defines memory accesses for i1, i16, i32, i64, f16, f32, f64 on thread local memory, and i32, f32, f64 for memory I/O (that is, groupshared memory and memory accessed via resources such as CBs, UAVs and SRVs).
-
-
-Number of virtual values
-------------------------
-
-There is no limit on the number of virtual values in DXIL. The IR is guaranteed to be in an SSA form. For optimized shaders, the optimizer will run -mem2reg LLVM pass as well as perform other memory to register promotions if profitable.
-
-Control-flow restrictions
--------------------------
-
-The DXIL control-flow graph must be reducible, as checked by T1-T2 test. DXIL does not preserve structured control flow of DXBC. Preserving structured control-flow property would impose significant burden on third-party tools optimizing to DXIL via LLVM, reducing appeal of DXIL.
-
-DXIL allows fall-through for switch label blocks. This is a difference from DXBC, in which the fall-through is prohibited.
-
-DXIL will not support the DXBC label and call instructions; LLVM functions can be used instead (see below). The primary uses for these are (1) HLSL interfaces, which are not supported, and (2) outlining of case-bodies in a switch statement annotated with [call], which is not a scenario of interest.
-
-Functions
----------
-
-Instead of DXBC labels/calls, DXIL supports functions and call instructions. Recursion is not allowed; DXIL validator enforces this.
-
-The functions are regular LLVM functions. Parameters can be passed by-value or by-reference. The functions are to facilitate separate compilation for big, complex shaders. However, driver compilers are free to inline functions as they see fit.
-
-Identifiers
------------
-
-DXIL identifiers must conform to LLVM IR identifier rules.
-
-Identifier mangling rules are the ones used by Clang 3.7 with the HLSL target.
-
-The following identifier prefixes are reserved:
-
-* dx.*, dxil.*, dxir.*
-* llvm.dx.*, llvm.dxil.*, llvm.dxir.*
-
-Address Width
--------------
-
-DXIL will use only 32-bit addresses for pointers. Byte offsets are also 32-bit.
-
-Shader restrictions
--------------------
-
-There is no support for the following in DXIL:
-
-* recursion
-* exceptions
-* indirect function calls and dynamic dispatch
-
-Entry points
-------------
-
-The dx.entryPoints metadata specifies a list of entry point records, one for each entry point. Libraries could specify more than one entry point per module but currently exist outside the DXIL specification; the other shader models must specify exactly one entry point.
-
-For example::
-
- define void @"\01?myfunc1@@YAXXZ"() #0 { ... }
- define float @"\01?myfunc2@@YAMXZ"() #0 { ... }
-
- !dx.entryPoints = !{ !1, !2 }
-
- !1 = !{ void  ()* @"\01?myfunc1@@YAXXZ", !"myfunc1", !3, null, null }
- !2 = !{ float ()* @"\01?myfunc2@@YAMXZ", !"myfunc2", !5, !6, !7 }
-
-Each entry point metadata record specifies:
-
-* reference to the entry point function global symbol
-* unmangled name
-* list of signatures
-* list of resources
-* list of tag-value pairs of shader capabilities and other properties
-
-A 'null' value specifies absence of a particular node.
-
-Shader capabilities are properties that are additional to properties dictated by shader model. The list is organized as pairs of i32 tag, followed immediately by the value itself.
-
-Hull shader representation
---------------------------
-
-The hull shader is represented as two functions, related via metadata: (1) control point phase function, which is the entry point of the hull shader, and (2) patch constant phase function.
-
-For example::
-
- !dx.entryPoints = !{ !1 }
- !1 = !{ void ()* @"ControlPointFunc", ..., !2 }  ; shader entry record
- !2 = !{ !"HS", !3 }
- !3 = !{ void ()* @"PatchConstFunc", ... }        ; additional hull shader state
-
-The patch constant function represents original HLSL computation, and is not separated into fork and join phases, as it is the case in DXBC. The driver compiler may perform such separation if this is profitable for the target GPU.
-
-In DXBC-to-DXIL conversion, the original patch constant function cannot be recovered. Instead, instructions of each fork and join phases are 'wrapped' by a loop that iterates the corresponding number of phase-instance-count iterations. Thus, fork/join instance ID becomes the loop induction variable. LoadPatchConstant intrinsic (see below) represents load from DXBC vpc register.
-
-The following table summarizes the names of intrinsic functions to load inputs and store outputs of hull and domain shaders. CP stands for Control Point, PC - for Patch Constant.
-
-=================== ==================== ====================== ======================
-Operation           Control Point (Hull) Patch Constant         Domain
-=================== ==================== ====================== ======================
-Store Input CP
-Load Input CP       LoadInput            LoadInput
-Store Output CP     StoreOutput
-Load Output CP                           LoadOutputControlPoint LoadInput
-Store PC                                 StorePatchConstant
-Load PC                                  LoadPatchConstant      LoadPatchConstant
-Store Output Vertex                                             StoreOutput
-=================== ==================== ====================== ======================
-
-LoadPatchConstant function in PC stage is generated only by DXBC-to-DXIL converter, to access DXBC vpc registers. HLSL compiler produces IR that references LLVM IR values directly.
-
-Type System
-===========
-
-Most of LLVM type system constructs are legal in DXIL.
-
-Primitive Types
----------------
-
-The following types are supported:
-
-* void
-* metadata
-* i1, i8, i16, i32, i64
-* half, float, double
-
-SM6.0 assumes native hardware support for i32 and float types.
-
-i8 is supported only in a few intrinsics to signify masks, enumeration constant values, or in metadata. It's not supported for memory access or computation by the shader.
-
-HLSL min12int, min16int and min16uint data types are mapped to i16.
-
-half and i16 are treated as corresponding DXBC min-precision types (min16float, min16int/min16uint) in SM6.0.
-
-The HLSL compiler optimizer treats half, i16 and i8 data as data types natively supported by the hardware; i.e., saturation, range clipping, INF/NaN are done according to the IEEE standard. Such semantics allow the optimizer to reuse LLVM optimization passes.
-
-Hardware support for doubles is optional and is guarded by RequiresHardwareDouble CAP bit.
-
-Hardware support for i64 is optional and is guarded by a CAP bit.
-
-Vectors
--------
-
-HLSL vectors are scalarized. They do not participate in computation; however, they may be present in declarations to convey original variable layout to tools, debuggers, and reflection.
-
-Future DXIL may add support for <2 x half> and <2 x i16> vectors or hints for packing related half and i16 quantities.
-
-Matrices
---------
-
-Matrices are lowered to vectors, and are not referenced by instructions. They may be present in declarations to convey original variable layout to tools, debuggers, and reflection.
-
-Arrays
-------
-
-Instructions may reference only 1D arrays of primitive types. However, complex arrays, e.g., multidimensional arrays or user-defined types, may be present to convey original variable layout to tools, debuggers, and reflection.
-
-User-defined types
-------------------
-
-Original HLSL UDTs are lowered and are not referenced by instructions. However, they may be present in declarations to convey original variable layout to tools, debuggers, and reflection. Some resource operations return 'grouping' UDTs that group several return values; such UDTs are immediately 'decomposed' into components that are then consumed by other instructions.
-
-Type conversions
-----------------
-
-Explicit conversions between types are supported via LLVM instructions.
-
-Precise qualifier
------------------
-
-By default, all floating-point HLSL operations are considered 'fast' or non-precise. HLSL and driver compilers are allowed to refactor such operations. Non-precise LLVM instructions: fadd, fsub, fmul, fdiv, frem, fcmp are marked with 'fast' math flags.
-
-HLSL precise type qualifier requires that all operations contributing to the value be IEEE compliant with respect to optimizations. The /Gis compiler switch implicitly declares all variables and values as precise.
-
-Precise behavior is represented in LLVM instructions: fadd, fsub, fmul, fdiv, frem, fcmp by not having 'fast' math flags set. Each relevant call instruction that contributes to computation of a precise value is annotated with dx.precise metadata that indicates that it is illegal for the driver compiler to perform IEEE-unsafe optimizations.
-
-Type annotations
-----------------
-
-User-defined types are annotated in DXIL to 'attach' additional properties to structure fields. For example, DXIL may contain type annotations for reflection purposes::
-
- ; namespace MyNamespace1
- ; {
- ;   struct MyType1
- ;   {
- ;     float field1;
- ;     int2 field2;
- ;   };
- ; }
-
- %struct.MyNamespace1.MyType1 = type { float, <2 x i32> }
- !struct.MyNamespace1.MyType1 = !{ !1, !2 }
- !1 = !{ !"field1", null }
- !2 = !{ !"field2", null }
-
- ; struct MyType2
- ; {
- ;    MyType1 array_field[2];
- ;    float4 float4_field;
- ; };
-
- %struct.MyType2 = type { [2 x %struct.MyType1], <4 x float> }
- !struct.MyType2 = !{ !3, !4 }
- !3 = !{ !"array_field", null }
- !4 = !{ !"float4_field", null }
-
-The type/field annotation metadata hierarchy recursively mimics LLVM type hierarchy.
-
-Each field-annotation record has an optional named-value pair list for infrequent annotations and for future extensions. The lists are null in the example above.
-
-Note that Clang emits '::' to separate namespaces, if any, in type names. We modify Clang to use '.' instead, because it is illegal to use ':' in metadata names.
-
-Shader Properties and Capabilities
-==================================
-
-Additional shader properties are specified via tag-value pair list, which is the last element in the entry function description record.
-
-Shader Flags
-------------
-
-Shaders have additional flags that convey their capabilities via tag-value pair with tag kDxilShaderFlagsTag (0), followed by an i64 bitmask integer. The bits have the following meaning:
-
-=== =====================================================================
-Bit Description
-=== =====================================================================
-0   Disable shader optimizations
-1   Disable math refactoring
-2   Shader uses doubles
-3   Force early depth stencil
-4   Enable raw and structured buffers
-5   Shader uses min-precision, expressed as half and i16
-6   Shader uses double extension intrinsics
-7   Shader uses MSAD
-8   All resources must be bound for the duration of shader execution
-9   Enable view port and RT array index from any stage feeding rasterizer
-10  Shader uses inner coverage
-11  Shader uses stencil
-12  Shader uses intrinsics that access tiled resources
-13  Shader uses relaxed typed UAV load formats
-14  Shader uses Level9 comparison filtering
-15  Shader uses up to 64 UAVs
-16  Shader uses UAVs
-17  Shader uses CS4 raw and structured buffers
-18  Shader uses Rasterizer Ordered Views
-19  Shader uses wave intrinsics
-20  Shader uses int64 instructions
-=== =====================================================================
-
-Geometry Shader
----------------
-
-Geometry shader properties are specified via tag-value pair with tag kDxilGSStateTag (1), followed by a list of GS properties. The format of this list is the following.
-
-=== ==== ===============================================================
-Idx Type Description
-=== ==== ===============================================================
-0   i32  Input primitive (InputPrimitive enum value).
-1   i32  Max vertex count.
-2   i32  Primitive topology for stream 0 (PrimitiveTopology enum value).
-3   i32  Primitive topology for stream 1 (PrimitiveTopology enum value).
-4   i32  Primitive topology for stream 2 (PrimitiveTopology enum value).
-5   i32  Primitive topology for stream 3 (PrimitiveTopology enum value).
-=== ==== ===============================================================
-
-Domain Shader
--------------
-
-Domain shader properties are specified via tag-value pair with tag kDxilDSStateTag (2), followed by a list of DS properties. The format of this list is the following.
-
-=== ==== ===============================================================
-Idx Type Description
-=== ==== ===============================================================
-0   i32  Tessellator domain (TessellatorDomain enum value).
-1   i32  Input control point count.
-=== ==== ===============================================================
-
-Hull Shader
------------
-
-Hull shader properties are specified via tag-value pair with tag kDxilHSStateTag (3), followed by a list of HS properties. The format of this list is the following.
-
-=== ======= =====================================================================
-Idx Type    Description
-=== ======= =====================================================================
-0   MDValue Patch constant function (global symbol).
-1   i32     Input control point count.
-2   i32     Output control point count.
-3   i32     Tessellator domain (TessellatorDomain enum value).
-4   i32     Tessellator partitioning (TessellatorPartitioning enum value).
-5   i32     Tessellator output primitive (TessellatorOutputPrimitive enum value).
-6   float   Max tessellation factor.
-=== ======= =====================================================================
-
-Compute Shader
---------------
-
-Compute shader has the following tag-value properties.
-
-===================== ======================== =============================================
-Tag                   Value                    Description
-===================== ======================== =============================================
-kDxilNumThreadsTag(4) MD list: (i32, i32, i32) Number of threads (X,Y,Z) for compute shader.
-===================== ======================== =============================================
-
-Shader Parameters and Signatures
-================================
-
-This section formalizes how HLSL shader input and output parameters are expressed in DXIL.
-
-HLSL signatures and semantics
------------------------------
-
-Formal parameters of a shader entry function in HLSL specify how the shader interacts with the graphics pipeline. Input parameters, referred to as an input signature, specify values received by the shader. Output parameters, referred to as an output signature, specify values produced by the shader. The shader compiler maps HLSL input and output signatures into DXIL specifications that conform to hardware constraints outlined in the Direct3D Functional Specification. DXIL specifications are also called signatures.
-
-Signature mapping is a complex process, as there are many constraints. All signature parameters must fit into a finite space of N 4x32-bit registers. For efficiency reasons, parameters are packed together in a way that does not violate specification constraints. The process is called signature packing. Most signatures are tightly packed; however, the VS input signature is not packed, as the values are coming from the Input Assembler (IA) stage rather than the graphics pipeline. Alternately, the PS output signature is allocated to align the SV_Target semantic index with the output register index.
-
-Each HLSL signature parameter is defined via C-like type, interpolation mode, and semantic name and index. The type defines parameter shape, which may be quite complex. Interpolation mode adds to the packing constraints, namely that parameters packed together must have compatible interpolation modes. Semantics are extra names associated with parameters for the following purposes: (1) to specify whether a parameter is a special System Value (SV) or not, (2) to link parameters to IA or StreamOut API streams, and (3) to aid debugging. Semantic index is used to disambiguate parameters that use the same semantic name, or span multiple rows of the register space.
-
-SV semantics add specific meanings and constraints to associated parameters. A parameter may be supplied by the hardware, and is then known as a System Generated Value (SGV). Alternatively, a parameter may be interpreted by the hardware and is then known as System Interpreted Value (SIV).  SGVs and SIVs are pipeline-stage dependent; moreover, some participate in signature packing and some do not. Non-SV semantics always participate in signature packing.
-
-Most System Generated Values (SGV) are loaded using special Dxil intrinsic functions, rather than loading the input from a signature.  These usually will not be present in the signature at all.  Their presence may be detected by the declaration and use of the special intrinsic function itself.  The exceptions to this are notable.  In one case they are present and loaded from the signature instead of a special intrinsic because they must be part of the packed signature potentially passed from the prior stage, allowing the prior stage to override these values, such as for SV_PrimitiveID and SV_IsFrontFace that may be written in the Geometry Shader.  In another case, they identify signature elements that still contribute to DXBC signature for informational purposes, but will only use the special intrinsic function to read the value, such as for SV_PrimitiveID for GS input and SampleIndex for PS input.
-
-The classification of behavior for various system values in various signature locations is described in a table organized by SemanticKind and SigPointKind.  The SigPointKind is a new classification that uniquely identifies each set of parameters that may be input or output for each entry point.  For each combination of SemanticKind and SigPointKind, there is a SemanticInterpretationKind that defines the class of treatment for that location.
-
-Each SigPointKind also has a corresponding element allocation (or packing) behavior called PackingKind.  Some SigPointKinds do not result in a signature at all, which corresponds to the packing kind of PackingKind::None.
-
-Signature Points are enumerated as follows in the SigPointKind
-
-.. <py>import hctdb_instrhelp</py>
-.. <py::lines('SIGPOINT-RST')>hctdb_instrhelp.get_sigpoint_rst()</py>
-.. SIGPOINT-RST:BEGIN
-
-== ======== ======= ========== ============== ============= ============================================================================
-ID SigPoint Related ShaderKind PackingKind    SignatureKind Description
-== ======== ======= ========== ============== ============= ============================================================================
-0  VSIn     Invalid Vertex     InputAssembler Input         Ordinary Vertex Shader input from Input Assembler
-1  VSOut    Invalid Vertex     Vertex         Output        Ordinary Vertex Shader output that may feed Rasterizer
-2  PCIn     HSCPIn  Hull       None           Invalid       Patch Constant function non-patch inputs
-3  HSIn     HSCPIn  Hull       None           Invalid       Hull Shader function non-patch inputs
-4  HSCPIn   Invalid Hull       Vertex         Input         Hull Shader patch inputs - Control Points
-5  HSCPOut  Invalid Hull       Vertex         Output        Hull Shader function output - Control Point
-6  PCOut    Invalid Hull       PatchConstant  PatchConstant Patch Constant function output - Patch Constant data passed to Domain Shader
-7  DSIn     Invalid Domain     PatchConstant  PatchConstant Domain Shader regular input - Patch Constant data plus system values
-8  DSCPIn   Invalid Domain     Vertex         Input         Domain Shader patch input - Control Points
-9  DSOut    Invalid Domain     Vertex         Output        Domain Shader output - vertex data that may feed Rasterizer
-10 GSVIn    Invalid Geometry   Vertex         Input         Geometry Shader vertex input - qualified with primitive type
-11 GSIn     GSVIn   Geometry   None           Invalid       Geometry Shader non-vertex inputs (system values)
-12 GSOut    Invalid Geometry   Vertex         Output        Geometry Shader output - vertex data that may feed Rasterizer
-13 PSIn     Invalid Pixel      Vertex         Input         Pixel Shader input
-14 PSOut    Invalid Pixel      Target         Output        Pixel Shader output
-15 CSIn     Invalid Compute    None           Invalid       Compute Shader input
-== ======== ======= ========== ============== ============= ============================================================================
-
-.. SIGPOINT-RST:END
-
-Semantic Interpretations are as follows (SemanticInterpretationKind)
-
-
-.. <py>import hctdb_instrhelp</py>
-.. <py::lines('SEMINT-RST')>hctdb_instrhelp.get_sem_interpretation_enum_rst()</py>
-.. SEMINT-RST:BEGIN
-
-== ========== =============================================================
-ID Name       Description
-== ========== =============================================================
-0  NA         Not Available
-1  SV         Normal System Value
-2  SGV        System Generated Value (sorted last)
-3  Arb        Treated as Arbitrary
-4  NotInSig   Not included in signature (intrinsic access)
-5  NotPacked  Included in signature, but does not contribute to packing
-6  Target     Special handling for SV_Target
-7  TessFactor Special handling for tessellation factors
-8  Shadow     Shadow element must be added to a signature for compatibility
-== ========== =============================================================
-
-.. SEMINT-RST:END
-
-Semantic Interpretations for each SemanticKind at each SigPointKind are as follows
-
-
-.. <py>import hctdb_instrhelp</py>
-.. <py::lines('SEMINT-TABLE-RST')>hctdb_instrhelp.get_sem_interpretation_table_rst()</py>
-.. SEMINT-TABLE-RST:BEGIN
-
-====================== ============ ===== ============ ============ ====== ======= ========== ============ ====== ===== ===== ============ ===== ============= ============= ========
-Semantic               VSIn         VSOut PCIn         HSIn         HSCPIn HSCPOut PCOut      DSIn         DSCPIn DSOut GSVIn GSIn         GSOut PSIn          PSOut         CSIn
-====================== ============ ===== ============ ============ ====== ======= ========== ============ ====== ===== ===== ============ ===== ============= ============= ========
-Arbitrary              Arb          Arb   NA           NA           Arb    Arb     Arb        Arb          Arb    Arb   Arb   NA           Arb   Arb           NA            NA
-VertexID               SV           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NA
-InstanceID             SV           Arb   NA           NA           Arb    Arb     NA         NA           Arb    Arb   Arb   NA           Arb   Arb           NA            NA
-Position               Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
-RenderTargetArrayIndex Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
-ViewPortArrayIndex     Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
-ClipDistance           Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
-CullDistance           Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
-OutputControlPointID   NA           NA    NA           NotInSig     NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NA
-DomainLocation         NA           NA    NA           NA           NA     NA      NA         NotInSig     NA     NA    NA    NA           NA    NA            NA            NA
-PrimitiveID            NA           NA    NotInSig     NotInSig     NA     NA      NA         NotInSig     NA     NA    NA    Shadow       SGV   SGV           NA            NA
-GSInstanceID           NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NotInSig     NA    NA            NA            NA
-SampleIndex            NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    Shadow _41    NA            NA
-IsFrontFace            NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           SGV   SGV           NA            NA
-Coverage               NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NotInSig _50  NotPacked _41 NA
-InnerCoverage          NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NotInSig _50  NA            NA
-Target                 NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            Target        NA
-Depth                  NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked     NA
-DepthLessEqual         NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked _50 NA
-DepthGreaterEqual      NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked _50 NA
-StencilRef             NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked _50 NA
-DispatchThreadID       NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
-GroupID                NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
-GroupIndex             NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
-GroupThreadID          NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
-TessFactor             NA           NA    NA           NA           NA     NA      TessFactor TessFactor   NA     NA    NA    NA           NA    NA            NA            NA
-InsideTessFactor       NA           NA    NA           NA           NA     NA      TessFactor TessFactor   NA     NA    NA    NA           NA    NA            NA            NA
-ViewID                 NotInSig _61 NA    NotInSig _61 NotInSig _61 NA     NA      NA         NotInSig _61 NA     NA    NA    NotInSig _61 NA    NotInSig _61  NA            NA
-Barycentrics           NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NotPacked _61 NA            NA
-====================== ============ ===== ============ ============ ====== ======= ========== ============ ====== ===== ===== ============ ===== ============= ============= ========
-
-.. SEMINT-TABLE-RST:END
-
-Below is a vertex shader example that is used for illustration throughout this section::
-
- struct Foo {
-   float a;
-   float b[2];
- };
-
- struct VSIn {
-   uint    vid     : SV_VertexID;
-   float3  pos     : Position;
-   Foo     foo[3]  : SemIn1;
-   float   f       : SemIn10;
- };
-
- struct VSOut
- {
-   float   f       : SemOut1;
-   Foo     foo[3]  : SemOut2;
-   float4  pos     : SV_Position;
- };
-
- void main(in  VSIn  In, 	// input  signature
-           out VSOut Out)	// output signature
- {
-   ...
- }
-
-Signature packing must be efficient. It should use as few registers as possible, and the packing algorithm should run in reasonable time. The complication is that the problem is NP complete, and the algorithm needs to resort to using a heuristic.
-
-While the details of the packing algorithm are not important at the moment, it is important to outline some concepts related to how a packed signature is represented in DXIL. Packing is further complicated by the complexity of parameter shapes induced by the C/C++ type system. In the example above, fields of Out.foo array field are actually arrays themselves, strided in memory. Allocating such strided shapes efficiently is hard. To simplify packing, the first step is to break user-defined (struct) parameters into constituent components and to make strided arrays contiguous. This preparation step enables the algorithm to operate on dense rectangular shapes, which we call signature elements. The output signature in the example above has the following elements: float Out_f, float Out_foo_a[3], float Out_foo_b[2][3], and float4 pos. Each element is characterized by the number of rows and columns. These are 1x1, 3x1, 6x1, and 1x4, respectively. The packing algorithm reduces to fitting these elements into Nx4 register space, satisfying all packing-compatibility constraints.
-
-Signature element record
-------------------------
-Each signature element is represented in DXIL as a metadata record.
-
-For above example output signature, the element records are as follows::
-
- ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
- !20 = !{i32 6, !"SemOut",      i8 0, i8 0, !40,   i8 2, i32 1, i8 1, i32 1,    i8 2, null}
- !21 = !{i32 7, !"SemOut",      i8 0, i8 0, !41,   i8 2, i32 3, i8 1, i32 1,    i8 1, null}
- !22 = !{i32 8, !"SemOut",      i8 0, i8 0, !42,   i8 2, i32 6, i8 1, i32 1,    i8 0, null}
- !23 = !{i32 9, !"SV_Position", i8 0, i8 3, !43,   i8 2, i32 1, i8 4, i32 0,    i8 0, null}
-
-A record contains the following fields.
-
-=== =============== ===============================================================================
-Idx Type            Description
-=== =============== ===============================================================================
-0   i32             Unique signature element record ID, used to identify the element in operations.
-1   String metadata Semantic name.
-2   i8              ComponentType (enum value).
-3   i8              SemanticKind (enum value).
-4   Metadata        Metadata list that enumerates all semantic indexes of the flattened parameter.
-5   i8              InterpolationMode (enum value).
-6   i32             Number of element rows.
-7   i8              Number of element columns.
-8   i32             Starting row of element packing location.
-9   i8              Starting column of element packing location.
-10  Metadata        Metadata list of additional tag-value pairs; can be 'null' or empty.
-=== =============== ===============================================================================
-
-Semantic name system values always start with 'S', 'V', '_' , and it is illegal to start a user semantic with this prefix. Non-SVs can be ignored by drivers. Debug layers may use these to help validate signature compatibility between stages.
-
-The last metadata list is used to specify additional properties and future extensions.
-
-Signature record metadata
--------------------------
-
-A shader typically has two signatures: input and output, while domain shader has an additional patch constant signature. The signatures are composed of signature element records and are attached to the shader entry metadata. The examples below clarify metadata details.
-
-Vertex shader HLSL
-~~~~~~~~~~~~~~~~~~
-
-Here is the HLSL of the above vertex shader. The semantic index assignment is explained in section below::
-
- struct Foo
- {
-   float a;
-   float b[2];
- };
-
- struct VSIn
- {
-   uint    vid     : SV_VertexID;
-   float3  pos     : Position;
-   Foo     foo[3]  : SemIn1;
-     // semantic index assignment:
-     // foo[0].a     : SemIn1
-     // foo[0].b[0]  : SemIn2
-     // foo[0].b[1]  : SemIn3
-     // foo[1].a     : SemIn4
-     // foo[1].b[0]  : SemIn5
-     // foo[1].b[1]  : SemIn6
-     // foo[2].a     : SemIn7
-     // foo[2].b[0]  : SemIn8
-     // foo[2].b[1]  : SemIn9
-   float   f       : SemIn10;
- };
-
- struct VSOut
- {
-   float   f       : SemOut1;
-   Foo     foo[3]  : SemOut2;
-     // semantic index assignment:
-     // foo[0].a     : SemOut2
-     // foo[0].b[0]  : SemOut3
-     // foo[0].b[1]  : SemOut4
-     // foo[1].a     : SemOut5
-     // foo[1].b[0]  : SemOut6
-     // foo[1].b[1]  : SemOut7
-     // foo[2].a     : SemOut8
-     // foo[2].b[0]  : SemOut9
-     // foo[2].b[1]  : SemOut10
-   float4  pos     : SV_Position;
- };
-
- void main(in  VSIn  In, 	// input  signature
-           out VSOut Out)	// output signature
- {
-   ...
- }
-
-The input signature is packed to be compatible with the IA stage. A packing algorithm must assign the following starting positions to the input signature elements:
-
-=================== ==== ======= ========= ===========
-Input element       Rows Columns Start row Start column
-=================== ==== ======= ========= ===========
-uint VSIn.vid       1    1       0         0
-float3 VSIn.pos     1    3       1         0
-float VSIn.foo.a[3] 3    1       2         0
-float VSIn.foo.b[6] 6    1       5         0
-float VSIn.f        1    1       11        0
-=================== ==== ======= ========= ===========
-
-A reasonable packing algorithm would assign the following starting positions to the output signature elements:
-
-==================== ==== ======= ========= ===========
-Input element        Rows Columns Start row Start column
-==================== ==== ======= ========= ===========
-uint VSOut.f         1    1       1         2
-float VSOut.foo.a[3] 3    1       1         1
-float VSOut.foo.b[6] 6    1       1         0
-float VSOut.pos      1    4       0         0
-==================== ==== ======= ========= ===========
-
-Semantic index assignment
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Semantic index assignment in DXIL is exactly the same as for DXBC. Semantic index assignment, abbreviated s.idx above, is a consecutive enumeration of all fields under the same semantic name as if the signature were packed for the IA stage. That is, given a complex signature element, e.g., VSOut's foo[3] with semantic name SemOut and starting index 2, the element is flattened into individual fields: foo[0].a, foo[0].b[0], ..., foo[2].b[1], and the fields receive consecutive semantic indexes 2, 3, ..., 10, respectively. Semantic-index pairs are used to set up the IA stage and to capture values of individual signature registers via the StreamOut API.
-
-DXIL for VS signatures
-~~~~~~~~~~~~~~~~~~~~~~
-
-The corresponding DXIL metadata is presented below::
-
- !dx.entryPoints = !{ !1 }
- !1 = !{ void @main(), !"main", !2, null, null }
- ; Signatures: In,   Out,  Patch Constant (optional)
- !2 = !{       !3,   !4,   null }
-
- ; Input signature (packed accordiong to IA rules)
- !3 = !{ !10, !11, !12, !13, !14 }
- ; element idx, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
- !10 = !{i32 1, !"SV_VertexID", i8 0, i8 1, !30,  i32 0, i32 1, i8 1, i32 0,    i8 0, null}
- !11 = !{i32 2, !"Position",    i8 0, i8 0, !30,  i32 0, i32 1, i8 3, i32 1,    i8 0, null}
- !12 = !{i32 3, !"SemIn",       i8 0, i8 0, !32,  i32 0, i32 3, i8 1, i32 2,    i8 0, null}
- !13 = !{i32 4, !"SemIn",       i8 0, i8 0, !33,  i32 0, i32 6, i8 1, i32 5,    i8 0, null}
- !14 = !{i32 5, !"SemIn",       i8 0, i8 0, !34,  i32 0, i32 1, i8 1, i32 11,   i8 0, null}
- ; semantic index assignment:
- !30 = !{ i32 0 }
- !32 = !{ i32 1, i32 4, i32 7 }
- !33 = !{ i32 2, i32 3, i32 5, i32 6, i32 8, i32 9 }
- !34 = !{ i32 10 }
-
- ; Output signature (tightly packed according to pipeline stage packing rules)
- !4 = !{ !20, !21, !22, !23 }
- ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
- !20 = !{i32 6, !"SemOut",      i8 0, i8 0, !40,  i32 2, i32 1, i8 1, i32 1,    i8 2, null}
- !21 = !{i32 7, !"SemOut",      i8 0, i8 0, !41,  i32 2, i32 3, i8 1, i32 1,    i8 1, null}
- !22 = !{i32 8, !"SemOut",      i8 0, i8 0, !42,  i32 2, i32 6, i8 1, i32 1,    i8 0, null}
- !23 = !{i32 9, !"SV_Position", i8 0, i8 3, !43,  i32 2, i32 1, i8 4, i32 0,    i8 0, null}
- ; semantic index assignment:
- !40 = !{ i32 1 }
- !41 = !{ i32 2, i32 5, i32 8 }
- !42 = !{ i32 3, i32 4, i32 6, i32 7, i32 9, i32 10 }
- !43 = !{ i32 0 }
-
-Hull shader example
-~~~~~~~~~~~~~~~~~~~
-A hull shader (HS) is defined by two entry point functions: control point (CP) function to compute control points, and patch constant (PC) function to compute patch constant data, including the tessellation factors. The inputs to both functions are the input control points for an entire patch, and therefore each element may be indexed by row and, in addition, is indexed by vertex.
-
-Here is an HS example entry point metadata and signature list::
-
- ; !105 is extended parameter list containing reference to HS State:
- !101 = !{ void @HSMain(), !"HSMain", !102, null, !105 }
- ; Signatures: In,   Out,  Patch Constant
- !102 = !{     !103, !104, !204 }
-
-The entry point record specifies: (1) CP function HSMain as the main symbol, and (2) PC function via optional metadata node !105.
-
-CP-input signature describing one input control point::
-
- !103 = !{ !110, !111 }
- ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
- !110= !{i32 1, !"SV_Position", i8 0, i8 3, !130, i32 0, i32 1, i8 4, i32 0,    i8 0, null}
- !111= !{i32 2, !"array",       i8 0, i8 0, !131, i32 0, i32 4, i8 3, i32 1,    i8 0, null}
- ; semantic indexing for flattened elements:
- !130 = !{ i32 0 }
- !131 = !{ i32 0, i32 1, i32 2, i32 3 }
-
-Note that SV_OutputControlPointID and SV_PrimitiveID input elements are SGVs loaded through special Dxil intrinsics, and are not present in the signature at all.  These have a semantic interpretation of SemanticInterpretationKind::NotInSig.
-
-CP-output signature describing one output control point::
-
- !104 = !{ !120, !121 }
- ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
- !120= !{i32 3, !"SV_Position", i8 0, i8 3, !130, i32 0, i32 1, i8 4, i32 0,    i8 0, null}
- !121= !{i32 4, !"array",       i8 0, i8 0, !131, i32 0, i32 4, i8 3, i32 1,    i8 0, null}
-
-Hull shaders require an extended parameter that defines extra state::
-
- ; extended parameter HS State
- !105 = !{ i32 3, !201 }
-
- ; HS State record defines patch constant function and other properties
- ; Patch Constant Function, in CP count, out CP count, tess domain, tess part, out prim, max tess factor
- !201 = !{  void @PCMain(), 4,           4,            3,           1,         3,        16.0 }
-
-PC-output signature::
-
- !204 = !{ !220, !221, !222 }
- ;  element ID, semantic name,         etype,   sv, s.idx,  interp, rows, cols, start row, col, ext. list
- !220= !{i32 3, !"SV_TessFactor",       i8 0, i8 25, !130,  i32 0, i32 4, i8 1, i32 0, i8 3, null}
- !221= !{i32 4, !"SV_InsideTessFactor", i8 0, i8 26, !231,  i32 0, i32 2, i8 1, i32 4, i8 3, null}
- !222= !{i32 5, !"array",               i8 0, i8 0,  !131,  i32 0, i32 4, i8 3, i32 0, i8 0, null}
- ; semantic indexing for flattened elements:
- !231 = !{ i32 0, i32 1 }
-
-Accessing signature value in operations
----------------------------------------
-
-There are no function parameters or variables that correspond to signature elements. Instead loadInput and storeOutput functions are used to access signature element values in operations. The accesses are scalar.
-
-These are the operation signatures::
-
- ; overloads: SM5.1: f16|f32|i16|i32,  SM6.0: f16|f32|f64|i8|i16|i32|i64
- declare float @dx.op.loadInput.f32(
-     i32,                            ; opcode
-     i32,                            ; input ID
-     i32,                            ; row (relative to start row of input ID)
-     i8,                             ; column (relative to start column of input ID), constant in [0,3]
-     i32)                            ; vertex index
-
- ; overloads: SM5.1: f16|f32|i16|i32,  SM6.0: f16|f32|f64|i8|i16|i32|i64
- declare void @dx.op.storeOutput.f32(
-     i32,                            ; opcode
-     i32,                            ; output ID
-     i32,                            ; row (relative to start row of output ID)
-     i8,                             ; column (relative to start column of output ID), constant in [0,3]
-     float)                          ; value to store
-
-LoadInput/storeOutput takes input/output element ID, which is the unique ID of a signature element metadata record. The row parameter is the array element row index from the start of the element; the register index is obtained by adding the start row of the element and the row parameter value. Similarly, the column parameter is relative column index; the packed register component is obtained by adding the start component of the element (packed col) and the column value. Several overloads exist to access elements of different primitive types. LoadInput takes an additional vertex index parameter that represents vertex index for DS CP-inputs and GS inputs; vertex index must be undef in other cases.
-
-Signature packing
------------------
-
-Signature elements must be packed into a space of N 4-32-bit registers according to runtime constraints. DXIL contains packed signatures. The packing algorithm is more aggressive than that for DX11. However, DXIL packing is only a suggestion to the driver implementation. Driver compilers can rearrange signature elements as they see fit, while preserving compatibility of connected pipeline stages. DXIL is designed in such a way that it is easy to 'relocate' signature elements - loadInput/storeOutput row and column indices do not need to change since they are relative to the start row/column for each element.
-
-Signature packing types
-~~~~~~~~~~~~~~~~~~~~~~~
-
-Two pipeline stages can connect in four different ways, resulting in four packing types.
-
-1. Input Assembly: VS input only
-   * Elements all map to unique registers, they may not be packed together.
-   * Interpolation mode is not used.
-2. Connects to Rasterizer: VS output, HS CP-input/output and PC-input, DS CP-input/output, GS input/output, PS input
-   * Elements can be packed according to constraints.
-   * Interpolation mode is used and must be consistent between connecting signatures.
-   * While HS CP-output and DS CP-input signatures do not go through the rasterizer, they are still treated as such. The reason is the pass-through HS case, in which HS CP-input and HS CP-output must have identical packing for efficiency.
-3. Patch Constant: HS PC-output, DS PC-input
-   * SV_TessFactor and SV_InsideTessFactor are the only SVs relevant here, and this is the only location where they are legal. These have special packing considerations.
-   * Interpolation mode is not used.
-4. Pixel Shader Output: PS output only
-   * Only SV_Target maps to output register space.
-   * No packing is performed, semantic index corresponds to render target index.
-
-Packing constraints
-~~~~~~~~~~~~~~~~~~~
-
-The packing algorithm is stricter and more aggressive in DXIL than in DXBC, although still compatible. In particular, array signature elements are not broken up into scalars, even if each array access can be disambiguated to a literal index. DXIL and DXBC signature packing are not identical, so linking them together into a single pipeline is not supported across compiler generations.
-
-The row dimension of a signature element represents an index range. If constraints permit, two adjacent or overlapping index ranges are coalesced into a single index range.
-
-Packing constraints are as follows:
-
-1. A register must have only one interpolation mode for all 4 components.
-2. Register components containing SVs must be to the right of components containing non-SVs.
-3. SV_ClipDistance and SV_CullDistance have additional constraints:
-   a. May be packed together
-   b. Must occupy a maximum of 2 registers (8-components)
-   c. SV_ClipDistance must have linear interpolation mode
-4. Registers containing SVs may not be within an index range, with the exception of Tessellation Factors (TessFactors).
-5. If an index range R1 overlaps with a TessFactor index range R2, R1 must be contained within R2. As a consequence, outside and inside TessFactors occupy disjoint index ranges when packed.
-6. Non-TessFactor index ranges are combined into a larger range, if they overlap.
-7. SGVs must be packed after all non-SGVs have been packed. If there are several SGVs, they are packed in the order of HLSL declaration.
-
-Packing for SGVs
-~~~~~~~~~~~~~~~~
-
-Non-SGV portions of two connecting signatures must match; however, SGV portions don't have to. An example would be a PS declaring SV_PrimitiveID as an input. If VS connects to PS, PS's SV_PrimitiveID value is synthesized by hardware; moreover, it is illegal to output SV_PrimitiveID from a VS. If GS connects PS, GS may declare SV_PrimitiveID as its output.
-
-Unfortunately, SGV specification creates a complication for separate compilation of connecting shaders. For example, GS outputs SV_PrimitiveID, and PS inputs SV_IsFrontFace and SV_PrimitiveID in this order. The positions of SV_PrimitiveID are incompatible in GS and PS signatures. Not much can be done about this ambiguity in SM5.0 and earlier; the programmers will have to rely on SDKLayers to catch potential mismatch.
-
-SM5.1 and later shaders work on D3D12+ runtime that uses PSO objects to describe pipeline state. Therefore, a driver compiler has access to both connecting shaders during compilation, even though the HLSL compiler does not. The driver compiler can resolve SGV ambiguity in signatures easily. For SM5.1 and later, the HLSL compiler will ensure that declared SGVs fit into packed signature; however, it will set SGV's start row-column location to (-1, 0) such that the driver compiler must resolve SGV placement during PSO compilation.
-
-Shader Resources
-================
-
-All global resources referenced by entry points of an LLVM module are described via named metadata dx.resources, which consists of four metadata lists of resource records::
-
-  !dx.resources = !{ !1, !2, !3, !4 }
-
-Resource lists are as follows.
-
-=== ======== ==============================
-Idx Type     Description
-=== ======== ==============================
-0   Metadata SRVs - shader resource views.
-1   Metadata UAVs - unordered access views.
-2   Metadata CBVs - constant buffer views.
-3   Metadata Samplers.
-=== ======== ==============================
-
-Metadata resource records
--------------------------
-
-Each resource list contains resource records. Each resource record contains fields that are common for each resource type, followed by fields specific to each resource type, followed by a metadata list of tag/value pairs, which can be used to specify additional properties or future extensions and may be null or empty.
-
-Common fields:
-
-=== =============== ==========================================================================================
-Idx Type            Description
-=== =============== ==========================================================================================
-0   i32             Unique resource record ID, used to identify the resource record in createHandle operation.
-1   Pointer         Pointer to a global constant symbol with the original shape of resource and element type.
-2   Metadata string Name of resource variable.
-3   i32             Bind space ID of the root signature range that corresponds to this resource.
-4   i32             Bind lower bound of the root signature range that corresponds to this resource.
-5   i32             Range size of the root signature range that corresponds to this resource.
-=== =============== ==========================================================================================
-
-When the shader has reflection information, the name is the original, unmangled HLSL name. If reflection is stripped, the name is empty string.
-
-SRV-specific fields:
-
-=== =============== ==========================================================================================
-Idx Type            Description
-=== =============== ==========================================================================================
-6   i32             SRV resource shape (enum value).
-7   i32             SRV sample count.
-8   Metadata        Metadata list of additional tag-value pairs.
-=== =============== ==========================================================================================
-
-SRV-specific tag/value pairs:
-
-=== === ==== =================================================== ============================================
-Idx Tag Type Resource Type                                       Description
-=== === ==== =================================================== ============================================
-0   0   i32  Any resource, except RawBuffer and StructuredBuffer Element type.
-1   1   i32  StructuredBuffer                                    Element stride or StructureBuffer, in bytes.
-=== === ==== =================================================== ============================================
-
-The symbol names for the are kDxilTypedBufferElementTypeTag (0) and kDxilStructuredBufferElementStrideTag (1).
-
-UAV-specific fields:
-
-=== =============== ==========================================================================================
-Idx Type            Description
-=== =============== ==========================================================================================
-6   i32             UAV resource shape (enum value).
-7   i1              1 - globally-coherent UAV; 0 - otherwise.
-8   i1              1 - UAV has counter; 0 - otherwise.
-9   i1              1 - UAV is ROV (rasterizer ordered view); 0 - otherwise.
-10  Metadata        Metadata list of additional tag-value pairs.
-=== =============== ==========================================================================================
-
-UAV-specific tag/value pairs:
-
-=== === ==== ====================================================== ============================================
-Idx Tag Type Resource Type                                          Description
-=== === ==== ====================================================== ============================================
-0   0   i32  RW resource, except RWRawBuffer and RWStructuredBuffer Element type.
-1   1   i32  RWStructuredBuffer                                     Element stride or StructureBuffer, in bytes.
-=== === ==== ====================================================== ============================================
-
-The symbol names for the are kDxilTypedBufferElementTypeTag (0) and kDxilStructuredBufferElementStrideTag (1).
-
-CBV-specific fields:
-
-=== =============== ==========================================================================================
-Idx Type            Description
-=== =============== ==========================================================================================
-6   i32             Constant buffer size in bytes.
-7   Metadata        Metadata list of additional tag-value pairs.
-=== =============== ==========================================================================================
-
-Sampler-specific fields:
-
-=== =============== ==========================================================================================
-Idx Type            Description
-=== =============== ==========================================================================================
-6   i32             Sampler type (enum value).
-7   Metadata        Metadata list of additional tag-value pairs.
-=== =============== ==========================================================================================
-
-The following example demonstrates SRV metadata::
-
- ; Original HLSL
- ; Texture2D<float4> MyTexture2D : register(t0, space0);
- ; StructuredBuffer<NS1::MyType1> MyBuffer[2][3] : register(t1, space0);
-
- !1 = !{ !2, !3 }
-
- ; Scalar resource: Texture2D<float4> MyTexture2D.
- %dx.types.ResElem.v4f32 = type { <4 x float> }
- @MyTexture2D = external addrspace(1) constant %dx.types.ResElem.v4f32, align 16
- !2 = !{ i32 0, %dx.types.ResElem.v4f32 addrspace(1)* @MyTexture2D, !"MyTexture2D",
-         i32 0, i32 0, i32 1, i32 2, i32 0, null }
-
- ; Array resource: StructuredBuffer<MyType1> MyBuffer[2][3].
- %struct.NS1.MyType1 = type { float, <2 x i32> }
- %dx.types.ResElem.NS1.MyType1 = type { %struct.NS1.MyType1 }
- @MyBuffer = external addrspace(1) constant [2x [3 x %dx.types.ResElem.NS1.MyType1]], align 16
- !3 = !{ i32 1, [2 x [3 x %dx.types.ResElem.NS1.MyType1]] addrspace(1)* @MyBuffer, !"MyBuffer",
-         i32 0, i32 1, i32 6, i32 11, i32 0, null }
-
-The type name of the variable is constructed by appending the element name (primitive, vector or UDT name) to dx.types.ResElem prefix. The type configuration of the resource range variable conveys (1) resource range shape and (2) resource element type.
-
-
-Reflection information
-----------------------
-
-Resource reflection data is conveyed via the resource's metadata record and global, external variable. The metadata record contains the original HLSL name, root signature range information, and the reference to the global resource variable declaration. The resource variable declaration conveys resource range shape, resource type and resource element type.
-
-The following disassembly provides an example::
-
- ; Scalar resource: Texture2D<float4> MyTexture2D.
- %dx.types.ResElem.v4f32 = type { <4 x float> }
- @MyTexture2D = external addrspace(1) constant %dx.types.ResElem.v4f32, align 16
- !0 = !{ i32 0, %dx.types.ResElem.v4f32 addrspace(1)* @MyTexture2D, !"MyTexture2D",
-         i32 0, i32 3, i32 1, i32 2, i32 0, null }
-
- ; struct MyType2 { float4 field1; int2 field2; };
- ; Constant buffer: ConstantBuffer<MyType2> MyCBuffer1[][3] : register(b5, space7)
- %struct.MyType2 = type { <4 x float>, <2 x i32> }
- ; Type reflection information (optional)
- !struct.MyType2 = !{ !1, !2 }
- !1 = !{ !"field1", null }
- !2 = !{ !"field2", null }
-
- %dx.types.ResElem.MyType1 = type { %struct.MyType2 }
-
- @MyCBuffer1 = external addrspace(1) constant [0 x [3 x %dx.types.ResElem.MyType2]], align 16
-
- !3 = !{ i32 0, [0 x [3 x %dx.types.ResElem.MyType1]] addrspace(1)* @MyCBuffer1, !"MyCBuffer1",
-         i32 7, i32 5, i32 -1, null }
-
-The reflection information can be removed from DXIL by obfuscating the resource HLSL name and resource variable name as well as removing reflection type annotations, if any.
-
-Structure of resource operation
--------------------------------
-
-Operations involving shader resources and samplers are expressed via external function calls.
-
-Below is an example for the sample method::
-
- %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
-
- declare %dx.types.ResRet.f32 @dx.op.sample.f32(
-     i32,                      ; opcode
-     %dx.types.ResHandle,      ; texture handle
-     %dx.types.SamplerHandle,  ; sampler handle
-     float,                    ; coordinate c0
-     float,                    ; coordinate c1
-     float,                    ; coordinate c2
-     float,                    ; coordinate c3
-     i32,                      ; offset o0
-     i32,                      ; offset o1
-     i32,                      ; offset o2
-     float)                    ; clamp
-
-The method always returns five scalar values that are aggregated in dx.types.ResRet.f32 type and extracted into scalars via LLVM's extractelement right after the call. The first four elements are sample values and the last field is the status of operation for tiled resources. Some return values may be unused, which is easily determined from the SSA form. The driver compiler is free to specialize the sample instruction to the most efficient form depending on which return values are used in computation.
-
-If applicable, each intrinsic is overloaded on return type, e.g.::
-
-  %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
-  %dx.types.ResRet.f16 = type { half, half, half, half, i32 }
-
-  declare %dx.types.ResRet.f32 @dx.op.sample.f32(...)
-  declare %dx.types.ResRet.f16 @dx.op.sample.f16(...)
-
-Wherever applicable, the return type indicates the "precision" at which the operation is executed. For example, sample intrinsic that returns half data is allowed to be executed at half precision, assuming hardware supports this; however, if the return type is float, the sample operation must be executed in float precision. If lower-precision is not supported by hardware, it is allowed to execute a higher-precision variant of the operation.
-
-The opcode parameter uniquely identifies the sample operation. More details can be found in the Instructions section. The value of opcode is the same for all overloads of an operation.
-
-Some resource operations are "polymorphic" with respect to resource types, e.g., dx.op.sample.f32 operates on several resource types: Texture1D[Array], Texture2D[Array], Texture3D, TextureCUBE[Array].
-
-Each resource/sampler is represented by a pair of i32 values. The first value is a unique (virtual) resource range ID, which corresponds to HLSL declaration of a resource/sampler. Range ID must be a constant for SM5.1 and below. The second integer is a 0-based index within the range. The index must be constant for SM5.0 and below.
-
-Both indices can be dynamic for SM6 and later to provide flexibility in usage of resources/samplers in control flow, e.g.::
-
-  Texture2D<float4> a[8], b[8];
-  ...
-  Texture2D<float4> c;
-  if(cond)	// arbitrary expression
-    c = a[idx1];
-  else
-    c = b[idx2];
-  ... = c.Sample(...);
-
-Resources/samplers used in such a way must reside in descriptor tables (cannot be root descriptors); this will be validated during shader and root signature setup.
-
-The DXIL verifier will ensure that all leaf-ranges (a and b above) of such a resource/sampler live-range have the same resource/sampler type and element type. If applicable, this constraint may be relaxed in the future. In particular, it is logical from HLSL programmer point of view to issue loads on compatible resource types, e.g., Texture2D, RWTexture2D, ROVTexture2D::
-
-  Texture2D<float4> a[8];
-  RWTexture2D<float4> b[6];
-  ...
-  Texture2D<float4> c;
-  if(cond)	// arbitrary expression
-   c = a[idx1];
-  else
-   c = b[idx2];
-  ... = c.Load(...);
-
-LLVM's undef value is used for unused input parameters. For example, coordinates c2 and c3 in an dx.op.sample.f32 call for Texture2D are undef, as only two coordinates c0 and c1 are required.
-
-If the clamp parameter is unused, its default value is 0.0f.
-
-Resource operations are not overloaded on input parameter types. For example, dx.op.sample.f32 operation does not have an overload where coordinates have half, rather than float, data type. Instead, the precision of input arguments can be inferred from the IR via a straightforward lookup along an SSA edge, e.g.::
-
-  %c0 = fpext half %0 to float
-  %res = call %dx.types.ResRet.f32 @dx.op.sample.f32(..., %c0, ...)
-
-SSA form makes it easy to infer that value %0 of type half got promoted to float. The driver compiler can tailor the instruction to the most efficient form for the target hardware.
-
-Resource operations
--------------------
-
-The section lists resource access operations. The specification is given for float return type, if applicable. The list of all overloads can be found in the appendix on intrinsic operations.
-
-Some general rules to interpret resource operations:
-
-* The number of active (meaningful) return components is determined by resource element type. Other return values must be unused; validator ensures this.
-* GPU instruction needs status only if the status return value is used in the program, which is determined through SSA.
-* Overload suffixes are specified for each resource operation.
-* Type of resource determines which inputs must be defined. Unused inputs are passed typed LLVM 'undef' values. This is checked by the DXIL validator.
-* Offset input parameters are i8 constants in [-8,+7] range; default offset is 0.
-
-Resource operation return types
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Many resource operations return several scalar values as well as status for tiled resource access. The return values are grouped into a helper structure type, as this is LLVM's way to return several values from the operation. After an operation, helper types are immediately decomposed into scalars, which are used in further computation.
-
-The defined helper types are listed below::
-
-  %dx.types.ResRet.i8  = type { i8, i8, i8, i8, i32 }
-  %dx.types.ResRet.i16 = type { i16, i16, i16, i16, i32 }
-  %dx.types.ResRet.i32 = type { i32, i32, i32, i32, i32 }
-  %dx.types.ResRet.i64 = type { i64, i64, i64, i64, i32 }
-  %dx.types.ResRet.f16 = type { half, half, half, half, i32 }
-  %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
-  %dx.types.ResRet.f64 = type { double, double, double, double, i32 }
-
-  %dx.types.Dimensions = type { i32, i32, i32, i32 }
-  %dx.types.SamplePos  = type { float, float }
-
-Resource handles
-~~~~~~~~~~~~~~~~
-
-Resources are identified via handles passed to resource operations. Handles are represented via opaque type::
-
-  %dx.types.Handle     = type { i8 * }
-
-The handles are created out of resource range ID and index into the range::
-
-  declare %dx.types.Handle @dx.op.createHandle(
-      i32,                  ; opcode
-      i8,                   ; resource class: SRV=0, UAV=1, CBV=2, Sampler=3
-      i32,                  ; resource range ID (constant)
-      i32,                  ; index into the range
-      i1)                   ; non-uniform resource index: false or true
-
-Resource class is a constant that indicates which metadata list (SRV, UAV, CBV, Sampler) to use for property queries.
-
-Resource range ID is an i32 constant, which is the position of the metadata record in the corresponding metadata list. Range IDs start with 0 and are contiguous within each list.
-
-Index is an i32 value that may be a constant or a value computed by the shader.
-
-CBufferLoadLegacy
-~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-  
-  ; overloads: SM5.1: f32|i32|f64,  future SM: possibly deprecated
-  %dx.types.CBufRet.f32 = type { float, float, float, float }
-  declare %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; resource handle
-      i32)                  ; 0-based row index (row = 16-byte DXBC register)
-
-Valid resource types: ConstantBuffer. Valid shader model: SM5.1 and earlier.
-
-The operation loads four 32-bit values from a constant buffer, which has legacy, 16-byte layout. Values are extracted via "extractvalue" instruction; unused values may be optimized away by the driver compiler. The operation respects SM5.1 and earlier OOB behavior for cbuffers.
-
-CBufferLoad
-~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32|i32|f64,  SM6.0: f16|f32|f64|i16|i32|i64
-  declare float @dx.op.cbufferLoad.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; resource handle
-      i32,                  ; byte offset from the start of the buffer memory
-      i32)                  ; read alignment
-
-Valid resource types: ConstantBuffer.
-
-The operation loads a value from a constant buffer, which has linear layout, using 1D index: byte offset from the beginning of the buffer memory. The operation respects SM5.1 and earlier OOB behavior for cbuffers.
-
-Read alignment is a constant value identifying what the byte offset alignment is. If the actual byte offset does not have this alignment, the results of this operation are undefined.
-
-GetDimensions
-~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  declare %dx.types.Dimensions @dx.op.getDimensions(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; resource handle
-      i32)                  ; MIP level
-
-This table describes the return component meanings for each resource type { c0, c1, c2, c3 }.
-
-==================== ===== ========== ========== ==========
-Valid resource types c0    c1         c2         c3
-==================== ===== ========== ========== ==========
-[RW]Texture1D        width undef      undef      MIP levels
-[RW]Texture1DArray   width array size undef      MIP levels
-[RW]Texture2D        width height     undef      MIP levels
-[RW]Texture2DArray   width height     array size MIP levels
-[RW]Texture3D        width height     depth      MIP levels
-[RW]Texture2DMS      width height     undef      samples
-[RW]Texture2DMSArray width height     array size samples
-TextureCUBE          width height     undef      MIP levels
-TextureCUBEArray     width height     array size MIP levels
-[RW]TypedBuffer      width undef      undef      undef
-[RW]RawBuffer        width undef      undef      undef
-[RW]StructuredBuffer width undef      undef      undef
-==================== ===== ========== ========== ==========
-
-MIP levels is always undef for RW resources.  Undef means the component will not be used.  The validator will verify this.
-There is no GetDimensions that returns float values.
-
-Sample
-~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32,  SM6.0: f16|f32
-  declare %dx.types.ResRet.f32 @dx.op.sample.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32,                  ; offset o2
-      float)                ; clamp
-
-=================== ================================ ===================
-Valid resource type # of active coordinates          # of active offsets
-=================== ================================ ===================
-Texture1D           1 (c0)                           1 (o0)
-Texture1DArray      2 (c0, c1 = array slice)         1 (o0)
-Texture2D           2 (c0, c1)                       2 (o0, o1)
-Texture2DArray      3 (c0, c1, c2 = array slice)     2 (o0, o1)
-Texture3D           3 (c0, c1, c2)                   3 (o0, o1, o2)
-TextureCUBE         3 (c0, c1, c2)                   3 (o0, o1, o2)
-TextureCUBEArray    4 (c0, c1, c2, c3 = array slice) 3 (o0, o1, o2)
-=================== ================================ ===================
-
-SampleBias
-~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32,  SM6.0: f16|f32
-  declare %dx.types.ResRet.f32 @dx.op.sampleBias.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32,                  ; offset o2
-      float,                ; bias: in [-16.f,15.99f]
-      float)                ; clamp
-
-Valid resource types and active components/offsets are the same as for the sample operation.
-
-SampleLevel
-~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32,  SM6.0: f16|f32
-  declare %dx.types.ResRet.f32 @dx.op.sampleLevel.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32,                  ; offset o2
-      float)                ; LOD
-
-Valid resource types and active components/offsets are the same as for the sample operation.
-
-SampleGrad
-~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32,  SM6.0: f16|f32
-  declare %dx.types.ResRet.f32 @dx.op.sampleGrad.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32,                  ; offset o2
-      float,                ; ddx0
-      float,                ; ddx1
-      float,                ; ddx2
-      float,                ; ddy0
-      float,                ; ddy1
-      float,                ; ddy2
-      float)                ; clamp
-
-Valid resource types and active components and offsets are the same as for the sample operation. Valid active ddx and ddy are the same as offsets.
-
-SampleCmp
-~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32,  SM6.0: f16|f32
-  declare %dx.types.ResRet.f32 @dx.op.sampleCmp.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32,                  ; offset o2
-      float,                ; compare value
-      float)                ; clamp
-
-=================== ================================ ===================
-Valid resource type # of active coordinates          # of active offsets
-=================== ================================ ===================
-Texture1D           1 (c0)                           1 (o0)
-Texture1DArray      2 (c0, c1 = array slice)         1 (o0)
-Texture2D           2 (c0, c1)                       2 (o0, o1)
-Texture2DArray      3 (c0, c1, c2 = array slice)     2 (o0, o1)
-TextureCUBE         3 (c0, c1, c2)                   3 (o0, o1, o2)
-TextureCUBEArray    4 (c0, c1, c2, c3 = array slice) 3 (o0, o1, o2)
-=================== ================================ ===================
-
-SampleCmpLevelZero
-~~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32,  SM6.0: f16|f32
-  declare %dx.types.ResRet.f32 @dx.op.sampleCmpLevelZero.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32,                  ; offset o2
-      float)                ; compare value
-
-Valid resource types and active components/offsets are the same as for the sampleCmp operation.
-
-TextureLoad
-~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
-  declare %dx.types.ResRet.f32 @dx.op.textureLoad.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      i32,                  ; MIP level; sample for Texture2DMS
-      i32,                  ; coordinate c0
-      i32,                  ; coordinate c1
-      i32,                  ; coordinate c2
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32)                  ; offset o2
-
-=================== ========= ============================ ===================
-Valid resource type MIP level # of active coordinates      # of active offsets
-=================== ========= ============================ ===================
-Texture1D           yes       1 (c0)                       1 (o0)
-RWTexture1D         undef     1 (c0)                       undef
-Texture1DArray      yes       2 (c0, c1 = array slice)     1 (o0)
-RWTexture1DArray    undef     2 (c0, c1 = array slice)     undef
-Texture2D           yes       2 (c0, c1)                   2 (o0, o1)
-RWTexture2D         undef     2 (c0, c1)                   undef
-Texture2DArray      yes       3 (c0, c1, c2 = array slice) 2 (o0, o1)
-RWTexture2DArray    undef     3 (c0, c1, c2 = array slice) undef
-Texture3D           yes       3 (c0, c1, c2)               3 (o0, o1, o2)
-RWTexture3D         undef     3 (c0, c1, c2)               undef
-=================== ========= ============================ ===================
-
-For Texture2DMS:
-
-=================== ============ =================================
-Valid resource type Sample index # of active coordinate components
-=================== ============ =================================
-Texture2DMS         yes          2 (c0, c1)
-Texture2DMSArray    yes          3 (c0, c1, c2 = array slice)
-=================== ============ =================================
-
-TextureStore
-~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
-  ; returns: status
-  declare void @dx.op.textureStore.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      i32,                  ; coordinate c0
-      i32,                  ; coordinate c1
-      i32,                  ; coordinate c2
-      float,                ; value v0
-      float,                ; value v1
-      float,                ; value v2
-      float,                ; value v3
-      i8)                   ; write mask
-
-The write mask indicates which components are written (x - 1, y - 2, z - 4, w - 8), similar to DXBC. The mask must cover all resource components.
-
-=================== =================================
-Valid resource type # of active coordinate components
-=================== =================================
-RWTexture1D         1 (c0)
-RWTexture1DArray    2 (c0, c1 = array slice)
-RWTexture2D         2 (c0, c1)
-RWTexture2DArray    3 (c0, c1, c2 = array slice)
-RWTexture3D         3 (c0, c1, c2)
-=================== =================================
-
-CalculateLOD
-~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; returns: LOD
-  declare float @dx.op.calculateLOD.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0, [0.0, 1.0]
-      float,                ; coordinate c1, [0.0, 1.0]
-      float,                ; coordinate c2, [0.0, 1.0]
-      i1)                   ; true - clamped; false - unclamped
-
-============================= =======================
-Valid resource type           # of active coordinates
-============================= =======================
-Texture1D, Texture1DArray     1 (c0)
-Texture2D, Texture2DArray     2 (c0, c1)
-Texture3D                     3 (c0, c1, c2)
-TextureCUBE, TextureCUBEArray 3 (c0, c1, c2)
-============================= =======================
-
-TextureGather
-~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
-  declare %dx.types.ResRet.f32 @dx.op.textureGather.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32)                  ; channel, constant in {0=red,1=green,2=blue,3=alpha}
-
-=================== ================================ ===================
-Valid resource type # of active coordinates          # of active offsets
-=================== ================================ ===================
-Texture2D           2 (c0, c1)                       2 (o0, o1)
-Texture2DArray      3 (c0, c1, c2 = array slice)     2 (o0, o1)
-TextureCUBE         3 (c0, c1, c2)                   0
-TextureCUBEArray    4 (c0, c1, c2, c3 = array slice) 0
-=================== ================================ ===================
-
-TextureGatherCmp
-~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
-  declare %dx.types.ResRet.f32 @dx.op.textureGatherCmp.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      %dx.types.Handle,     ; sampler handle
-      float,                ; coordinate c0
-      float,                ; coordinate c1
-      float,                ; coordinate c2
-      float,                ; coordinate c3
-      i32,                  ; offset o0
-      i32,                  ; offset o1
-      i32,                  ; channel, constant in {0=red,1=green,2=blue,3=alpha}
-      float)                ; compare value
-
-Valid resource types and active components/offsets are the same as for the textureGather operation.
-
-Texture2DMSGetSamplePosition
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  declare %dx.types.SamplePos @dx.op.texture2DMSGetSamplePosition(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; texture handle
-      i32)                  ; sample ID
-
-Returns sample position of a texture.
-
-RenderTargetGetSamplePosition
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  declare %dx.types.SamplePos @dx.op.renderTargetGetSamplePosition(
-      i32,                  ; opcode
-      i32)                  ; sample ID
-
-Returns sample position of a render target.
-
-RenderTargetGetSampleCount
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  declare i32 @dx.op.renderTargetGetSampleCount(
-      i32)                  ; opcode
-
-Returns sample count of a render target.
-
-BufferLoad
-~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32|i32,  SM6.0: f32|i32
-  declare %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; resource handle
-      i32,                  ; coordinate c0
-      i32)                  ; coordinate c1
-
-The call respects SM5.1 OOB and alignment rules.
-
-==================== =====================================================
-Valid resource type  # of active coordinates
-==================== =====================================================
-[RW]TypedBuffer      1 (c0 in elements)
-[RW]RawBuffer        1 (c0 in bytes)
-[RW]StructuredBuffer 2 (c0 in elements, c1 = byte offset into the element)
-==================== =====================================================
-
-BufferStore
-~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: f32|i32,  SM6.0: f32|i32
-  ; returns: status
-  declare void @dx.op.bufferStore.f32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; resource handle
-      i32,                  ; coordinate c0
-      i32,                  ; coordinate c1
-      float,                ; value v0
-      float,                ; value v1
-      float,                ; value v2
-      float,                ; value v3
-      i8)                   ; write mask
-
-The call respects SM5.1 OOB and alignment rules.
-
-The write mask indicates which components are written (x - 1, y - 2, z - 4, w - 8), similar to DXBC. For RWTypedBuffer, the mask must cover all resource components. For RWRawBuffer and RWStructuredBuffer, valid masks are: x, xy, xyz, xyzw.
-
-=================== =====================================================
-Valid resource type # of active coordinates
-=================== =====================================================
-RWTypedBuffer       1 (c0 in elements)
-RWRawBuffer         1 (c0 in bytes)
-RWStructuredBuffer  2 (c0 in elements, c1 = byte offset into the element)
-=================== =====================================================
-
-BufferUpdateCounter
-~~~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; opcodes: bufferUpdateCounter
-  declare void @dx.op.bufferUpdateCounter(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; buffer handle
-      i8)                   ; 1 - increment, -1 - decrement
-
-Valid resource type: RWRawBuffer.
-
-AtomicBinOp
-~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: i32,  SM6.0: i32
-  ; returns: original value in memory before the operation
-  declare i32 @dx.op.atomicBinOp.i32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; resource handle
-      i32,                  ; binary operation code: EXCHANGE, IADD, AND, OR, XOR, IMIN, IMAX, UMIN, UMAX
-      i32,                  ; coordinate c0
-      i32,                  ; coordinate c1
-      i32,                  ; coordinate c2
-      i32)                  ; new value
-
-The call respects SM5.1 OOB and alignment rules.
-
-=================== =====================================================
-Valid resource type # of active coordinates
-=================== =====================================================
-RWTexture1D         1 (c0)
-RWTexture1DArray    2 (c0, c1 = array slice)
-RWTexture2D         2 (c0, c1)
-RWTexture2DArray    3 (c0, c1, c2 = array slice)
-RWTexture3D         3 (c0, c1, c2)
-RWTypedBuffer       1 (c0 in elements)
-RWRawBuffer         1 (c0 in bytes)
-RWStructuredBuffer  2 (c0 in elements, c1 - byte offset into the element)
-=================== =====================================================
-
-AtomicBinOp subsumes corresponding DXBC atomic operations that do not return the old value in memory. The driver compiler is free to specialize the corresponding GPU instruction if the return value is unused.
-
-AtomicCompareExchange
-~~~~~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; overloads: SM5.1: i32,  SM6.0: i32
-  ; returns: original value in memory before the operation
-  declare i32 @dx.op.atomicCompareExchange.i32(
-      i32,                  ; opcode
-      %dx.types.Handle,     ; resource handle
-      i32,                  ; coordinate c0
-      i32,                  ; coordinate c1
-      i32,                  ; coordinate c2
-      i32,                  ; comparison value
-      i32)                  ; new value
-
-The call respects SM5.1 OOB and alignment rules.
-
-=================== =====================================================
-Valid resource type # of active coordinates
-=================== =====================================================
-RWTexture1D         1 (c0)
-RWTexture1DArray    2 (c0, c1 = array slice)
-RWTexture2D         2 (c0, c1)
-RWTexture2DArray    3 (c0, c1, c2 = array slice)
-RWTexture3D         3 (c0, c1, c2)
-RWTypedBuffer       1 (c0 in elements)
-RWRawBuffer         1 (c0 in bytes)
-RWStructuredBuffer  2 (c0 in elements, c1 - byte offset into the element)
-=================== =====================================================
-
-AtomicCompareExchange subsumes DXBC's atomic compare store. The driver compiler is free to specialize the corresponding GPU instruction if the return value is unused.
-
-GetBufferBasePtr (SM6.0)
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following signature shows the operation syntax::
-
-  ; returns: i8* pointer to the base of [RW]RawBuffer instance
-  declare i8 addrspace(ASmemory) * @dx.op.getBufferBasePtr.pASmemory (
-      i32,                ; opcode
-      %dx.types.Handle)   ; resource handle
-  ; returns: i8* pointer to the base of ConstantBuffer instance
-  declare i8 addrspace(AScbuffer) * @dx.op.getBufferBasePtr.pAScbuffer(
-      i32,                ; opcode
-      %dx.types.Handle)   ; resource handle
-
-Given SM5.1 resource handle, return base pointer to perform pointer-based accesses to the resource memory.
-
-Note: the functionality is requested for SM6.0 to support pointer-based accesses to SM5.1 resources with raw linear memory (raw buffer and cbuffer) in HLSL next. This would be one of the ways in which a valid pointer is produced in the shader, and would let new-style, pointer-based code access SM5.1 resources with linear memory view.
-
-Atomic operations via pointer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Groupshared memory atomic operations are done via LLVM atomic instructions atomicrmw and cmpxchg. The instructions accept only i32 addrspace(ASgs) * pointers, where ASgs is the addrspace number of groupshared variables. Atomicrmw instruction does not support 'sub' and 'nand' operations. These constraints may be revisited in the future. OOB behavior is undefined.
-SM6.0 will enable a similar mechanism for atomic operations performed on device memory (raw buffer).
-
-Samplers
---------
-
-There are no intrinsics for samplers. Sampler reflection data is represented similarly to other resources.
-
-Immediate Constant Buffer
--------------------------
-There is no immediate constant buffer in DXIL. Instead, indexable constants are represented via LLVM global initialized constants in address space ASicb.
-
-Texture Buffers
----------------
-A texture buffer is mapped to RawBuffer. Texture buffer variable declarations are present for reflection purposes only.
-
-Groupshared memory
-------------------
-Groupshared memory (DXBC g-registers) is linear in DXIL. Groupshared variables are declared via global variables in addrspace(ASgs). The optimizer will not group variables; the driver compiler can do this if desired. Accesses to groupshared variables occur via pointer load/store instructions (see below).
-
-Indexable threadlocal memory
-----------------------------
-Indexable threadlocal memory (DXBC x-registers) is linear in DXIL. Threadlocal variables are "declared" via alloca instructions. Threadlocal variables are assumed to reside in addrspace(0). The variables are not allocated into some memory pool; the driver compiler can do this, if desired. Accesses to threadlocal variables occur via pointer load/store instructions (see below).
-
-Load/Store/Atomics via pointer in future SM
--------------------------------------------
-HLSL offers several abstractions with linear memory: buffers, cbuffers, groupshared and indexable threadlocal memory, that are conceptually similar, but have different HLSL syntax and some differences in behavior, which are exposed to HLSL developers. The plan is to introduce pointers into HLSL to unify access syntax to such linear-memory resources such that they appear conceptually the same to HLSL programmers.
-
-Each resource memory type is expressed by a unique LLVM address space. The following table shows memory types and their address spaces:
-
-========================================= =====================================
-Memory type                               Address space number n - addrspace(n)
-========================================= =====================================
-code, local, indexable threadlocal memory AS_default = 0
-device memory ([RW]RawBuffer)             AS_memory = 1
-cbuffer-like memory (ConstantBuffer)      AS_cbuffer = 2
-groupshared memory                        AS_groupshared = 3
-========================================= =====================================
-
-Pointers can be produced in the shader in a variety of ways (see Memory accesses section). Note that if GetBufferBasePtr was used on [RW]RawBuffer or ConstantBuffer to produce a pointer, the base pointer is stateless; i.e., it "loses its connection" to the underlying resource and is treated as a stateless pointer into a particular memory type.
-
-Additional resource properties
-------------------------------
-TODO: enumerate all additional resource range properties, e.g., ROV, Texture2DMS, globally coherent, UAV counter, sampler mode, CB: immediate/dynamic indexed.
-
-Operations
-==========
-DXIL operations are represented in two ways: using LLVM instructions and using LLVM external functions. The reference list of operations as well as their overloads can be found in the attached Excel spreadsheet "DXIL Operations".
-
-Operations via instructions
----------------------------
-
-DXIL uses a subset of core LLVM IR instructions that make sense for HLSL, where the meaning of the LLVM IR operation matches the meaning of the HLSL operation.
-
-The following LLVM instructions are valid in a DXIL program, with the specified operand types where applicable. The legend for overload types is: (v)oid, (h)alf, (f)loat, (d)ouble, (1)-bit, (8)-bit, (w)ord, (i)nt, (l)ong.
-
-
-.. <py>import hctdb_instrhelp</py>
-.. <py::lines('INSTR-RST')>hctdb_instrhelp.get_instrs_rst()</py>
-.. INSTR-RST:BEGIN
-
-============= ======================================================================= =================
-Instruction   Action                                                                  Operand overloads
-============= ======================================================================= =================
-Ret           returns a value (possibly void), from a function.                       vhfd1wil
-Br            branches (conditional or unconditional)
-Switch        performs a multiway switch
-Add           returns the sum of its two operands                                     wil
-FAdd          returns the sum of its two operands                                     hfd
-Sub           returns the difference of its two operands                              wil
-FSub          returns the difference of its two operands                              hfd
-Mul           returns the product of its two operands                                 wil
-FMul          returns the product of its two operands                                 hfd
-UDiv          returns the quotient of its two unsigned operands                       wil
-SDiv          returns the quotient of its two signed operands                         wil
-FDiv          returns the quotient of its two operands                                hfd
-URem          returns the remainder from the unsigned division of its two operands    wil
-SRem          returns the remainder from the signed division of its two operands      wil
-FRem          returns the remainder from the division of its two operands             hfd
-Shl           shifts left (logical)                                                   wil
-LShr          shifts right (logical), with zero bit fill                              wil
-AShr          shifts right (arithmetic), with 'a' operand sign bit fill               wil
-And           returns a  bitwise logical and of its two operands                      1wil
-Or            returns a bitwise logical or of its two operands                        1wil
-Xor           returns a bitwise logical xor of its two operands                       1wil
-Alloca        allocates memory on the stack frame of the currently executing function
-Load          reads from memory
-Store         writes to memory
-GetElementPtr gets the address of a subelement of an aggregate value
-AtomicCmpXchg atomically modifies memory
-AtomicRMW     atomically modifies memory
-Trunc         truncates an integer                                                    1wil
-ZExt          zero extends an integer                                                 1wil
-SExt          sign extends an integer                                                 1wil
-FPToUI        converts a floating point to UInt                                       hfd1wil
-FPToSI        converts a floating point to SInt                                       hfd1wil
-UIToFP        converts a UInt to floating point                                       hfd1wil
-SIToFP        converts a SInt to floating point                                       hfd1wil
-FPTrunc       truncates a floating point                                              hfd
-FPExt         extends a floating point                                                hfd
-BitCast       performs a bit-preserving type cast                                     hfd1wil
-AddrSpaceCast casts a value addrspace
-ICmp          compares integers                                                       1wil
-FCmp          compares floating points                                                hfd
-PHI           is a PHI node instruction
-Call          calls a function
-Select        selects an instruction
-ExtractValue  extracts from aggregate
-============= ======================================================================= =================
-
-
-.. INSTR-RST:END
-
-Operations via external functions
----------------------------------
-Operations missing in core LLVM IR, such as abs, fma, discard, etc., are represented by external functions, whose name is prefixed with dx.op.
-
-The very first parameter of each such external function is the opcode of the operation, which is an i32 constant. For example, dx.op.unary computes a unary function T res = opcode(T input). Opcode defines which unary function to perform.
-
-Opcodes are defined on a dense range and will be provided as an enum in a header file. The opcode parameter is introduced for efficiency reasons: grouping of operations to reduce the total number of overloads and more efficient property lookup, e.g., via an array of operation properties rather than a hash table.
-
-.. <py::lines('OPCODES-RST')>hctdb_instrhelp.get_opcodes_rst()</py>
-.. OPCODES-RST:BEGIN
-
-=== ============================= =================================================================================================================
-ID  Name                          Description
-=== ============================= =================================================================================================================
-0   TempRegLoad_                  Helper load operation
-1   TempRegStore_                 Helper store operation
-2   MinPrecXRegLoad_              Helper load operation for minprecision
-3   MinPrecXRegStore_             Helper store operation for minprecision
-4   LoadInput_                    Loads the value from shader input
-5   StoreOutput_                  Stores the value to shader output
-6   FAbs_                         returns the absolute value of the input value.
-7   Saturate_                     clamps the result of a single or double precision floating point value to [0.0f...1.0f]
-8   IsNaN_                        Returns true if x is NAN or QNAN, false otherwise.
-9   IsInf_                        Returns true if x is +INF or -INF, false otherwise.
-10  IsFinite_                     Returns true if x is finite, false otherwise.
-11  IsNormal_                     returns IsNormal
-12  Cos_                          returns cosine(theta) for theta in radians.
-13  Sin_                          returns sine(theta) for theta in radians.
-14  Tan_                          returns tan(theta) for theta in radians.
-15  Acos_                         Returns the arccosine of the specified value. Input should be a floating-point value within the range of -1 to 1.
-16  Asin_                         Returns the arcsine of the specified value. Input should be a floating-point value within the range of -1 to 1.
-17  Atan_                         Returns the arctangent of the specified value. The return value is within the range of -PI/2 to PI/2.
-18  Hcos_                         returns the hyperbolic cosine of the specified value.
-19  Hsin_                         returns the hyperbolic sine of the specified value.
-20  Htan_                         returns the hyperbolic tangent of the specified value.
-21  Exp_                          returns 2^exponent
-22  Frc_                          extract fractional component.
-23  Log_                          returns log base 2.
-24  Sqrt_                         returns square root
-25  Rsqrt_                        returns reciprocal square root (1 / sqrt(src))
-26  Round_ne_                     floating-point round to integral float.
-27  Round_ni_                     floating-point round to integral float.
-28  Round_pi_                     floating-point round to integral float.
-29  Round_z_                      floating-point round to integral float.
-30  Bfrev_                        Reverses the order of the bits.
-31  Countbits_                    Counts the number of bits in the input integer.
-32  FirstbitLo_                   Returns the location of the first set bit starting from the lowest order bit and working upward.
-33  FirstbitHi_                   Returns the location of the first set bit starting from the highest order bit and working downward.
-34  FirstbitSHi_                  Returns the location of the first set bit from the highest order bit based on the sign.
-35  FMax_                         returns a if a >= b, else b
-36  FMin_                         returns a if a < b, else b
-37  IMax_                         IMax(a,b) returns a if a > b, else b
-38  IMin_                         IMin(a,b) returns a if a < b, else b
-39  UMax_                         unsigned integer maximum. UMax(a,b) = a > b ? a : b
-40  UMin_                         unsigned integer minimum. UMin(a,b) = a < b ? a : b
-41  IMul_                         multiply of 32-bit operands to produce the correct full 64-bit result.
-42  UMul_                         multiply of 32-bit operands to produce the correct full 64-bit result.
-43  UDiv_                         unsigned divide of the 32-bit operand src0 by the 32-bit operand src1.
-44  UAddc_                        unsigned add of 32-bit operand with the carry
-45  USubb_                        unsigned subtract of 32-bit operands with the borrow
-46  FMad_                         floating point multiply & add
-47  Fma_                          fused multiply-add
-48  IMad_                         Signed integer multiply & add
-49  UMad_                         Unsigned integer multiply & add
-50  Msad_                         masked Sum of Absolute Differences.
-51  Ibfe_                         Integer bitfield extract
-52  Ubfe_                         Unsigned integer bitfield extract
-53  Bfi_                          Given a bit range from the LSB of a number, places that number of bits in another number at any offset
-54  Dot2_                         Two-dimensional vector dot-product
-55  Dot3_                         Three-dimensional vector dot-product
-56  Dot4_                         Four-dimensional vector dot-product
-57  CreateHandle                  creates the handle to a resource
-58  CBufferLoad                   loads a value from a constant buffer resource
-59  CBufferLoadLegacy             loads a value from a constant buffer resource
-60  Sample                        samples a texture
-61  SampleBias                    samples a texture after applying the input bias to the mipmap level
-62  SampleLevel                   samples a texture using a mipmap-level offset
-63  SampleGrad                    samples a texture using a gradient to influence the way the sample location is calculated
-64  SampleCmp                     samples a texture and compares a single component against the specified comparison value
-65  SampleCmpLevelZero            samples a texture and compares a single component against the specified comparison value
-66  TextureLoad                   reads texel data without any filtering or sampling
-67  TextureStore                  writes texel data without any filtering or sampling
-68  BufferLoad                    reads from a TypedBuffer
-69  BufferStore                   writes to a RWTypedBuffer
-70  BufferUpdateCounter           atomically increments/decrements the hidden 32-bit counter stored with a Count or Append UAV
-71  CheckAccessFullyMapped        determines whether all values from a Sample, Gather, or Load operation accessed mapped tiles in a tiled resource
-72  GetDimensions                 gets texture size information
-73  TextureGather                 gathers the four texels that would be used in a bi-linear filtering operation
-74  TextureGatherCmp              same as TextureGather, except this instruction performs comparison on texels, similar to SampleCmp
-75  Texture2DMSGetSamplePosition  gets the position of the specified sample
-76  RenderTargetGetSamplePosition gets the position of the specified sample
-77  RenderTargetGetSampleCount    gets the number of samples for a render target
-78  AtomicBinOp                   performs an atomic operation on two operands
-79  AtomicCompareExchange         atomic compare and exchange to memory
-80  Barrier                       inserts a memory barrier in the shader
-81  CalculateLOD                  calculates the level of detail
-82  Discard                       discard the current pixel
-83  DerivCoarseX_                 computes the rate of change per stamp in x direction.
-84  DerivCoarseY_                 computes the rate of change per stamp in y direction.
-85  DerivFineX_                   computes the rate of change per pixel in x direction.
-86  DerivFineY_                   computes the rate of change per pixel in y direction.
-87  EvalSnapped                   evaluates an input attribute at pixel center with an offset
-88  EvalSampleIndex               evaluates an input attribute at a sample location
-89  EvalCentroid                  evaluates an input attribute at pixel center
-90  SampleIndex                   returns the sample index in a sample-frequency pixel shader
-91  Coverage                      returns the coverage mask input in a pixel shader
-92  InnerCoverage                 returns underestimated coverage input from conservative rasterization in a pixel shader
-93  ThreadId                      reads the thread ID
-94  GroupId                       reads the group ID (SV_GroupID)
-95  ThreadIdInGroup               reads the thread ID within the group (SV_GroupThreadID)
-96  FlattenedThreadIdInGroup      provides a flattened index for a given thread within a given group (SV_GroupIndex)
-97  EmitStream                    emits a vertex to a given stream
-98  CutStream                     completes the current primitive topology at the specified stream
-99  EmitThenCutStream             equivalent to an EmitStream followed by a CutStream
-100 GSInstanceID                  GSInstanceID
-101 MakeDouble                    creates a double value
-102 SplitDouble                   splits a double into low and high parts
-103 LoadOutputControlPoint        LoadOutputControlPoint
-104 LoadPatchConstant             LoadPatchConstant
-105 DomainLocation                DomainLocation
-106 StorePatchConstant            StorePatchConstant
-107 OutputControlPointID          OutputControlPointID
-108 PrimitiveID                   PrimitiveID
-109 CycleCounterLegacy            CycleCounterLegacy
-110 WaveIsFirstLane               returns 1 for the first lane in the wave
-111 WaveGetLaneIndex              returns the index of the current lane in the wave
-112 WaveGetLaneCount              returns the number of lanes in the wave
-113 WaveAnyTrue                   returns 1 if any of the lanes evaluates the value to true
-114 WaveAllTrue                   returns 1 if all the lanes evaluate the value to true
-115 WaveActiveAllEqual            returns 1 if all the lanes have the same value
-116 WaveActiveBallot              returns a struct with a bit set for each lane where the condition is true
-117 WaveReadLaneAt                returns the value from the specified lane
-118 WaveReadLaneFirst             returns the value from the first lane
-119 WaveActiveOp                  returns the result of the operation across waves
-120 WaveActiveBit                 returns the result of the operation across all lanes
-121 WavePrefixOp                  returns the result of the operation on prior lanes
-122 QuadReadLaneAt                reads from a lane in the quad
-123 QuadOp                        returns the result of a quad-level operation
-124 BitcastI16toF16               bitcast between different sizes
-125 BitcastF16toI16               bitcast between different sizes
-126 BitcastI32toF32               bitcast between different sizes
-127 BitcastF32toI32               bitcast between different sizes
-128 BitcastI64toF64               bitcast between different sizes
-129 BitcastF64toI64               bitcast between different sizes
-130 LegacyF32ToF16                legacy function to convert float (f32) to half (f16) (this is not related to min-precision)
-131 LegacyF16ToF32                legacy function to convert half (f16) to float (f32) (this is not related to min-precision)
-132 LegacyDoubleToFloat           legacy function to convert double to float
-133 LegacyDoubleToSInt32          legacy function to convert double to int32
-134 LegacyDoubleToUInt32          legacy function to convert double to uint32
-135 WaveAllBitCount               returns the count of bits set to 1 across the wave
-136 WavePrefixBitCount            returns the count of bits set to 1 on prior lanes
-137 AttributeAtVertex_            returns the values of the attributes at the vertex.
-138 ViewID                        returns the view index
-=== ============================= =================================================================================================================
-
-
-Acos
-~~~~
-
-The return value is within the range of 0 to PI.
-
-+----------+------+--------------+---------+------+------+---------+------+-----+
-| src      | -inf | [-1,1]       | -denorm | -0   | +0   | +denorm | +inf | NaN |
-+----------+------+--------------+---------+------+------+---------+------+-----+
-| acos(src)|  NaN | [0,PI]       |    PI/2 | PI/2 | PI/2 |    PI/2 |  NaN | NaN |
-+----------+------+--------------+---------+------+------+---------+------+-----+
-
-Asin
-~~~~
-
-The return value is within the range of -PI/2 to PI/2.
-
-+----------+------+--------------+---------+------+------+---------+------+-----+
-| src      | -inf | [-1,1]       | -denorm | -0   | +0   | +denorm | +inf | NaN |
-+----------+------+--------------+---------+------+------+---------+------+-----+
-| asin(src)|  NaN | (-PI/2,+PI/2)|    0    |  0   |  0   |    0    |  NaN | NaN |
-+----------+------+--------------+---------+------+------+---------+------+-----+
-
-Atan
-~~~~
-
-+----------+------+--------------+---------+------+------+---------+---------------+-----+-----+
-| src      | -inf | -F           | -denorm | -0   | +0   | +denorm | +F            |+inf | NaN |
-+----------+------+--------------+---------+------+------+---------+---------------+-----+-----+
-| atan(src)| -PI/2| (-PI/2,+PI/2)|    0    |  0   |  0   |    0    | (-PI/2,+PI/2) |PI/2 | NaN |
-+----------+------+--------------+---------+------+------+---------+---------------+-----+-----+
-
-Returns the arctangent of the specified value. The return value is within the range of -PI/2 to PI/2
-
-AttributeAtVertex
-~~~~~~~~~~~~~~~~~
-
-returns the values of the attributes at the vertex. VertexID ranges from 0 to 2.
-
-Bfi
-~~~
-
-Given a bit range from the LSB of a number, place that number of bits in another number at any offset.
-
-dst = Bfi(src0, src1, src2, src3);
-
-The LSB 5 bits of src0 provide the bitfield width (0-31) to take from src2.
-The LSB 5 bits of src1 provide the bitfield offset (0-31) to start replacing bits in the number read from src3.
-Given width, offset: bitmask = (((1 << width)-1) << offset) & 0xffffffff, dest = ((src2 << offset) & bitmask) | (src3 & ~bitmask)
-
-Bfrev
-~~~~~
-
-Reverses the order of the bits. For example given 0x12345678 the result would be 0x1e6a2c48.
-
-Cos
-~~~
-
-Theta values can be any IEEE 32-bit floating point values.
-
-The maximum absolute error is 0.0008 in the interval from -100*Pi to +100*Pi.
-
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| cos(src) |  NaN | [-1 to +1] |      +1 | +1 | +1 |      +1 | [-1 to +1] |  NaN | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-
-Countbits
-~~~~~~~~~
-
-Counts the number of bits in the input integer.
-
-DerivCoarseX
-~~~~~~~~~~~~
-
-dst = DerivCoarseX(src);
-
-Computes the rate of change per stamp in x direction. Only a single x derivative pair is computed for each 2x2 stamp of pixels.
-The data in the current Pixel Shader invocation may or may not participate in the calculation of the requested derivative, given the derivative will be calculated only once per 2x2 quad:
-As an example, the x derivative could be a delta from the top row of pixels.
-The exact calculation is up to the hardware vendor. There is also no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
-
-DerivCoarseY
-~~~~~~~~~~~~
-
-dst = DerivCoarseY(src);
-
-Computes the rate of change per stamp in y direction. Only a single y derivative pair is computed for each 2x2 stamp of pixels.
-The data in the current Pixel Shader invocation may or may not participate in the calculation of the requested derivative, given the derivative will be calculated only once per 2x2 quad:
-As an example, the y derivative could be a delta from the left column of pixels.
-The exact calculation is up to the hardware vendor. There is also no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
-
-DerivFineX
-~~~~~~~~~~
-
-dst = DerivFineX(src);
-
-Computes the rate of change per pixel in x direction. Each pixel in the 2x2 stamp gets a unique pair of x derivative calculations.
-The data in the current Pixel Shader invocation always participates in the calculation of the requested derivative.
-There is no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
-
-DerivFineY
-~~~~~~~~~~
-
-dst = DerivFineY(src);
-
-Computes the rate of change per pixel in y direction. Each pixel in the 2x2 stamp gets a unique pair of y derivative calculations.
-The data in the current Pixel Shader invocation always participates in the calculation of the requested derivative.
-There is no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
-
-Dot2
-~~~~
-
-Two-dimensional vector dot-product
-
-Dot3
-~~~~
-
-Three-dimensional vector dot-product
-
-Dot4
-~~~~
-
-Four-dimensional vector dot-product
-
-Exp
-~~~
-
-Returns 2^exponent. Note that hlsl exp intrinsic returns the base-e exponent. Maximum relative error is e^-21.
-
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| exp(src) |  0   | +F         |    1    |  1 |  1 |       1 | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-
-FAbs
-~~~~
-
-The FAbs instruction simply forces the sign of the number(s) in the source operand to positive, including on INF and denorm values.
-Applying FAbs on NaN preserves NaN, although the particular NaN bit pattern that results is not defined.
-
-FMad
-~~~~
-
-Floating point multiply & add. This operation is not fused for "precise" operations.
-FMad(a,b,c) = a * b + c
-
-FMax
-~~~~
-
->= is used instead of > so that if min(x,y) = x then max(x,y) = y.
-
-NaN has special handling: If one source operand is NaN, then the other source operand is returned.
-If both are NaN, any NaN representation is returned.
-This conforms to new IEEE 754R rules.
-
-Denorms are flushed (sign preserved) before comparison, however the result written to dest may or may not be denorm flushed.
-
-+------+-----------------------------+
-| a    | b                           |
-|      +------+--------+------+------+
-|      | -inf | F      | +inf | NaN  |
-+------+------+--------+------+------+
-| -inf | -inf | b      | +inf | -inf |
-+------+------+--------+------+------+
-| F    | a    | a or b | +inf | a    |
-+------+------+--------+------+------+
-| +inf | +inf | +inf   | +inf | +inf |
-+------+------+--------+------+------+
-| NaN  | -inf | b      | +inf | NaN  |
-+------+------+--------+------+------+
-
-FMin
-~~~~
-
-NaN has special handling: If one source operand is NaN, then the other source operand is returned.
-If both are NaN, any NaN representation is returned.
-This conforms to new IEEE 754R rules.
-
-Denorms are flushed (sign preserved) before comparison, however the result written to dest may or may not be denorm flushed.
-
-+------+-----------------------------+
-| a    | b                           |
-|      +------+--------+------+------+
-|      | -inf | F      | +inf | NaN  |
-+------+------+--------+------+------+
-| -inf | -inf | -inf   | -inf | -inf |
-+------+------+--------+------+------+
-| F    | -inf | a or b |    a |    a |
-+------+------+--------+------+------+
-| +inf | -inf | b      | +inf | +inf |
-+------+------+--------+------+------+
-| NaN  | -inf | b      | +inf | NaN  |
-+------+------+--------+------+------+
-
-FirstbitHi
-~~~~~~~~~~
-
-Returns the integer position of the first bit set in the 32-bit input starting from the MSB. For example, 0x10000000 would return 3. Returns 0xffffffff if no match was found.
-
-FirstbitLo
-~~~~~~~~~~
-
-Returns the integer position of the first bit set in the 32-bit input starting from the LSB. For example, 0x00000002 would return 1. Returns 0xffffffff if no match was found.
-
-FirstbitSHi
-~~~~~~~~~~~
-
-Returns the first 0 from the MSB if the number is negative, else the first 1 from the MSB. Returns 0xffffffff if no match was found.
-
-Fma
-~~~
-
-Fused multiply-add. This operation is only defined in double precision.
-Fma(a,b,c) = a * b + c
-
-Frc
-~~~
-
-+--------------+------+------+---------+----+----+---------+--------+------+-----+
-| src          | -inf | -F   | -denorm | -0 | +0 | +denorm | +F     | +inf | NaN |
-+--------------+------+------+---------+----+----+---------+--------+------+-----+
-| frc(src)     | NaN  |[+0,1)| +0      | +0 | +0 | +0      | [+0,1) | NaN  | NaN |
-+--------------+------+------+---------+----+----+---------+--------+------+-----+
-
-Hcos
-~~~~
-
-Returns the hyperbolic cosine of the specified value.
-
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| hcos(src)| +inf | (1, +inf)  |      +1 | +1 | +1 |      +1 | (1, +inf)  | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-
-Hsin
-~~~~
-
-Returns the hyperbolic sine of the specified value.
-
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| hsin(src)| -inf | -F         |       0 |  0 |  0 |       0 | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-
-Htan
-~~~~
-
-Returns the hyperbolic tangent of the specified value.
-
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| htan(src)| -1   | -F         |       0 |  0 |  0 |       0 | +F         | +1   | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-
-IMad
-~~~~
-
-Signed integer multiply & add
-
-IMad(a,b,c) = a * b + c
-
-IMax
-~~~~
-
-IMax(a,b) returns a if a > b, else b. Optional negate modifier on source operands takes 2's complement before performing operation.
-
-IMin
-~~~~
-
-IMin(a,b) returns a if a < b, else b. Optional negate modifier on source operands takes 2's complement before performing operation.
-
-IMul
-~~~~
-
-IMul(src0, src1) = destHi, destLo
-multiply of 32-bit operands src0 and src1 (note they are signed), producing the correct full 64-bit result.
-The low 32 bits are placed in destLO. The high 32 bits are placed in destHI.
-
-Either of destHI or destLO may be specified as NULL instead of specifying a register, in the case high or low 32 bits of the 64-bit result are not needed.
-
-Optional negate modifier on source operands takes 2's complement before performing arithmetic operation.
-
-Ibfe
-~~~~
-
-dest = Ibfe(src0, src1, src2)
-
-Given a range of bits in a number, shift those bits to the LSB and sign extend the MSB of the range.
-
-width : The LSB 5 bits of src0 (0-31).
-
-offset: The LSB 5 bits of src1 (0-31)
-
-.. code:: c
-
-    if( width == 0 )
-    {
-        dest = 0
-    }
-    else if( width + offset < 32 )
-    {
-        shl dest, src2, 32-(width+offset)
-        ishr dest, dest, 32-width
-    }
-    else
-    {
-        ishr dest, src2, offset
-    }
-
-IsFinite
-~~~~~~~~
-
-Returns true if x is finite, false otherwise.
-
-IsInf
-~~~~~
-
-Returns true if x is +INF or -INF, false otherwise.
-
-IsNaN
-~~~~~
-
-Returns true if x is NAN or QNAN, false otherwise.
-
-IsNormal
-~~~~~~~~
-
-Returns IsNormal.
-
-LoadInput
-~~~~~~~~~
-
-Loads the value from shader input
-
-Log
-~~~
-
-Returns log base 2. Note that hlsl log intrinsic returns natural log.
-
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| log(src) |  NaN | NaN        |    -inf |-inf|-inf|    -inf |  F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-
-MinPrecXRegLoad
-~~~~~~~~~~~~~~~
-
-Helper load operation for minprecision
-
-MinPrecXRegStore
-~~~~~~~~~~~~~~~~
-
-Helper store operation for minprecision
-
-Msad
-~~~~
-
-Returns the masked Sum of Absolute Differences.
-
-dest = msad(ref, src, accum)
-
-ref: contains 4 packed 8-bit unsigned integers in 32 bits.
-
-src: contains 4 packed 8-bit unsigned integers in 32 bits.
-
-accum: a 32-bit unsigned integer, providing an existing accumulation.
-
-dest receives the result of the masked SAD operation added to the accumulation value.
-
-.. code:: c
-
-    UINT msad( UINT ref, UINT src, UINT accum )
-    {
-        for (UINT i = 0; i < 4; i++)
-        {
-            BYTE refByte, srcByte, absDiff;
-
-            refByte = (BYTE)(ref >> (i * 8));
-            if (!refByte)
-            {
-                continue;
-            }
-
-            srcByte = (BYTE)(src >> (i * 8));
-            if (refByte >= srcByte)
-            {
-                absDiff = refByte - srcByte;
-            }
-            else
-            {
-                absDiff = srcByte - refByte;
-            }
-
-            // The recommended overflow behavior for MSAD is
-            // to do a 32-bit saturate. This is not
-            // required, however, and wrapping is allowed.
-            // So from an application point of view,
-            // overflow behavior is undefined.
-            if (UINT_MAX - accum < absDiff)
-            {
-                accum = UINT_MAX;
-                break;
-            }
-
-            accum += absDiff;
-        }
-
-        return accum;
-    }
-
-Round_ne
-~~~~~~~~
-
-Floating-point round of the values in src,
-writing integral floating-point values to dest.
-
-round_ne rounds towards the nearest integral value. For halfway cases, it rounds to the nearest even value.
-
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| round_ne(src)| -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-
-Round_ni
-~~~~~~~~
-
-Floating-point round of the values in src,
-writing integral floating-point values to dest.
-
-round_ni rounds towards -INF, commonly known as floor().
-
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| round_ni(src)| -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-
-Round_pi
-~~~~~~~~
-
-Floating-point round of the values in src,
-writing integral floating-point values to dest.
-
-round_pi rounds towards +INF, commonly known as ceil().
-
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| round_pi(src)| -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-
-Round_z
-~~~~~~~
-
-Floating-point round of the values in src,
-writing integral floating-point values to dest.
-
-round_z rounds towards zero.
-
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| round_z(src) | -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-
-Rsqrt
-~~~~~
-
-Maximum relative error is 2^-21.
-
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| rsqrt(src)   | NaN  | NaN| -inf    |-inf|+inf| +inf    | +F | +0   | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-
-Saturate
-~~~~~~~~
-
-The Saturate instruction performs the following operation on its input value:
-
-min(1.0f, max(0.0f, value))
-
-where min() and max() in the above expression behave in the way Min and Max behave.
-
-Saturate(NaN) returns 0, by the rules for min and max.
-
-Sin
-~~~
-
-Theta values can be any IEEE 32-bit floating point values.
-
-The maximum absolute error is 0.0008 in the interval from -100*Pi to +100*Pi.
-
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-| sin(src) |  NaN | [-1 to +1] |      -0 | -0 | +0 |      +0 | [-1 to +1] |  NaN | NaN |
-+----------+------+------------+---------+----+----+---------+------------+------+-----+
-
-Sqrt
-~~~~
-
-Precision is 1 ulp.
-
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-| sqrt(src)    | NaN  | NaN| -0      | -0 | +0 | +0      | +F | +inf | NaN |
-+--------------+------+----+---------+----+----+---------+----+------+-----+
-
-StoreOutput
-~~~~~~~~~~~
-
-Stores the value to shader output
-
-Tan
-~~~
-
-Theta values can be any IEEE 32-bit floating point values.
-
-+----------+----------+----------------+---------+----+----+---------+----------------+------+-----+
-| src      | -inf     | -F             | -denorm | -0 | +0 | +denorm | +F             | +inf | NaN |
-+----------+----------+----------------+---------+----+----+---------+----------------+------+-----+
-| tan(src) | NaN      | [-inf to +inf] | -0      | -0 | +0 | +0      | [-inf to +inf] | NaN  | NaN |
-+----------+----------+----------------+---------+----+----+---------+----------------+------+-----+
-
-TempRegLoad
-~~~~~~~~~~~
-
-Helper load operation
-
-TempRegStore
-~~~~~~~~~~~~
-
-Helper store operation
-
-UAddc
-~~~~~
-
-dest0, dest1 = UAddc(src0, src1)
-
-unsigned add of 32-bit operands src0 and src1, placing the LSB part of the 32-bit result in dest0.
-dest1 is written with: 1 if a carry is produced, 0 otherwise. Dest1 can be NULL if the carry is not needed
-
-UDiv
-~~~~
-
-destQUOT, destREM = UDiv(src0, src1);
-
-unsigned divide of the 32-bit operand src0 by the 32-bit operand src1.
-
-The results of the divides are the 32-bit quotients (placed in destQUOT) and 32-bit remainders (placed in destREM).
-
-Divide by zero returns 0xffffffff for both quotient and remainder.
-
-Either destQUOT or destREM may be specified as NULL instead of specifying a register, in the case the quotient or remainder are not needed.
-
-Unsigned subtract of 32-bit operands src1 from src0, placing the LSB part of the 32-bit result in dest0.
-dest1 is written with: 1 if a borrow is produced, 0 otherwise. Dest1 can be NULL if the borrow is not needed
-
-UMad
-~~~~
-
-Unsigned integer multiply & add.
-
-Umad(a,b,c) = a * b + c
-
-UMax
-~~~~
-
-unsigned integer maximum. UMax(a,b) = a > b ? a : b
-
-UMin
-~~~~
-
-unsigned integer minimum. UMin(a,b) = a < b ? a : b
-
-UMul
-~~~~
-
-multiply of 32-bit operands src0 and src1 (note they are unsigned), producing the correct full 64-bit result.
-The low 32 bits are placed in destLO. The high 32 bits are placed in destHI.
-Either of destHI or destLO may be specified as NULL instead of specifying a register, in the case high or low 32 bits of the 64-bit result are not needed
-
-USubb
-~~~~~
-
-dest0, dest1 = USubb(src0, src1)
-
-Ubfe
-~~~~
-
-dest = ubfe(src0, src1, src2)
-
-Given a range of bits in a number, shift those bits to the LSB and set remaining bits to 0.
-
-width : The LSB 5 bits of src0 (0-31).
-
-offset: The LSB 5 bits of src1 (0-31).
-
-Given width, offset:
-
-.. code:: c
-
-    if( width == 0 )
-    {
-        dest = 0
-    }
-    else if( width + offset < 32 )
-    {
-        shl dest, src2, 32-(width+offset)
-        ushr dest, dest, 32-width
-    }
-    else
-    {
-        ushr dest, src2, offset
-    }
-
-.. OPCODES-RST:END
-
-
-Custom instructions
--------------------
-Instructions for third-party extensions will be specially-prefixed external function calls, identified by a declared extension-set-prefix. Additional metadata will be included to provide hints about uniformity, pure or const guarantees, alignment, etc.
-
-Validation Rules
-================
-
-The following rules are verified by the *Validator* component and thus can be relied upon by downstream consumers.
-
-The set of validation rules that are known to hold for a DXIL program is identifier by the 'dx.valver' named metadata node, which consists of a two-element tuple of constant int values, a major and minor version. Minor version numbers are increments as rules are added to a prior table or as the implementation fixes issues.
-
-.. <py::lines('VALRULES-RST')>hctdb_instrhelp.get_valrules_rst()</py>
-.. VALRULES-RST:BEGIN
-
-====================================== =======================================================================================================================================================================================================================================================================================================
-Rule Code                              Description
-====================================== =======================================================================================================================================================================================================================================================================================================
-BITCODE.VALID                          TODO - Module must be bitcode-valid
-CONTAINER.PARTINVALID                  DXIL Container must not contain unknown parts
-CONTAINER.PARTMATCHES                  DXIL Container Parts must match Module
-CONTAINER.PARTMISSING                  DXIL Container requires certain parts, corresponding to module
-CONTAINER.PARTREPEATED                 DXIL Container must have only one of each part type
-CONTAINER.ROOTSIGNATUREINCOMPATIBLE    Root Signature in DXIL Container must be compatible with shader
-DECL.DXILFNEXTERN                      External function must be a DXIL function
-DECL.DXILNSRESERVED                    The DXIL reserved prefixes must only be used by built-in functions and types
-DECL.FNFLATTENPARAM                    Function parameters must not use struct types
-DECL.FNISCALLED                        Functions can only be used by call instructions
-DECL.NOTUSEDEXTERNAL                   External declaration should not be used
-DECL.USEDEXTERNALFUNCTION              External function must be used
-DECL.USEDINTERNAL                      Internal declaration must be used
-FLOW.DEADLOOP                          Loop must have break
-FLOW.FUNCTIONCALL                      Function with parameter is not permitted
-FLOW.NORECUSION                        Recursion is not permitted
-FLOW.REDUCIBLE                         Execution flow must be reducible
-INSTR.ALLOWED                          Instructions must be of an allowed type
-INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function.
-INSTR.BARRIERMODEFORNONCS              sync in a non-Compute Shader must only sync UAV (sync_uglobal)
-INSTR.BARRIERMODENOMEMORY              sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory).  Only _t (thread group sync) is optional.
-INSTR.BARRIERMODEUSELESSUGROUP         sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal.
-INSTR.BUFFERUPDATECOUNTERONUAV         BufferUpdateCounter valid only on UAV
-INSTR.CALLOLOAD                        Call to DXIL intrinsic must match overload signature
-INSTR.CANNOTPULLPOSITION               pull-model evaluation of position disallowed
-INSTR.CBUFFERCLASSFORCBUFFERHANDLE     Expect Cbuffer for CBufferLoad handle
-INSTR.CBUFFEROUTOFBOUND                Cbuffer access out of bound
-INSTR.COORDINATECOUNTFORRAWTYPEDBUF    raw/typed buffer don't need 2 coordinates
-INSTR.COORDINATECOUNTFORSTRUCTBUF      structured buffer require 2 coordinates
-INSTR.DXILSTRUCTUSER                   Dxil struct types should only used by ExtractValue
-INSTR.DXILSTRUCTUSEROUTOFBOUND         Index out of bound when extract value from dxil struct types
-INSTR.EVALINTERPOLATIONMODE            Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample
-INSTR.EXTRACTVALUE                     ExtractValue should only be used on dxil struct types and cmpxchg
-INSTR.FAILTORESLOVETGSMPOINTER         TGSM pointers must originate from an unambiguous TGSM global variable.
-INSTR.HANDLENOTFROMCREATEHANDLE        Resource handle should returned by createHandle
-INSTR.IMMBIASFORSAMPLEB                bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate
-INSTR.INBOUNDSACCESS                   Access to out-of-bounds memory is disallowed
-INSTR.MINPRECISIONNOTPRECISE           Instructions marked precise may not refer to minprecision values
-INSTR.MINPRECISONBITCAST               Bitcast on minprecison types is not allowed
-INSTR.MIPLEVELFORGETDIMENSION          Use mip level on buffer when GetDimensions
-INSTR.MIPONUAVLOAD                     uav load don't support mipLevel/sampleIndex
-INSTR.NOGENERICPTRADDRSPACECAST        Address space cast between pointer types must have one part to be generic address space
-INSTR.NOIDIVBYZERO                     No signed integer division by zero
-INSTR.NOINDEFINITEACOS                 No indefinite arccosine
-INSTR.NOINDEFINITEASIN                 No indefinite arcsine
-INSTR.NOINDEFINITEDSXY                 No indefinite derivative calculation
-INSTR.NOINDEFINITELOG                  No indefinite logarithm
-INSTR.NOREADINGUNINITIALIZED           Instructions should not read uninitialized value
-INSTR.NOUDIVBYZERO                     No unsigned integer division by zero
-INSTR.OFFSETONUAVLOAD                  uav load don't support offset
-INSTR.OLOAD                            DXIL intrinsic overload must be valid
-INSTR.ONLYONEALLOCCONSUME              RWStructuredBuffers may increment or decrement their counters, but not both.
-INSTR.OPCODERESERVED                   Instructions must not reference reserved opcodes
-INSTR.OPCONST                          DXIL intrinsic requires an immediate constant operand
-INSTR.OPCONSTRANGE                     Constant values must be in-range for operation
-INSTR.OPERANDRANGE                     DXIL intrinsic operand must be within defined range
-INSTR.PTRBITCAST                       Pointer type bitcast must be have same size
-INSTR.RESOURCECLASSFORLOAD             load can only run on UAV/SRV resource
-INSTR.RESOURCECLASSFORSAMPLERGATHER    sample, lod and gather should on srv resource.
-INSTR.RESOURCECLASSFORUAVSTORE         store should on uav resource.
-INSTR.RESOURCECOORDINATEMISS           coord uninitialized
-INSTR.RESOURCECOORDINATETOOMANY        out of bound coord must be undef
-INSTR.RESOURCEKINDFORBUFFERLOADSTORE   buffer load/store only works on Raw/Typed/StructuredBuffer
-INSTR.RESOURCEKINDFORCALCLOD           lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray
-INSTR.RESOURCEKINDFORGATHER            gather requires resource declared as texture/2D/Cube/2DArray/CubeArray
-INSTR.RESOURCEKINDFORGETDIM            Invalid resource kind on GetDimensions
-INSTR.RESOURCEKINDFORSAMPLE            sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray
-INSTR.RESOURCEKINDFORSAMPLEC           samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray
-INSTR.RESOURCEKINDFORTEXTURELOAD       texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray
-INSTR.RESOURCEKINDFORTEXTURESTORE      texture store only works on Texture1D/1DArray/2D/2DArray/3D
-INSTR.RESOURCEOFFSETMISS               offset uninitialized
-INSTR.RESOURCEOFFSETTOOMANY            out of bound offset must be undef
-INSTR.SAMPLECOMPTYPE                   sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT.
-INSTR.SAMPLEINDEXFORLOAD2DMS           load on Texture2DMS/2DMSArray require sampleIndex
-INSTR.SAMPLERMODEFORLOD                lod instruction requires sampler declared in default mode
-INSTR.SAMPLERMODEFORSAMPLE             sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode
-INSTR.SAMPLERMODEFORSAMPLEC            sample_c_*/gather_c instructions require sampler declared in comparison mode
-INSTR.STRUCTBITCAST                    Bitcast on struct types is not allowed
-INSTR.TEXTUREOFFSET                    offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7
-INSTR.TGSMRACECOND                     Race condition writing to shared memory detected, consider making this write conditional
-INSTR.UNDEFRESULTFORGETDIMENSION       GetDimensions used undef dimension %0 on %1
-INSTR.WRITEMASKFORTYPEDUAVSTORE        store on typed uav must write to all four components of the UAV
-INSTR.WRITEMASKMATCHVALUEFORUAVSTORE   uav store write mask must match store value mask, write mask is %0 and store value mask is %1
-META.BARYCENTRICSFLOAT3                only 'float3' type is allowed for SV_Barycentrics.
-META.BARYCENTRICSINTERPOLATION         SV_Barycentrics cannot be used with 'nointerpolation' type
-META.BARYCENTRICSTWOPERSPECTIVES       There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode.
-META.BRANCHFLATTEN                     Can't use branch and flatten attributes together
-META.CLIPCULLMAXCOMPONENTS             Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components
-META.CLIPCULLMAXROWS                   Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows.
-META.CONTROLFLOWHINTNOTONCONTROLFLOW   Control flow hint only works on control flow inst
-META.DENSERESIDS                       Resource identifiers must be zero-based and dense
-META.DUPLICATESYSVALUE                 System value may only appear once in signature
-META.ENTRYFUNCTION                     entrypoint not found
-META.FLAGSUSAGE                        Flags must match usage
-META.FORCECASEONSWITCH                 Attribute forcecase only works for switch
-META.FUNCTIONANNOTATION                Cannot find function annotation for %0
-META.GLCNOTONAPPENDCONSUME             globallycoherent cannot be used with append/consume buffers
-META.INTEGERINTERPMODE                 Interpolation mode on integer must be Constant
-META.INTERPMODEINONEROW                Interpolation mode must be identical for all elements packed into the same row.
-META.INTERPMODEVALID                   Interpolation mode must be valid
-META.INVALIDCONTROLFLOWHINT            Invalid control flow hint
-META.KNOWN                             Named metadata should be known
-META.MAXTESSFACTOR                     Hull Shader MaxTessFactor must be [%0..%1].  %2 specified
-META.NOSEMANTICOVERLAP                 Semantics must not overlap
-META.REQUIRED                          TODO - Required metadata missing
-META.SEMAKINDMATCHESNAME               Semantic name must match system value, when defined.
-META.SEMAKINDVALID                     Semantic kind must be valid
-META.SEMANTICCOMPTYPE                  %0 must be %1
-META.SEMANTICINDEXMAX                  System value semantics have a maximum valid semantic index
-META.SEMANTICLEN                       Semantic length must be at least 1 and at most 64
-META.SEMANTICSHOULDBEALLOCATED         Semantic should have a valid packing location
-META.SEMANTICSHOULDNOTBEALLOCATED      Semantic should have a packing location of -1
-META.SIGNATURECOMPTYPE                 signature %0 specifies unrecognized or invalid component type
-META.SIGNATUREILLEGALCOMPONENTORDER    Component ordering for packed elements must be: arbitrary < system value < system generated value
-META.SIGNATUREINDEXCONFLICT            Only elements with compatible indexing rules may be packed together
-META.SIGNATUREOUTOFRANGE               Signature elements must fit within maximum signature size
-META.SIGNATUREOVERLAP                  Signature elements may not overlap in packing location.
-META.STRUCTBUFALIGNMENT                StructuredBuffer stride not aligned
-META.STRUCTBUFALIGNMENTOUTOFBOUND      StructuredBuffer stride out of bounds
-META.SYSTEMVALUEROWS                   System value may only have 1 row
-META.TARGET                            Target triple must be 'dxil-ms-dx'
-META.TESSELLATOROUTPUTPRIMITIVE        Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW.
-META.TESSELLATORPARTITION              Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even.
-META.TEXTURETYPE                       elements of typed buffers and textures must fit in four 32-bit quantities
-META.USED                              All metadata must be used by dxil
-META.VALIDSAMPLERMODE                  Invalid sampler mode on sampler
-META.VALUERANGE                        Metadata value must be within range
-META.WELLFORMED                        TODO - Metadata must be well-formed in operand count and types
-SM.APPENDANDCONSUMEONSAMEUAV           BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1.
-SM.CBUFFERELEMENTOVERFLOW              CBuffer elements must not overflow
-SM.CBUFFEROFFSETOVERLAP                CBuffer offsets must not overlap
-SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT     D3D12 constant/texture buffer template element can only be a struct
-SM.COMPLETEPOSITION                    Not all elements of SV_Position were written
-SM.COUNTERONLYONSTRUCTBUF              BufferUpdateCounter valid only on structured buffers
-SM.CSNORETURN                          Compute shaders can't return values, outputs must be written in writable resources (UAVs).
-SM.DOMAINLOCATIONIDXOOB                DomainLocation component index out of bounds for the domain.
-SM.DSINPUTCONTROLPOINTCOUNTRANGE       DS input control point count must be [0..%0].  %1 specified
-SM.DXILVERSION                         Target shader model requires specific Dxil Version
-SM.GSINSTANCECOUNTRANGE                GS instance count must be [1..%0].  %1 specified
-SM.GSOUTPUTVERTEXCOUNTRANGE            GS output vertex count must be [0..%0].  %1 specified
-SM.GSTOTALOUTPUTVERTEXDATARANGE        Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3
-SM.GSVALIDINPUTPRIMITIVE               GS input primitive unrecognized
-SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY      GS output primitive topology unrecognized
-SM.HSINPUTCONTROLPOINTCOUNTRANGE       HS input control point count must be [0..%0].  %1 specified
-SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH  For pass thru hull shader, input control point count must match output control point count
-SM.INSIDETESSFACTORSIZEMATCHDOMAIN     InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
-SM.INVALIDRESOURCECOMPTYPE             Invalid resource return type
-SM.INVALIDRESOURCEKIND                 Invalid resources kind
-SM.INVALIDTEXTUREKINDONUAV             Texture2DMS[Array] or TextureCube[Array] resources are not supported with UAVs
-SM.ISOLINEOUTPUTPRIMITIVEMISMATCH      Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain.
-SM.MAXTGSMSIZE                         Total Thread Group Shared Memory storage is %0, exceeded %1
-SM.MAXTHEADGROUP                       Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1
-SM.MULTISTREAMMUSTBEPOINT              When multiple GS output streams are used they must be pointlists
-SM.NAME                                Target shader model name must be known
-SM.NOINTERPMODE                        Interpolation mode must be undefined for VS input/PS output/patch constant.
-SM.NOPSOUTPUTIDX                       Pixel shader output registers are not indexable.
-SM.OPCODE                              Opcode must be defined in target shader model
-SM.OPCODEININVALIDFUNCTION             Invalid DXIL opcode usage like StorePatchConstant in patch constant function
-SM.OPERAND                             Operand must be defined in target shader model
-SM.OUTPUTCONTROLPOINTCOUNTRANGE        output control point count must be [0..%0].  %1 specified
-SM.OUTPUTCONTROLPOINTSTOTALSCALARS     Total number of scalars across all HS output control points must not exceed
-SM.PATCHCONSTANTONLYFORHSDS            patch constant signature only valid in HS and DS
-SM.PSCONSISTENTINTERP                  Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample)
-SM.PSCOVERAGEANDINNERCOVERAGE          InnerCoverage and Coverage are mutually exclusive.
-SM.PSMULTIPLEDEPTHSEMANTIC             Pixel Shader only allows one type of depth semantic to be declared
-SM.PSOUTPUTSEMANTIC                    Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found
-SM.PSTARGETCOL0                        SV_Target packed location must start at column 0
-SM.PSTARGETINDEXMATCHESROW             SV_Target semantic index must match packed row location
-SM.RESOURCERANGEOVERLAP                Resource ranges must not overlap
-SM.ROVONLYINPS                         RasterizerOrdered objects are only allowed in 5.0+ pixel shaders
-SM.SAMPLECOUNTONLYON2DMS               Only Texture2DMS/2DMSArray could has sample count
-SM.SEMANTIC                            Semantic must be defined in target shader model
-SM.STREAMINDEXRANGE                    Stream index (%0) must between 0 and %1
-SM.TESSFACTORFORDOMAIN                 Required TessFactor for domain not found declared anywhere in Patch Constant data
-SM.TESSFACTORSIZEMATCHDOMAIN           TessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
-SM.THREADGROUPCHANNELRANGE             Declared Thread Group %0 size %1 outside valid range [%2..%3]
-SM.TRIOUTPUTPRIMITIVEMISMATCH          Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain
-SM.UNDEFINEDOUTPUT                     Not all elements of output %0 were written
-SM.VALIDDOMAIN                         Invalid Tessellator Domain specified. Must be isoline, tri or quad
-SM.VIEWIDNEEDSSLOT                     ViewID requires compatible space in pixel shader input signature
-SM.ZEROHSINPUTCONTROLPOINTWITHINPUT    When HS input control point count is 0, no input signature should exist
-TYPES.DEFINED                          Type must be defined based on DXIL primitives
-TYPES.I8                               I8 can only used as immediate value for intrinsic
-TYPES.INTWIDTH                         Int type must be of valid width
-TYPES.NOMULTIDIM                       Only one dimension allowed for array type
-TYPES.NOVECTOR                         Vector types must not be present
-UNI.NOWAVESENSITIVEGRADIENT            Gradient operations are not affected by wave-sensitive data or control flow.
-====================================== =======================================================================================================================================================================================================================================================================================================
-
-.. VALRULES-RST:END
-
-
-Modules and Linking
-===================
-
-HLSL has linking capabilities to enable third-party libraries. The linking step happens before shader DXIL is given to the driver compilers.
-Experimental library generation is added in DXIL1.1. A library could be created by compile with lib_6_1 profile.
-A library is a dxil container like the compile result of other shader profiles. The difference is library will keep information for linking like resource link info and entry function signatures.
-Library support is not part of DXIL spec. Only requirement is linked shader must be valid DXIL.
-
-
-Additional Notes
-================
-
-These additional notes are not normative for DXIL, and are included for the convenience of implementers.
-
-Other Versioned Components
---------------------------
-
-In addition to shader model, DXIL and bitcode representation versions, two other interesting versioned components are discussed: the supporting operating system and runtime, and the HLSL language.
-
-Support is provided in the Microsoft Windows family of operating systems, when running on the D3D12 runtime.
-
-The HLSL language is versioned independently of DXIL, and currently follows an 'HLSL <year>' naming scheme. HLSL 2015 is the dialect supported by the d3dcompiler_47 library; a limited form of support is provided in the open source HLSL on LLVM project. HLSL 2016 is the version supported by the current HLSL on LLVM project, which removes some features (primarily effect framework syntax, backquote operator) and adds new ones (wave intrinsics and basic i64 support).
-
-.. _dxil_container_format:
-
-DXIL Container Format
----------------------
-
-DXIL is typically encapsulated in a DXIL container. A DXIL container is composed of a header, a sequence of part lengths, and a sequence of parts.
-
-The following C declaration describes this structure::
-
-  struct DxilContainerHeader {
-    uint32_t  HeaderFourCC;
-    uint8_t   Digest[DxilContainerHashSize];
-    uint16_t  MajorVersion;
-    uint16_t  MinorVersion;
-    uint32_t  ContainerSizeInBytes; // From start of this header
-    uint32_t  PartCount;
-    // Structure is followed by uint32_t PartOffset[PartCount];
-    // The offset is to a DxilPartHeader.
-  };
-
-Each part has a standard header, followed by a part-specify body::
-
-  struct DxilPartHeader {
-    uint32_t  PartFourCC; // Four char code for part type.
-    uint32_t  PartSize;   // Byte count for PartData.
-    // Structure is followed by uint8_t PartData[PartSize].
-  };
-
-The DXIL program is found in a part with the following body::
-
-  struct DxilProgramHeader {
-    uint32_t          ProgramVersion;   /// Major and minor version of shader, including type.
-    uint32_t          SizeInUint32;     /// Size in uint32_t units including this header.
-    uint32_t DxilMagic;       // 0x4C495844, ASCII "DXIL".
-    uint32_t DxilVersion;     // DXIL version.
-    uint32_t BitcodeOffset;   // Offset to LLVM bitcode (from DxilMagic).
-    uint32_t BitcodeSize;     // Size of LLVM bitcode.
-    // Followed by uint8_t[BitcodeHeader.BitcodeSize] after possible gap from BitcodeOffset
-  };
-
-The bitcode payload is defined as per bitcode encoding.
-
-Future Directions
------------------
-
-This section provides background on future directions for DXIL that may or may not materialize. They imply a new version of DXIL.
-
-It's desirable to support generic pointers, pointing to one of other kinds of pointers. If the compiler fails to disambiguate, memory access is done via a generic pointer; the HLSL compiler will warn the user about each access that it cannot disambiguate. Not supported for SM6.
-
-HLSL will eventually support more primitive types such as i8, i16, i32, i64, half, float, double, as well as declspec(align(n)) and #pragma pack(n) directives. SM6.0 will eventually require byte-granularity access support in hardware, especially writes. Not supported for SM6.
-
-There will be a Requires32BitAlignedAccesses CAP flag. If absent, this would indicate that the shader requires writes that (1) do not write full four bytes, or (2) are not aligned on four-byte boundary. If hardware does not natively support these, the shader is rejected. Programmers can work around this hardware limitation by manually aligning smaller data on four-byte boundary in HLSL.
-
-When libraries are supported as first-class DXIL constructs, "lib_*" shader models can specify more than one entry point per module; the other shader models must specify exactly one entry point.
-
-The target machine specification for HLSL might specify a 64-bit pointer side with 64-bit offsets.
-
-Hardware support for generic pointer is essential for HLSL next as a fallback mechanism for cases when compiler cannot disambiguate pointer's address space.
-
-Future DXIL will change how half and i16 are treated:
-* i16 will have to be supported natively either in hardware or via emulation,
-* half's behavior will depend on the value of RequiresHardwareHalf CAP; if it's not set, half can be treated as min-precision type (min16float); i.e., computation may be done with values implicitly promoted to floats; if it's set and hardware does not support half type natively, the driver compiler can either emulate exact IEEE half behavior or fail shader creation.
-
-Pending Specification Work
-==========================
-
-The following work on this specification is still pending:
-
-* Consider moving some additional tables and lists into hctdb and cross-reference.
-* Complete the extended documentation for instructions.
-
+=============================
+DirectX Intermediate Language
+=============================
+
+.. contents::
+   :local:
+   :depth: 2
+
+Introduction
+============
+
+This document presents the design of the DirectX Intermediate Language (DXIL) for GPU shaders. DXIL is intended to support a direct mapping of the HLSL programming language into Low-Level Virtual Machine Intermediate Representation (LLVM IR), suitable for consumption in GPU drivers. This version of the specification is based on LLVM 3.7 in the use of metadata syntax.
+
+We distinguish between DXIL, which is a low-level IR for GPU driver compilers, and DXIR, which is a high-level IR, more suitable for emission by IR producers, such as Clang. DXIR is transformed to DXIL by the optimizer. DXIR accepts high-level constructs, such as user-defined types, multi-dimensional arrays, matrices, and vectors. These, however, are not suitable for fast JIT-ing in the driver compilers, and so are lowered by the optimizer, such that DXIL works on simpler abstractions. Both DXIL and DXIR are derived from LLVM IR. This document does not describe DXIR.
+
+LLVM is quickly becoming a de facto standard in modern compilation technology. The LLVM framework offers several distinct features, such as a vibrant ecosystem, complete compilation framework, modular design, and reasonable documentation. We can leverage these to achieve two important objectives.
+
+First, unification of the shader compilation tool chain. DXIL is a contract between IR producers, such as compilers for HLSL and other domain-specific languages, and IR consumers, such as IHV driver JIT compilers or the offline XBOX shader compiler. In addition, the design provides for conversion of the current HLSL IL, called DXBC IL in this document, to DXIL.
+
+Second, leveraging the LLVM ecosystem. Microsoft will publicly document DXIL and DXIR to attract domain language implementers and spur innovation. Using LLVM-based IR offers reduced entry costs for small teams, simply because small teams are likely to use LLVM and Clang as their main compilation framework. We will provide a DXIL verifier to check the consistency of generated DXIL.
+
+The following diagram shows how some of these components tie together::
+
+  HLSL   Other shading langs  DSL          DXBC IL
+  +      +                    +            +
+  |      |                    |            |
+  v      v                    v            v
+  Clang  Clang                Other Tools  dxbc2dxil
+  +      +                    +            +
+  |      |                    |            |
+  v      v                    v            |
+  +------+--------------------+---------+  |
+  |          High level IR (DXIR)       |  |
+  +-------------------------------------+  |
+                    |                      |
+                    |                      |
+                    v                      |
+                Optimizer <-----+ Linker   |
+                +      ^             +     |
+                |      |             |     |
+                |      |             |     |
+   +------------v------+-------------v-----v-------+
+   |              Low level IR (DXIL)              |
+   +------------+----------------------+-----------+
+                |                      |
+                v                      v
+        Driver Compiler             Verifier
+
+The *dxbc2dxil* element in the diagram is a component that converts existing DXBC shader byte code into DXIL. The *Optimizer* element is a component that consumes DXIR, verifies it is valid, optimizes it, and produces a valid DXIL form. The *Verifier* element is a public component that verifies and signs DXIL. The *Linker* is a component that combines precompiled DXIL libraries with the entry function to produce a valid shader.
+
+DXIL does not support the following HLSL features that were present in prior implementations.
+
+* Shader models 9 and below. Microsoft may implement 10level9 shader models via DXIL capability tiers.
+* Effects.
+* HLSL interfaces.
+* Shader compression/decompression.
+* Partial precision. Half data type should be used instead.
+* min10float type. Half data type should be used instead.
+* HLSL *uniform* parameter qualifier.
+* Current fxc legacy compatibility mode for old shader models (e.g., c-register binding).
+* PDB. Debug Information annotations are used instead.
+* Compute shader model cs_4_0.
+* DXBC label, call, fcall constructs.
+
+The following principles are used to ease reuse with LLVM components and aid extensibility.
+
+* DXIL uses a subset of LLVM IR constructs that makes sense for HLSL.
+* No modifications to the core LLVM IR; i.e., no new instructions or fundamental types.
+* Additional information is conveyed via metadata, LLVM intrinsics or external functions.
+* Name prefixes: 'llvm.dx.', 'llvm.dxil.', 'llvm.dxir.', 'dx.', 'dxil.', and 'dxir.' are reserved.
+
+LLVM IR has three equivalent forms: human-readable, binary (bitcode), and in-memory. DXIL is a binary format and is based on a subset of LLVM IR bitcode format. The document uses only human-readable form to describe DXIL.
+
+Versioning
+==========
+
+There are three versioning mechanisms in DXIL shaders: shader model, DXIL version, and LLVM bitcode version.
+
+At a high-level, the shader model describes the target execution model and environment; DXIL provides a mechanism to express programs (including rules around expressing data types and operations); and LLVM bitcode provides a way to encode a DXIL program.
+
+Shader Model
+------------
+
+The shader model in DXIL is similar to DXBC shader model. A shader model specifies the execution model, the set of capabilities that shader instructions can use and the constraints that a shader program must adhere to.
+
+The shader model is specified as a named metadata in DXIL::
+
+  !dx.shaderModel = !{ !0 }
+  !0 = !{ !"<shaderModelName>", i32 <major>, i32 <minor> }
+
+The following values of <shaderModelName>_<major>_<minor> are supported:
+
+==================== ===================================== ===========
+Target               Legacy Models                         DXIL Models
+==================== ===================================== ===========
+Vertex shader (VS)   vs_4_0, vs_4_1, vs_5_0, vs_5_1        vs_6_0
+Hull shader (HS)     hs_5_0, hs_5_1                        hs_6_0
+Domain shader (DS)   ds_5_0, ds_5_1                        ds_6_0
+Geometry shader (GS) gs_4_0, gs_4_1, gs_5_0, gs_5_1        gs_6_0
+Pixel shader (PS)    ps_4_0, ps_4_1, ps_5_0, ps_5_1        ps_6_0
+Compute shader (CS)  cs_5_0 (cs_4_0 is mapped onto cs_5_0) cs_6_0
+Shader library       no support                            lib_6_1
+==================== ===================================== ===========
+
+The DXIL verifier ensures that DXIL conforms to the specified shader model.
+
+For shader models prior to 6.0, only the rules applicable to the DXIL representation are valid. For example, the limit on the maximum number of resources is honored, but the limits on registers aren't because DXIL does not have a representation for registers.
+
+DXIL version
+------------
+
+The primary mechanism to evolve HLSL capabilities is through shader models. However, DXIL version is reserved for additional flexibility of future extensions. There are two currently defined versions: 1.0 and 1.1.
+
+DXIL version has major and minor versions that are specified as named metadata::
+
+  !dx.version = !{ !0 }
+  !0 = !{ i32 <major>, i32 <minor> }
+
+DXIL version must be declared exactly once per LLVM module (translation unit) and is valid for the entire module.
+
+DXIL will evolve in a manner that retains backward compatibility.
+
+LLVM Bitcode version
+--------------------
+
+The current version of DXIL is based on LLVM bitcode v3.7. This encoding is necessarily implied by something outside the DXIL module.
+
+General Issues
+==============
+
+An important goal is to enable HLSL to be closer to a strict subset of C/C++. This has implications for DXIL design and future hardware feature requests outlined below.
+
+Terminology
+-----------
+Resource refers to one of the following:
+
+* SRV - shader resource view (read-only)
+* UAV - unordered access view (read-write)
+* CBV - constant buffer view (read-only)
+* Sampler
+
+Intrinsics typically refer to operations missing in the core LLVM IR. DXIL represents HLSL built-in functions (also called intrinsics) not as LLVM intrinsics, but rather as external function calls.
+
+
+DXIL abstraction level
+----------------------
+
+DXIL has a level of abstraction similar to a 'scalarized' DXBC. DXIL is a lower-level IR than the DXIR emitted by the front-end, so that it is amenable to fast and robust JIT-ing in driver compilers.
+
+In particular, the following passes are performed to lower the HLSL/DXIR abstractions down to DXIL:
+
+* optimize function parameter copies
+* inline functions
+* allocate and transform shader signatures
+* lower matrices, optimizing intermediate storage
+* linearize multi-dimensional arrays and user-defined type accesses
+* scalarize vectors
+
+Scalar IR
+---------
+DXIL operations work with scalar quantities. Several scalar quantities may be grouped together in a struct to represent several return values, which is used for memory operations, e.g., load/store, sample, etc., that benefit from access coalescing.
+
+Metadata, resource declarations, and debugging info may contain vectors to more closely convey source code shape to tools and debuggers.
+
+Future versions of IR may contain vectors or grouping hints for less-than-32-bit quantities, such as half and i16.
+
+Memory accesses
+---------------
+
+DXIL conceptually aligns with DXBC in how different memory types are accessed. Out-of-bounds behavior and various restrictions are preserved.
+
+Indexable thread-local and groupshared variables are represented as variables and accessed via LLVM C-like pointers.
+
+Swizzled resources, such as textures, have opaque memory layouts from a DXIL point of view. Accesses to these resources are done via intrinsics.
+
+There are two layouts for constant buffer memory: (1) legacy, matching DXBC's layout and (2) linear layout. SM6 DXIL uses intrinsics to read cbuffer for either layout.
+
+Shader signatures require packing and are located in a special type of memory that cannot be viewed as linear. Accesses to signature values are done via special intrinsics in DXIL. If a signature parameter needs to be passed to a function, a copy is created first in threadlocal memory and the copy is passed to the function.
+
+Typed buffers represent memory with in-flight data conversion. Typed buffer load/store/atomics are done via special functions in DXIL with element-granularity indexing.
+
+The following pointer types are supported:
+
+* Non-indexable thread-local variables.
+* Indexable thread-local variables (DXBC x-registers).
+* Groupshared variables (DXBC g-registers).
+* Device memory pointer.
+* Constant-buffer-like memory pointer.
+
+The type of DXIL pointer is differentiated by LLVM addrspace construct. The HLSL compiler will make the best effort to infer the exact pointer addrspace such that a driver compiler can issue the most efficient instruction.
+
+A pointer can come into being in a number of ways:
+
+* Global Variables.
+* AllocaInst.
+* Synthesized as a result of some pointer arithmetic.
+
+DXIL uses 32-bit pointers in its representation.
+
+Out-of-bounds behavior
+----------------------
+
+Indexable thread-local accesses are done via LLVM pointer and have C-like OOB semantics.
+Groupshared accesses are done via LLVM pointer too. The origin of a groupshared pointer must be a single TGSM allocation.
+If a groupshared pointer uses an in-bounds GEP instruction, it should not go out of bounds. The behavior of an OOB access through an in-bounds pointer is undefined.
+For a groupshared pointer from a regular GEP, OOB accesses have the same behavior as in DXBC. Loads return 0 for OOB accesses; OOB stores are silently dropped.
+
+Resource accesses keep the same out-of-bounds behavior as DXBC. Loads return 0 for OOB accesses; OOB stores are silently dropped.
+
+OOB pointer accesses in SM6.0 and later have undefined (C-like) behavior. LLVM memory optimization passes can be used to optimize such accesses. Where out-of-bound behavior is desired, intrinsic functions are used to access memory.
+
+Memory access granularity
+-------------------------
+
+Intrinsic and resource accesses may imply a wider access than requested by an instruction. DXIL defines memory accesses for i1, i16, i32, i64, f16, f32, f64 on thread local memory, and i32, f32, f64 for memory I/O (that is, groupshared memory and memory accessed via resources such as CBs, UAVs and SRVs).
+
+
+Number of virtual values
+------------------------
+
+There is no limit on the number of virtual values in DXIL. The IR is guaranteed to be in an SSA form. For optimized shaders, the optimizer will run -mem2reg LLVM pass as well as perform other memory to register promotions if profitable.
+
+Control-flow restrictions
+-------------------------
+
+The DXIL control-flow graph must be reducible, as checked by T1-T2 test. DXIL does not preserve structured control flow of DXBC. Preserving structured control-flow property would impose significant burden on third-party tools optimizing to DXIL via LLVM, reducing appeal of DXIL.
+
+DXIL allows fall-through for switch label blocks. This is a difference from DXBC, in which the fall-through is prohibited.
+
+DXIL will not support the DXBC label and call instructions; LLVM functions can be used instead (see below). The primary uses for these are (1) HLSL interfaces, which are not supported, and (2) outlining of case-bodies in a switch statement annotated with [call], which is not a scenario of interest.
+
+Functions
+---------
+
+Instead of DXBC labels/calls, DXIL supports functions and call instructions. Recursion is not allowed; DXIL validator enforces this.
+
+The functions are regular LLVM functions. Parameters can be passed by-value or by-reference. The functions are to facilitate separate compilation for big, complex shaders. However, driver compilers are free to inline functions as they see fit.
+
+Identifiers
+-----------
+
+DXIL identifiers must conform to LLVM IR identifier rules.
+
+Identifier mangling rules are the ones used by Clang 3.7 with the HLSL target.
+
+The following identifier prefixes are reserved:
+
+* dx.*, dxil.*, dxir.*
+* llvm.dx.*, llvm.dxil.*, llvm.dxir.*
+
+Address Width
+-------------
+
+DXIL will use only 32-bit addresses for pointers. Byte offsets are also 32-bit.
+
+Shader restrictions
+-------------------
+
+There is no support for the following in DXIL:
+
+* recursion
+* exceptions
+* indirect function calls and dynamic dispatch
+
+Entry points
+------------
+
+The dx.entryPoints metadata specifies a list of entry point records, one for each entry point. Libraries could specify more than one entry point per module but currently exist outside the DXIL specification; the other shader models must specify exactly one entry point.
+
+For example::
+
+ define void @"\01?myfunc1@@YAXXZ"() #0 { ... }
+ define float @"\01?myfunc2@@YAMXZ"() #0 { ... }
+
+ !dx.entryPoints = !{ !1, !2 }
+
+ !1 = !{ void  ()* @"\01?myfunc1@@YAXXZ", !"myfunc1", !3, null, null }
+ !2 = !{ float ()* @"\01?myfunc2@@YAMXZ", !"myfunc2", !5, !6, !7 }
+
+Each entry point metadata record specifies:
+
+* reference to the entry point function global symbol
+* unmangled name
+* list of signatures
+* list of resources
+* list of tag-value pairs of shader capabilities and other properties
+
+A 'null' value specifies absence of a particular node.
+
+Shader capabilities are properties that are additional to properties dictated by shader model. The list is organized as pairs of i32 tag, followed immediately by the value itself.
+
+Hull shader representation
+--------------------------
+
+The hull shader is represented as two functions, related via metadata: (1) control point phase function, which is the entry point of the hull shader, and (2) patch constant phase function.
+
+For example::
+
+ !dx.entryPoints = !{ !1 }
+ !1 = !{ void ()* @"ControlPointFunc", ..., !2 }  ; shader entry record
+ !2 = !{ !"HS", !3 }
+ !3 = !{ void ()* @"PatchConstFunc", ... }        ; additional hull shader state
+
+The patch constant function represents original HLSL computation, and is not separated into fork and join phases, as is the case in DXBC. The driver compiler may perform such separation if this is profitable for the target GPU.
+
+In DXBC-to-DXIL conversion, the original patch constant function cannot be recovered. Instead, the instructions of each fork and join phase are 'wrapped' by a loop that iterates the corresponding number of phase-instance-count iterations. Thus, fork/join instance ID becomes the loop induction variable. LoadPatchConstant intrinsic (see below) represents load from DXBC vpc register.
+
+The following table summarizes the names of intrinsic functions to load inputs and store outputs of hull and domain shaders. CP stands for Control Point, PC - for Patch Constant.
+
+=================== ==================== ====================== ======================
+Operation           Control Point (Hull) Patch Constant         Domain
+=================== ==================== ====================== ======================
+Store Input CP
+Load Input CP       LoadInput            LoadInput
+Store Output CP     StoreOutput
+Load Output CP                           LoadOutputControlPoint LoadInput
+Store PC                                 StorePatchConstant
+Load PC                                  LoadPatchConstant      LoadPatchConstant
+Store Output Vertex                                             StoreOutput
+=================== ==================== ====================== ======================
+
+LoadPatchConstant function in PC stage is generated only by DXBC-to-DXIL converter, to access DXBC vpc registers. HLSL compiler produces IR that references LLVM IR values directly.
+
+Type System
+===========
+
+Most of LLVM type system constructs are legal in DXIL.
+
+Primitive Types
+---------------
+
+The following types are supported:
+
+* void
+* metadata
+* i1, i8, i16, i32, i64
+* half, float, double
+
+SM6.0 assumes native hardware support for i32 and float types.
+
+i8 is supported only in a few intrinsics to signify masks, enumeration constant values, or in metadata. It's not supported for memory access or computation by the shader.
+
+HLSL min12int, min16int and min16uint data types are mapped to i16.
+
+half and i16 are treated as corresponding DXBC min-precision types (min16float, min16int/min16uint) in SM6.0.
+
+The HLSL compiler optimizer treats half, i16 and i8 data as data types natively supported by the hardware; i.e., saturation, range clipping, INF/NaN are done according to the IEEE standard. Such semantics allow the optimizer to reuse LLVM optimization passes.
+
+Hardware support for doubles is optional and is guarded by RequiresHardwareDouble CAP bit.
+
+Hardware support for i64 is optional and is guarded by a CAP bit.
+
+Vectors
+-------
+
+HLSL vectors are scalarized. They do not participate in computation; however, they may be present in declarations to convey original variable layout to tools, debuggers, and reflection.
+
+Future DXIL may add support for <2 x half> and <2 x i16> vectors or hints for packing related half and i16 quantities.
+
+Matrices
+--------
+
+Matrices are lowered to vectors, and are not referenced by instructions. They may be present in declarations to convey original variable layout to tools, debuggers, and reflection.
+
+Arrays
+------
+
+Instructions may reference only 1D arrays of primitive types. However, complex arrays, e.g., multidimensional arrays or user-defined types, may be present to convey original variable layout to tools, debuggers, and reflection.
+
+User-defined types
+------------------
+
+Original HLSL UDTs are lowered and are not referenced by instructions. However, they may be present in declarations to convey original variable layout to tools, debuggers, and reflection. Some resource operations return 'grouping' UDTs that group several return values; such UDTs are immediately 'decomposed' into components that are then consumed by other instructions.
+
+Type conversions
+----------------
+
+Explicit conversions between types are supported via LLVM instructions.
+
+Precise qualifier
+-----------------
+
+By default, all floating-point HLSL operations are considered 'fast' or non-precise. HLSL and driver compilers are allowed to refactor such operations. Non-precise LLVM instructions: fadd, fsub, fmul, fdiv, frem, fcmp are marked with 'fast' math flags.
+
+HLSL precise type qualifier requires that all operations contributing to the value be IEEE compliant with respect to optimizations. The /Gis compiler switch implicitly declares all variables and values as precise.
+
+Precise behavior is represented in LLVM instructions: fadd, fsub, fmul, fdiv, frem, fcmp by not having 'fast' math flags set. Each relevant call instruction that contributes to computation of a precise value is annotated with dx.precise metadata that indicates that it is illegal for the driver compiler to perform IEEE-unsafe optimizations.
+
+Type annotations
+----------------
+
+User-defined types are annotated in DXIL to 'attach' additional properties to structure fields. For example, DXIL may contain type annotations for reflection purposes::
+
+ ; namespace MyNamespace1
+ ; {
+ ;   struct MyType1
+ ;   {
+ ;     float field1;
+ ;     int2 field2;
+ ;   };
+ ; }
+
+ %struct.MyNamespace1.MyType1 = type { float, <2 x i32> }
+ !struct.MyNamespace1.MyType1 = !{ !1, !2 }
+ !1 = !{ !"field1", null }
+ !2 = !{ !"field2", null }
+
+ ; struct MyType2
+ ; {
+ ;    MyType1 array_field[2];
+ ;    float4 float4_field;
+ ; };
+
+ %struct.MyType2 = type { [2 x %struct.MyType1], <4 x float> }
+ !struct.MyType2 = !{ !3, !4 }
+ !3 = !{ !"array_field", null }
+ !4 = !{ !"float4_field", null }
+
+The type/field annotation metadata hierarchy recursively mimics LLVM type hierarchy.
+
+Each field-annotation record has an optional named-value pair list for infrequent annotations and for future extensions. The lists are null in the example above.
+
+Note that Clang emits '::' to separate namespaces, if any, in type names. We modify Clang to use '.' instead, because it is illegal to use ':' in metadata names.
+
+Shader Properties and Capabilities
+==================================
+
+Additional shader properties are specified via tag-value pair list, which is the last element in the entry function description record.
+
+Shader Flags
+------------
+
+Shaders have additional flags that convey their capabilities via tag-value pair with tag kDxilShaderFlagsTag (0), followed by an i64 bitmask integer. The bits have the following meaning:
+
+=== =====================================================================
+Bit Description
+=== =====================================================================
+0   Disable shader optimizations
+1   Disable math refactoring
+2   Shader uses doubles
+3   Force early depth stencil
+4   Enable raw and structured buffers
+5   Shader uses min-precision, expressed as half and i16
+6   Shader uses double extension intrinsics
+7   Shader uses MSAD
+8   All resources must be bound for the duration of shader execution
+9   Enable view port and RT array index from any stage feeding rasterizer
+10  Shader uses inner coverage
+11  Shader uses stencil
+12  Shader uses intrinsics that access tiled resources
+13  Shader uses relaxed typed UAV load formats
+14  Shader uses Level9 comparison filtering
+15  Shader uses up to 64 UAVs
+16  Shader uses UAVs
+17  Shader uses CS4 raw and structured buffers
+18  Shader uses Rasterizer Ordered Views
+19  Shader uses wave intrinsics
+20  Shader uses int64 instructions
+=== =====================================================================
+
+Geometry Shader
+---------------
+
+Geometry shader properties are specified via tag-value pair with tag kDxilGSStateTag (1), followed by a list of GS properties. The format of this list is the following.
+
+=== ==== ===============================================================
+Idx Type Description
+=== ==== ===============================================================
+0   i32  Input primitive (InputPrimitive enum value).
+1   i32  Max vertex count.
+2   i32  Primitive topology for stream 0 (PrimitiveTopology enum value).
+3   i32  Primitive topology for stream 1 (PrimitiveTopology enum value).
+4   i32  Primitive topology for stream 2 (PrimitiveTopology enum value).
+5   i32  Primitive topology for stream 3 (PrimitiveTopology enum value).
+=== ==== ===============================================================
+
+Domain Shader
+-------------
+
+Domain shader properties are specified via tag-value pair with tag kDxilDSStateTag (2), followed by a list of DS properties. The format of this list is the following.
+
+=== ==== ===============================================================
+Idx Type Description
+=== ==== ===============================================================
+0   i32  Tessellator domain (TessellatorDomain enum value).
+1   i32  Input control point count.
+=== ==== ===============================================================
+
+Hull Shader
+-----------
+
+Hull shader properties are specified via tag-value pair with tag kDxilHSStateTag (3), followed by a list of HS properties. The format of this list is the following.
+
+=== ======= =====================================================================
+Idx Type    Description
+=== ======= =====================================================================
+0   MDValue Patch constant function (global symbol).
+1   i32     Input control point count.
+2   i32     Output control point count.
+3   i32     Tessellator domain (TessellatorDomain enum value).
+4   i32     Tessellator partitioning (TessellatorPartitioning enum value).
+5   i32     Tessellator output primitive (TessellatorOutputPrimitive enum value).
+6   float   Max tessellation factor.
+=== ======= =====================================================================
+
+Compute Shader
+--------------
+
+Compute shader has the following tag-value properties.
+
+===================== ======================== =============================================
+Tag                   Value                    Description
+===================== ======================== =============================================
+kDxilNumThreadsTag(4) MD list: (i32, i32, i32) Number of threads (X,Y,Z) for compute shader.
+===================== ======================== =============================================
+
+Shader Parameters and Signatures
+================================
+
+This section formalizes how HLSL shader input and output parameters are expressed in DXIL.
+
+HLSL signatures and semantics
+-----------------------------
+
+Formal parameters of a shader entry function in HLSL specify how the shader interacts with the graphics pipeline. Input parameters, referred to as an input signature, specify values received by the shader. Output parameters, referred to as an output signature, specify values produced by the shader. The shader compiler maps HLSL input and output signatures into DXIL specifications that conform to hardware constraints outlined in the Direct3D Functional Specification. DXIL specifications are also called signatures.
+
+Signature mapping is a complex process, as there are many constraints. All signature parameters must fit into a finite space of N 4x32-bit registers. For efficiency reasons, parameters are packed together in a way that does not violate specification constraints. The process is called signature packing. Most signatures are tightly packed; however, the VS input signature is not packed, as the values are coming from the Input Assembler (IA) stage rather than the graphics pipeline. Alternately, the PS output signature is allocated to align the SV_Target semantic index with the output register index.
+
+Each HLSL signature parameter is defined via C-like type, interpolation mode, and semantic name and index. The type defines parameter shape, which may be quite complex. Interpolation mode adds to the packing constraints, namely that parameters packed together must have compatible interpolation modes. Semantics are extra names associated with parameters for the following purposes: (1) to specify whether a parameter is a special System Value (SV) or not, (2) to link parameters to IA or StreamOut API streams, and (3) to aid debugging. Semantic index is used to disambiguate parameters that use the same semantic name, or span multiple rows of the register space.
+
+SV semantics add specific meanings and constraints to associated parameters. A parameter may be supplied by the hardware, and is then known as a System Generated Value (SGV). Alternatively, a parameter may be interpreted by the hardware and is then known as System Interpreted Value (SIV).  SGVs and SIVs are pipeline-stage dependent; moreover, some participate in signature packing and some do not. Non-SV semantics always participate in signature packing.
+
+Most System Generated Values (SGV) are loaded using special DXIL intrinsic functions, rather than loading the input from a signature.  These usually will not be present in the signature at all.  Their presence may be detected by the declaration and use of the special intrinsic function itself.  The exceptions to this are notable.  In one case they are present and loaded from the signature instead of a special intrinsic because they must be part of the packed signature potentially passed from the prior stage, allowing the prior stage to override these values, such as for SV_PrimitiveID and SV_IsFrontFace that may be written in the Geometry Shader.  In another case, they identify signature elements that still contribute to the DXBC signature for informational purposes, but will only use the special intrinsic function to read the value, such as for SV_PrimitiveID for GS input and SampleIndex for PS input.
+
+The classification of behavior for various system values in various signature locations is described in a table organized by SemanticKind and SigPointKind.  The SigPointKind is a new classification that uniquely identifies each set of parameters that may be input or output for each entry point.  For each combination of SemanticKind and SigPointKind, there is a SemanticInterpretationKind that defines the class of treatment for that location.
+
+Each SigPointKind also has a corresponding element allocation (or packing) behavior called PackingKind.  Some SigPointKinds do not result in a signature at all, which corresponds to the packing kind of PackingKind::None.
+
+Signature Points are enumerated as follows in the SigPointKind
+
+.. <py>import hctdb_instrhelp</py>
+.. <py::lines('SIGPOINT-RST')>hctdb_instrhelp.get_sigpoint_rst()</py>
+.. SIGPOINT-RST:BEGIN
+
+== ======== ======= ========== ============== ============= ============================================================================
+ID SigPoint Related ShaderKind PackingKind    SignatureKind Description
+== ======== ======= ========== ============== ============= ============================================================================
+0  VSIn     Invalid Vertex     InputAssembler Input         Ordinary Vertex Shader input from Input Assembler
+1  VSOut    Invalid Vertex     Vertex         Output        Ordinary Vertex Shader output that may feed Rasterizer
+2  PCIn     HSCPIn  Hull       None           Invalid       Patch Constant function non-patch inputs
+3  HSIn     HSCPIn  Hull       None           Invalid       Hull Shader function non-patch inputs
+4  HSCPIn   Invalid Hull       Vertex         Input         Hull Shader patch inputs - Control Points
+5  HSCPOut  Invalid Hull       Vertex         Output        Hull Shader function output - Control Point
+6  PCOut    Invalid Hull       PatchConstant  PatchConstant Patch Constant function output - Patch Constant data passed to Domain Shader
+7  DSIn     Invalid Domain     PatchConstant  PatchConstant Domain Shader regular input - Patch Constant data plus system values
+8  DSCPIn   Invalid Domain     Vertex         Input         Domain Shader patch input - Control Points
+9  DSOut    Invalid Domain     Vertex         Output        Domain Shader output - vertex data that may feed Rasterizer
+10 GSVIn    Invalid Geometry   Vertex         Input         Geometry Shader vertex input - qualified with primitive type
+11 GSIn     GSVIn   Geometry   None           Invalid       Geometry Shader non-vertex inputs (system values)
+12 GSOut    Invalid Geometry   Vertex         Output        Geometry Shader output - vertex data that may feed Rasterizer
+13 PSIn     Invalid Pixel      Vertex         Input         Pixel Shader input
+14 PSOut    Invalid Pixel      Target         Output        Pixel Shader output
+15 CSIn     Invalid Compute    None           Invalid       Compute Shader input
+== ======== ======= ========== ============== ============= ============================================================================
+
+.. SIGPOINT-RST:END
+
+Semantic Interpretations are as follows (SemanticInterpretationKind)
+
+
+.. <py>import hctdb_instrhelp</py>
+.. <py::lines('SEMINT-RST')>hctdb_instrhelp.get_sem_interpretation_enum_rst()</py>
+.. SEMINT-RST:BEGIN
+
+== ========== =============================================================
+ID Name       Description
+== ========== =============================================================
+0  NA         Not Available
+1  SV         Normal System Value
+2  SGV        System Generated Value (sorted last)
+3  Arb        Treated as Arbitrary
+4  NotInSig   Not included in signature (intrinsic access)
+5  NotPacked  Included in signature, but does not contribute to packing
+6  Target     Special handling for SV_Target
+7  TessFactor Special handling for tessellation factors
+8  Shadow     Shadow element must be added to a signature for compatibility
+== ========== =============================================================
+
+.. SEMINT-RST:END
+
+Semantic Interpretations for each SemanticKind at each SigPointKind are as follows
+
+
+.. <py>import hctdb_instrhelp</py>
+.. <py::lines('SEMINT-TABLE-RST')>hctdb_instrhelp.get_sem_interpretation_table_rst()</py>
+.. SEMINT-TABLE-RST:BEGIN
+
+====================== ============ ===== ============ ============ ====== ======= ========== ============ ====== ===== ===== ============ ===== ============= ============= ========
+Semantic               VSIn         VSOut PCIn         HSIn         HSCPIn HSCPOut PCOut      DSIn         DSCPIn DSOut GSVIn GSIn         GSOut PSIn          PSOut         CSIn
+====================== ============ ===== ============ ============ ====== ======= ========== ============ ====== ===== ===== ============ ===== ============= ============= ========
+Arbitrary              Arb          Arb   NA           NA           Arb    Arb     Arb        Arb          Arb    Arb   Arb   NA           Arb   Arb           NA            NA
+VertexID               SV           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NA
+InstanceID             SV           Arb   NA           NA           Arb    Arb     NA         NA           Arb    Arb   Arb   NA           Arb   Arb           NA            NA
+Position               Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
+RenderTargetArrayIndex Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
+ViewPortArrayIndex     Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
+ClipDistance           Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
+CullDistance           Arb          SV    NA           NA           SV     SV      Arb        Arb          SV     SV    SV    NA           SV    SV            NA            NA
+OutputControlPointID   NA           NA    NA           NotInSig     NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NA
+DomainLocation         NA           NA    NA           NA           NA     NA      NA         NotInSig     NA     NA    NA    NA           NA    NA            NA            NA
+PrimitiveID            NA           NA    NotInSig     NotInSig     NA     NA      NA         NotInSig     NA     NA    NA    Shadow       SGV   SGV           NA            NA
+GSInstanceID           NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NotInSig     NA    NA            NA            NA
+SampleIndex            NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    Shadow _41    NA            NA
+IsFrontFace            NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           SGV   SGV           NA            NA
+Coverage               NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NotInSig _50  NotPacked _41 NA
+InnerCoverage          NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NotInSig _50  NA            NA
+Target                 NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            Target        NA
+Depth                  NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked     NA
+DepthLessEqual         NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked _50 NA
+DepthGreaterEqual      NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked _50 NA
+StencilRef             NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NotPacked _50 NA
+DispatchThreadID       NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
+GroupID                NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
+GroupIndex             NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
+GroupThreadID          NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NA            NA            NotInSig
+TessFactor             NA           NA    NA           NA           NA     NA      TessFactor TessFactor   NA     NA    NA    NA           NA    NA            NA            NA
+InsideTessFactor       NA           NA    NA           NA           NA     NA      TessFactor TessFactor   NA     NA    NA    NA           NA    NA            NA            NA
+ViewID                 NotInSig _61 NA    NotInSig _61 NotInSig _61 NA     NA      NA         NotInSig _61 NA     NA    NA    NotInSig _61 NA    NotInSig _61  NA            NA
+Barycentrics           NA           NA    NA           NA           NA     NA      NA         NA           NA     NA    NA    NA           NA    NotPacked _61 NA            NA
+====================== ============ ===== ============ ============ ====== ======= ========== ============ ====== ===== ===== ============ ===== ============= ============= ========
+
+.. SEMINT-TABLE-RST:END
+
+Below is a vertex shader example that is used for illustration throughout this section::
+
+ struct Foo {
+   float a;
+   float b[2];
+ };
+
+ struct VSIn {
+   uint    vid     : SV_VertexID;
+   float3  pos     : Position;
+   Foo     foo[3]  : SemIn1;
+   float   f       : SemIn10;
+ };
+
+ struct VSOut
+ {
+   float   f       : SemOut1;
+   Foo     foo[3]  : SemOut2;
+   float4  pos     : SV_Position;
+ };
+
+ void main(in  VSIn  In, 	// input  signature
+           out VSOut Out)	// output signature
+ {
+   ...
+ }
+
+Signature packing must be efficient. It should use as few registers as possible, and the packing algorithm should run in reasonable time. The complication is that the problem is NP-complete, and the algorithm needs to resort to using a heuristic.
+
+While the details of the packing algorithm are not important at the moment, it is important to outline some concepts related to how a packed signature is represented in DXIL. Packing is further complicated by the complexity of parameter shapes induced by the C/C++ type system. In the example above, the fields of the Out.foo array are actually arrays themselves, strided in memory. Allocating such strided shapes efficiently is hard. To simplify packing, the first step is to break user-defined (struct) parameters into constituent components and to make strided arrays contiguous. This preparation step enables the algorithm to operate on dense rectangular shapes, which we call signature elements. The output signature in the example above has the following elements: float Out_f, float Out_foo_a[3], float Out_foo_b[2][3], and float4 pos. Each element is characterized by the number of rows and columns. These are 1x1, 3x1, 6x1, and 1x4, respectively. The packing algorithm reduces to fitting these elements into Nx4 register space, satisfying all packing-compatibility constraints.
+
+Signature element record
+------------------------
+Each signature element is represented in DXIL as a metadata record.
+
+For the output signature of the example above, the element records are as follows::
+
+ ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
+ !20 = !{i32 6, !"SemOut",      i8 0, i8 0, !40,   i8 2, i32 1, i8 1, i32 1,    i8 2, null}
+ !21 = !{i32 7, !"SemOut",      i8 0, i8 0, !41,   i8 2, i32 3, i8 1, i32 1,    i8 1, null}
+ !22 = !{i32 8, !"SemOut",      i8 0, i8 0, !42,   i8 2, i32 6, i8 1, i32 1,    i8 0, null}
+ !23 = !{i32 9, !"SV_Position", i8 0, i8 3, !43,   i8 2, i32 1, i8 4, i32 0,    i8 0, null}
+
+A record contains the following fields.
+
+=== =============== ===============================================================================
+Idx Type            Description
+=== =============== ===============================================================================
+0   i32             Unique signature element record ID, used to identify the element in operations.
+1   String metadata Semantic name.
+2   i8              ComponentType (enum value).
+3   i8              SemanticKind (enum value).
+4   Metadata        Metadata list that enumerates all semantic indexes of the flattened parameter.
+5   i8              InterpolationMode (enum value).
+6   i32             Number of element rows.
+7   i8              Number of element columns.
+8   i32             Starting row of element packing location.
+9   i8              Starting column of element packing location.
+10  Metadata        Metadata list of additional tag-value pairs; can be 'null' or empty.
+=== =============== ===============================================================================
+
+System value semantic names always start with the prefix 'SV_', and it is illegal to start a user semantic with this prefix. Non-SVs can be ignored by drivers. Debug layers may use these to help validate signature compatibility between stages.
+
+The last metadata list is used to specify additional properties and future extensions.
+
+Signature record metadata
+-------------------------
+
+A shader typically has two signatures: input and output, while domain shader has an additional patch constant signature. The signatures are composed of signature element records and are attached to the shader entry metadata. The examples below clarify metadata details.
+
+Vertex shader HLSL
+~~~~~~~~~~~~~~~~~~
+
+Here is the HLSL of the above vertex shader. The semantic index assignment is explained in the section below::
+
+ struct Foo
+ {
+   float a;
+   float b[2];
+ };
+
+ struct VSIn
+ {
+   uint    vid     : SV_VertexID;
+   float3  pos     : Position;
+   Foo     foo[3]  : SemIn1;
+     // semantic index assignment:
+     // foo[0].a     : SemIn1
+     // foo[0].b[0]  : SemIn2
+     // foo[0].b[1]  : SemIn3
+     // foo[1].a     : SemIn4
+     // foo[1].b[0]  : SemIn5
+     // foo[1].b[1]  : SemIn6
+     // foo[2].a     : SemIn7
+     // foo[2].b[0]  : SemIn8
+     // foo[2].b[1]  : SemIn9
+   float   f       : SemIn10;
+ };
+
+ struct VSOut
+ {
+   float   f       : SemOut1;
+   Foo     foo[3]  : SemOut2;
+     // semantic index assignment:
+     // foo[0].a     : SemOut2
+     // foo[0].b[0]  : SemOut3
+     // foo[0].b[1]  : SemOut4
+     // foo[1].a     : SemOut5
+     // foo[1].b[0]  : SemOut6
+     // foo[1].b[1]  : SemOut7
+     // foo[2].a     : SemOut8
+     // foo[2].b[0]  : SemOut9
+     // foo[2].b[1]  : SemOut10
+   float4  pos     : SV_Position;
+ };
+
+ void main(in  VSIn  In, 	// input  signature
+           out VSOut Out)	// output signature
+ {
+   ...
+ }
+
+The input signature is packed to be compatible with the IA stage. A packing algorithm must assign the following starting positions to the input signature elements:
+
+=================== ==== ======= ========= ===========
+Input element       Rows Columns Start row Start column
+=================== ==== ======= ========= ===========
+uint VSIn.vid       1    1       0         0
+float3 VSIn.pos     1    3       1         0
+float VSIn.foo.a[3] 3    1       2         0
+float VSIn.foo.b[6] 6    1       5         0
+float VSIn.f        1    1       11        0
+=================== ==== ======= ========= ===========
+
+A reasonable packing algorithm would assign the following starting positions to the output signature elements:
+
+==================== ==== ======= ========= ===========
+Output element       Rows Columns Start row Start column
+==================== ==== ======= ========= ===========
+float VSOut.f        1    1       1         2
+float VSOut.foo.a[3] 3    1       1         1
+float VSOut.foo.b[6] 6    1       1         0
+float4 VSOut.pos     1    4       0         0
+==================== ==== ======= ========= ===========
+
+Semantic index assignment
+~~~~~~~~~~~~~~~~~~~~~~~~~
+Semantic index assignment in DXIL is exactly the same as for DXBC. Semantic index assignment, abbreviated s.idx above, is a consecutive enumeration of all fields under the same semantic name as if the signature were packed for the IA stage. That is, given a complex signature element, e.g., VSOut's foo[3] with semantic name SemOut and starting index 2, the element is flattened into individual fields: foo[0].a, foo[0].b[0], ..., foo[2].b[1], and the fields receive consecutive semantic indexes 2, 3, ..., 10, respectively. Semantic-index pairs are used to set up the IA stage and to capture values of individual signature registers via the StreamOut API.
+
+DXIL for VS signatures
+~~~~~~~~~~~~~~~~~~~~~~
+
+The corresponding DXIL metadata is presented below::
+
+ !dx.entryPoints = !{ !1 }
+ !1 = !{ void @main(), !"main", !2, null, null }
+ ; Signatures: In,   Out,  Patch Constant (optional)
+ !2 = !{       !3,   !4,   null }
+
+ ; Input signature (packed according to IA rules)
+ !3 = !{ !10, !11, !12, !13, !14 }
+ ; element idx, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
+ !10 = !{i32 1, !"SV_VertexID", i8 0, i8 1, !30,  i32 0, i32 1, i8 1, i32 0,    i8 0, null}
+ !11 = !{i32 2, !"Position",    i8 0, i8 0, !30,  i32 0, i32 1, i8 3, i32 1,    i8 0, null}
+ !12 = !{i32 3, !"SemIn",       i8 0, i8 0, !32,  i32 0, i32 3, i8 1, i32 2,    i8 0, null}
+ !13 = !{i32 4, !"SemIn",       i8 0, i8 0, !33,  i32 0, i32 6, i8 1, i32 5,    i8 0, null}
+ !14 = !{i32 5, !"SemIn",       i8 0, i8 0, !34,  i32 0, i32 1, i8 1, i32 11,   i8 0, null}
+ ; semantic index assignment:
+ !30 = !{ i32 0 }
+ !32 = !{ i32 1, i32 4, i32 7 }
+ !33 = !{ i32 2, i32 3, i32 5, i32 6, i32 8, i32 9 }
+ !34 = !{ i32 10 }
+
+ ; Output signature (tightly packed according to pipeline stage packing rules)
+ !4 = !{ !20, !21, !22, !23 }
+ ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
+ !20 = !{i32 6, !"SemOut",      i8 0, i8 0, !40,  i32 2, i32 1, i8 1, i32 1,    i8 2, null}
+ !21 = !{i32 7, !"SemOut",      i8 0, i8 0, !41,  i32 2, i32 3, i8 1, i32 1,    i8 1, null}
+ !22 = !{i32 8, !"SemOut",      i8 0, i8 0, !42,  i32 2, i32 6, i8 1, i32 1,    i8 0, null}
+ !23 = !{i32 9, !"SV_Position", i8 0, i8 3, !43,  i32 2, i32 1, i8 4, i32 0,    i8 0, null}
+ ; semantic index assignment:
+ !40 = !{ i32 1 }
+ !41 = !{ i32 2, i32 5, i32 8 }
+ !42 = !{ i32 3, i32 4, i32 6, i32 7, i32 9, i32 10 }
+ !43 = !{ i32 0 }
+
+Hull shader example
+~~~~~~~~~~~~~~~~~~~
+A hull shader (HS) is defined by two entry point functions: control point (CP) function to compute control points, and patch constant (PC) function to compute patch constant data, including the tessellation factors. The inputs to both functions are the input control points for an entire patch, and therefore each element may be indexed by row and, in addition, is indexed by vertex.
+
+Here is an example of HS entry point metadata and its signature list::
+
+ ; !105 is extended parameter list containing reference to HS State:
+ !101 = !{ void @HSMain(), !"HSMain", !102, null, !105 }
+ ; Signatures: In,   Out,  Patch Constant
+ !102 = !{     !103, !104, !204 }
+
+The entry point record specifies: (1) CP function HSMain as the main symbol, and (2) PC function via optional metadata node !105.
+
+CP-input signature describing one input control point::
+
+ !103 = !{ !110, !111 }
+ ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
+ !110= !{i32 1, !"SV_Position", i8 0, i8 3, !130, i32 0, i32 1, i8 4, i32 0,    i8 0, null}
+ !111= !{i32 2, !"array",       i8 0, i8 0, !131, i32 0, i32 4, i8 3, i32 1,    i8 0, null}
+ ; semantic indexing for flattened elements:
+ !130 = !{ i32 0 }
+ !131 = !{ i32 0, i32 1, i32 2, i32 3 }
+
+Note that SV_OutputControlPointID and SV_PrimitiveID input elements are SGVs loaded through special DXIL intrinsics, and are not present in the signature at all. These have a semantic interpretation of SemanticInterpretationKind::NotInSig.
+
+CP-output signature describing one output control point::
+
+ !104 = !{ !120, !121 }
+ ;  element ID, semantic name, etype, sv, s.idx, interp,  rows, cols, start row, col, ext. list
+ !120= !{i32 3, !"SV_Position", i8 0, i8 3, !130, i32 0, i32 1, i8 4, i32 0,    i8 0, null}
+ !121= !{i32 4, !"array",       i8 0, i8 0, !131, i32 0, i32 4, i8 3, i32 1,    i8 0, null}
+
+Hull shaders require an extended parameter that defines extra state::
+
+ ; extended parameter HS State
+ !105 = !{ i32 3, !201 }
+
+ ; HS State record defines patch constant function and other properties
+ ; Patch Constant Function, in CP count, out CP count, tess domain, tess part, out prim, max tess factor
+ !201 = !{  void @PCMain(), 4,           4,            3,           1,         3,        16.0 }
+
+PC-output signature::
+
+ !204 = !{ !220, !221, !222 }
+ ;  element ID, semantic name,         etype,   sv, s.idx,  interp, rows, cols, start row, col, ext. list
+ !220= !{i32 3, !"SV_TessFactor",       i8 0, i8 25, !130,  i32 0, i32 4, i8 1, i32 0, i8 3, null}
+ !221= !{i32 4, !"SV_InsideTessFactor", i8 0, i8 26, !231,  i32 0, i32 2, i8 1, i32 4, i8 3, null}
+ !222= !{i32 5, !"array",               i8 0, i8 0,  !131,  i32 0, i32 4, i8 3, i32 0, i8 0, null}
+ ; semantic indexing for flattened elements:
+ !231 = !{ i32 0, i32 1 }
+
+Accessing signature value in operations
+---------------------------------------
+
+There are no function parameters or variables that correspond to signature elements. Instead loadInput and storeOutput functions are used to access signature element values in operations. The accesses are scalar.
+
+These are the operation signatures::
+
+ ; overloads: SM5.1: f16|f32|i16|i32,  SM6.0: f16|f32|f64|i8|i16|i32|i64
+ declare float @dx.op.loadInput.f32(
+     i32,                            ; opcode
+     i32,                            ; input ID
+     i32,                            ; row (relative to start row of input ID)
+     i8,                             ; column (relative to start column of input ID), constant in [0,3]
+     i32)                            ; vertex index
+
+ ; overloads: SM5.1: f16|f32|i16|i32,  SM6.0: f16|f32|f64|i8|i16|i32|i64
+ declare void @dx.op.storeOutput.f32(
+     i32,                            ; opcode
+     i32,                            ; output ID
+     i32,                            ; row (relative to start row of output ID)
+     i8,                             ; column (relative to start column of output ID), constant in [0,3]
+     float)                          ; value to store
+
+LoadInput/storeOutput takes input/output element ID, which is the unique ID of a signature element metadata record. The row parameter is the array element row index from the start of the element; the register index is obtained by adding the start row of the element and the row parameter value. Similarly, the column parameter is relative column index; the packed register component is obtained by adding the start component of the element (packed col) and the column value. Several overloads exist to access elements of different primitive types. LoadInput takes an additional vertex index parameter that represents vertex index for DS CP-inputs and GS inputs; vertex index must be undef in other cases.
+
+Signature packing
+-----------------
+
+Signature elements must be packed into a space of N registers, each consisting of four 32-bit components, according to runtime constraints. DXIL contains packed signatures. The packing algorithm is more aggressive than that for DX11. However, DXIL packing is only a suggestion to the driver implementation. Driver compilers can rearrange signature elements as they see fit, while preserving compatibility of connected pipeline stages. DXIL is designed in such a way that it is easy to 'relocate' signature elements - loadInput/storeOutput row and column indices do not need to change since they are relative to the start row/column for each element.
+
+Signature packing types
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Two pipeline stages can connect in four different ways, resulting in four packing types.
+
+1. Input Assembly: VS input only
+   * Elements all map to unique registers, they may not be packed together.
+   * Interpolation mode is not used.
+2. Connects to Rasterizer: VS output, HS CP-input/output and PC-input, DS CP-input/output, GS input/output, PS input
+   * Elements can be packed according to constraints.
+   * Interpolation mode is used and must be consistent between connecting signatures.
+   * While HS CP-output and DS CP-input signatures do not go through the rasterizer, they are still treated as such. The reason is the pass-through HS case, in which HS CP-input and HS CP-output must have identical packing for efficiency.
+3. Patch Constant: HS PC-output, DS PC-input
+   * SV_TessFactor and SV_InsideTessFactor are the only SVs relevant here, and this is the only location where they are legal. These have special packing considerations.
+   * Interpolation mode is not used.
+4. Pixel Shader Output: PS output only
+   * Only SV_Target maps to output register space.
+   * No packing is performed, semantic index corresponds to render target index.
+
+Packing constraints
+~~~~~~~~~~~~~~~~~~~
+
+The packing algorithm is stricter and more aggressive in DXIL than in DXBC, although still compatible. In particular, array signature elements are not broken up into scalars, even if each array access can be disambiguated to a literal index. DXIL and DXBC signature packing are not identical, so linking them together into a single pipeline is not supported across compiler generations.
+
+The row dimension of a signature element represents an index range. If constraints permit, two adjacent or overlapping index ranges are coalesced into a single index range.
+
+Packing constraints are as follows:
+
+1. A register must have only one interpolation mode for all 4 components.
+2. Register components containing SVs must be to the right of components containing non-SVs.
+3. SV_ClipDistance and SV_CullDistance have additional constraints:
+   a. May be packed together
+   b. Must occupy a maximum of 2 registers (8-components)
+   c. SV_ClipDistance must have linear interpolation mode
+4. Registers containing SVs may not be within an index range, with the exception of Tessellation Factors (TessFactors).
+5. If an index range R1 overlaps with a TessFactor index range R2, R1 must be contained within R2. As a consequence, outside and inside TessFactors occupy disjoint index ranges when packed.
+6. Non-TessFactor index ranges are combined into a larger range, if they overlap.
+7. SGVs must be packed after all non-SGVs have been packed. If there are several SGVs, they are packed in the order of HLSL declaration.
+
+Packing for SGVs
+~~~~~~~~~~~~~~~~
+
+Non-SGV portions of two connecting signatures must match; however, SGV portions don't have to. An example would be a PS declaring SV_PrimitiveID as an input. If VS connects to PS, PS's SV_PrimitiveID value is synthesized by hardware; moreover, it is illegal to output SV_PrimitiveID from a VS. If GS connects to PS, GS may declare SV_PrimitiveID as its output.
+
+Unfortunately, SGV specification creates a complication for separate compilation of connecting shaders. For example, GS outputs SV_PrimitiveID, and PS inputs SV_IsFrontFace and SV_PrimitiveID in this order. The positions of SV_PrimitiveID are incompatible in GS and PS signatures. Not much can be done about this ambiguity in SM5.0 and earlier; programmers will have to rely on SDKLayers to catch a potential mismatch.
+
+SM5.1 and later shaders work on D3D12+ runtime that uses PSO objects to describe pipeline state. Therefore, a driver compiler has access to both connecting shaders during compilation, even though the HLSL compiler does not. The driver compiler can resolve SGV ambiguity in signatures easily. For SM5.1 and later, the HLSL compiler will ensure that declared SGVs fit into packed signature; however, it will set SGV's start row-column location to (-1, 0) such that the driver compiler must resolve SGV placement during PSO compilation.
+
+Shader Resources
+================
+
+All global resources referenced by entry points of an LLVM module are described via named metadata dx.resources, which consists of four metadata lists of resource records::
+
+  !dx.resources = !{ !1, !2, !3, !4 }
+
+Resource lists are as follows.
+
+=== ======== ==============================
+Idx Type     Description
+=== ======== ==============================
+0   Metadata SRVs - shader resource views.
+1   Metadata UAVs - unordered access views.
+2   Metadata CBVs - constant buffer views.
+3   Metadata Samplers.
+=== ======== ==============================
+
+Metadata resource records
+-------------------------
+
+Each resource list contains resource records. Each resource record contains fields that are common for each resource type, followed by fields specific to each resource type, followed by a metadata list of tag/value pairs, which can be used to specify additional properties or future extensions and may be null or empty.
+
+Common fields:
+
+=== =============== ==========================================================================================
+Idx Type            Description
+=== =============== ==========================================================================================
+0   i32             Unique resource record ID, used to identify the resource record in createHandle operation.
+1   Pointer         Pointer to a global constant symbol with the original shape of resource and element type.
+2   Metadata string Name of resource variable.
+3   i32             Bind space ID of the root signature range that corresponds to this resource.
+4   i32             Bind lower bound of the root signature range that corresponds to this resource.
+5   i32             Range size of the root signature range that corresponds to this resource.
+=== =============== ==========================================================================================
+
+When the shader has reflection information, the name is the original, unmangled HLSL name. If reflection is stripped, the name is an empty string.
+
+SRV-specific fields:
+
+=== =============== ==========================================================================================
+Idx Type            Description
+=== =============== ==========================================================================================
+6   i32             SRV resource shape (enum value).
+7   i32             SRV sample count.
+8   Metadata        Metadata list of additional tag-value pairs.
+=== =============== ==========================================================================================
+
+SRV-specific tag/value pairs:
+
+=== === ==== =================================================== ============================================
+Idx Tag Type Resource Type                                       Description
+=== === ==== =================================================== ============================================
+0   0   i32  Any resource, except RawBuffer and StructuredBuffer Element type.
+1   1   i32  StructuredBuffer                                    Element stride of StructuredBuffer in bytes.
+=== === ==== =================================================== ============================================
+
+The symbol names for the tags are kDxilTypedBufferElementTypeTag (0) and kDxilStructuredBufferElementStrideTag (1).
+
+UAV-specific fields:
+
+=== =============== ==========================================================================================
+Idx Type            Description
+=== =============== ==========================================================================================
+6   i32             UAV resource shape (enum value).
+7   i1              1 - globally-coherent UAV; 0 - otherwise.
+8   i1              1 - UAV has counter; 0 - otherwise.
+9   i1              1 - UAV is ROV (rasterizer ordered view); 0 - otherwise.
+10  Metadata        Metadata list of additional tag-value pairs.
+=== =============== ==========================================================================================
+
+UAV-specific tag/value pairs:
+
+=== === ==== ====================================================== ============================================
+Idx Tag Type Resource Type                                          Description
+=== === ==== ====================================================== ============================================
+0   0   i32  RW resource, except RWRawBuffer and RWStructuredBuffer Element type.
+1   1   i32  RWStructuredBuffer                                     Element stride of StructuredBuffer in bytes.
+=== === ==== ====================================================== ============================================
+
+The symbol names for the tags are kDxilTypedBufferElementTypeTag (0) and kDxilStructuredBufferElementStrideTag (1).
+
+CBV-specific fields:
+
+=== =============== ==========================================================================================
+Idx Type            Description
+=== =============== ==========================================================================================
+6   i32             Constant buffer size in bytes.
+7   Metadata        Metadata list of additional tag-value pairs.
+=== =============== ==========================================================================================
+
+Sampler-specific fields:
+
+=== =============== ==========================================================================================
+Idx Type            Description
+=== =============== ==========================================================================================
+6   i32             Sampler type (enum value).
+7   Metadata        Metadata list of additional tag-value pairs.
+=== =============== ==========================================================================================
+
+The following example demonstrates SRV metadata::
+
+ ; Original HLSL
+ ; Texture2D<float4> MyTexture2D : register(t0, space0);
+ ; StructuredBuffer<NS1::MyType1> MyBuffer[2][3] : register(t1, space0);
+
+ !1 = !{ !2, !3 }
+
+ ; Scalar resource: Texture2D<float4> MyTexture2D.
+ %dx.types.ResElem.v4f32 = type { <4 x float> }
+ @MyTexture2D = external addrspace(1) constant %dx.types.ResElem.v4f32, align 16
+ !2 = !{ i32 0, %dx.types.ResElem.v4f32 addrspace(1)* @MyTexture2D, !"MyTexture2D",
+         i32 0, i32 0, i32 1, i32 2, i32 0, null }
+
+ ; Array resource: StructuredBuffer<MyType1> MyBuffer[2][3].
+ %struct.NS1.MyType1 = type { float, <2 x i32> }
+ %dx.types.ResElem.NS1.MyType1 = type { %struct.NS1.MyType1 }
+ @MyBuffer = external addrspace(1) constant [2 x [3 x %dx.types.ResElem.NS1.MyType1]], align 16
+ !3 = !{ i32 1, [2 x [3 x %dx.types.ResElem.NS1.MyType1]] addrspace(1)* @MyBuffer, !"MyBuffer",
+         i32 0, i32 1, i32 6, i32 11, i32 0, null }
+
+The type name of the variable is constructed by appending the element name (primitive, vector or UDT name) to the dx.types.ResElem prefix. The type configuration of the resource range variable conveys (1) resource range shape and (2) resource element type.
+
+
+Reflection information
+----------------------
+
+Resource reflection data is conveyed via the resource's metadata record and global, external variable. The metadata record contains the original HLSL name, root signature range information, and the reference to the global resource variable declaration. The resource variable declaration conveys resource range shape, resource type and resource element type.
+
+The following disassembly provides an example::
+
+ ; Scalar resource: Texture2D<float4> MyTexture2D.
+ %dx.types.ResElem.v4f32 = type { <4 x float> }
+ @MyTexture2D = external addrspace(1) constant %dx.types.ResElem.v4f32, align 16
+ !0 = !{ i32 0, %dx.types.ResElem.v4f32 addrspace(1)* @MyTexture2D, !"MyTexture2D",
+         i32 0, i32 3, i32 1, i32 2, i32 0, null }
+
+ ; struct MyType2 { float4 field1; int2 field2; };
+ ; Constant buffer: ConstantBuffer<MyType2> MyCBuffer1[][3] : register(b5, space7)
+ %struct.MyType2 = type { <4 x float>, <2 x i32> }
+ ; Type reflection information (optional)
+ !struct.MyType2 = !{ !1, !2 }
+ !1 = !{ !"field1", null }
+ !2 = !{ !"field2", null }
+
+ %dx.types.ResElem.MyType2 = type { %struct.MyType2 }
+
+ @MyCBuffer1 = external addrspace(1) constant [0 x [3 x %dx.types.ResElem.MyType2]], align 16
+
+ !3 = !{ i32 0, [0 x [3 x %dx.types.ResElem.MyType2]] addrspace(1)* @MyCBuffer1, !"MyCBuffer1",
+         i32 7, i32 5, i32 -1, null }
+
+The reflection information can be removed from DXIL by obfuscating the resource HLSL name and resource variable name as well as removing reflection type annotations, if any.
+
+Structure of resource operation
+-------------------------------
+
+Operations involving shader resources and samplers are expressed via external function calls.
+
+Below is an example for the sample method::
+
+ %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
+
+ declare %dx.types.ResRet.f32 @dx.op.sample.f32(
+     i32,                      ; opcode
+     %dx.types.ResHandle,      ; texture handle
+     %dx.types.SamplerHandle,  ; sampler handle
+     float,                    ; coordinate c0
+     float,                    ; coordinate c1
+     float,                    ; coordinate c2
+     float,                    ; coordinate c3
+     i32,                      ; offset o0
+     i32,                      ; offset o1
+     i32,                      ; offset o2
+     float)                    ; clamp
+
+The method always returns five scalar values that are aggregated in the dx.types.ResRet.f32 type and extracted into scalars via LLVM's extractvalue right after the call. The first four elements are sample values and the last field is the status of operation for tiled resources. Some return values may be unused, which is easily determined from the SSA form. The driver compiler is free to specialize the sample instruction to the most efficient form depending on which return values are used in computation.
+
+If applicable, each intrinsic is overloaded on return type, e.g.::
+
+  %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
+  %dx.types.ResRet.f16 = type { half, half, half, half, i32 }
+
+  declare %dx.types.ResRet.f32 @dx.op.sample.f32(...)
+  declare %dx.types.ResRet.f16 @dx.op.sample.f16(...)
+
+Wherever applicable, the return type indicates the "precision" at which the operation is executed. For example, a sample intrinsic that returns half data is allowed to be executed at half precision, assuming the hardware supports this; however, if the return type is float, the sample operation must be executed in float precision. If lower precision is not supported by the hardware, it is allowed to execute a higher-precision variant of the operation.
+
+The opcode parameter uniquely identifies the sample operation. More details can be found in the Instructions section. The value of opcode is the same for all overloads of an operation.
+
+Some resource operations are "polymorphic" with respect to resource types, e.g., dx.op.sample.f32 operates on several resource types: Texture1D[Array], Texture2D[Array], Texture3D, TextureCUBE[Array].
+
+Each resource/sampler is represented by a pair of i32 values. The first value is a unique (virtual) resource range ID, which corresponds to HLSL declaration of a resource/sampler. Range ID must be a constant for SM5.1 and below. The second integer is a 0-based index within the range. The index must be constant for SM5.0 and below.
+
+Both indices can be dynamic for SM6 and later to provide flexibility in usage of resources/samplers in control flow, e.g.::
+
+  Texture2D<float4> a[8], b[8];
+  ...
+  Texture2D<float4> c;
+  if(cond)	// arbitrary expression
+    c = a[idx1];
+  else
+    c = b[idx2];
+  ... = c.Sample(...);
+
+Resources/samplers used in such a way must reside in descriptor tables (cannot be root descriptors); this will be validated during shader and root signature setup.
+
+The DXIL verifier will ensure that all leaf-ranges (a and b above) of such a resource/sampler live-range have the same resource/sampler type and element type. If applicable, this constraint may be relaxed in the future. In particular, it is logical from an HLSL programmer's point of view to issue loads on compatible resource types, e.g., Texture2D, RWTexture2D, ROVTexture2D::
+
+  Texture2D<float4> a[8];
+  RWTexture2D<float4> b[6];
+  ...
+  Texture2D<float4> c;
+  if(cond)	// arbitrary expression
+   c = a[idx1];
+  else
+   c = b[idx2];
+  ... = c.Load(...);
+
+LLVM's undef value is used for unused input parameters. For example, coordinates c2 and c3 in a dx.op.sample.f32 call for Texture2D are undef, as only two coordinates c0 and c1 are required.
+
+If the clamp parameter is unused, its default value is 0.0f.
+
+Resource operations are not overloaded on input parameter types. For example, dx.op.sample.f32 operation does not have an overload where coordinates have half, rather than float, data type. Instead, the precision of input arguments can be inferred from the IR via a straightforward lookup along an SSA edge, e.g.::
+
+  %c0 = fpext half %0 to float
+  %res = call %dx.types.ResRet.f32 @dx.op.sample.f32(..., %c0, ...)
+
+SSA form makes it easy to infer that value %0 of type half got promoted to float. The driver compiler can tailor the instruction to the most efficient form for the target hardware.
+
+Resource operations
+-------------------
+
+This section lists resource access operations. The specification is given for float return type, if applicable. The list of all overloads can be found in the appendix on intrinsic operations.
+
+Some general rules to interpret resource operations:
+
+* The number of active (meaningful) return components is determined by resource element type. Other return values must be unused; validator ensures this.
+* GPU instruction needs status only if the status return value is used in the program, which is determined through SSA.
+* Overload suffixes are specified for each resource operation.
+* Type of resource determines which inputs must be defined. Unused inputs are passed typed LLVM 'undef' values. This is checked by the DXIL validator.
+* Offset input parameters are i8 constants in [-8,+7] range; default offset is 0.
+
+Resource operation return types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Many resource operations return several scalar values as well as status for tiled resource access. The return values are grouped into a helper structure type, as this is LLVM's way to return several values from the operation. After an operation, helper types are immediately decomposed into scalars, which are used in further computation.
+
+The defined helper types are listed below::
+
+  %dx.types.ResRet.i8  = type { i8, i8, i8, i8, i32 }
+  %dx.types.ResRet.i16 = type { i16, i16, i16, i16, i32 }
+  %dx.types.ResRet.i32 = type { i32, i32, i32, i32, i32 }
+  %dx.types.ResRet.i64 = type { i64, i64, i64, i64, i32 }
+  %dx.types.ResRet.f16 = type { half, half, half, half, i32 }
+  %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
+  %dx.types.ResRet.f64 = type { double, double, double, double, i32 }
+
+  %dx.types.Dimensions = type { i32, i32, i32, i32 }
+  %dx.types.SamplePos  = type { float, float }
+
+Resource handles
+~~~~~~~~~~~~~~~~
+
+Resources are identified via handles passed to resource operations. Handles are represented via opaque type::
+
+  %dx.types.Handle     = type { i8 * }
+
+The handles are created out of resource range ID and index into the range::
+
+  declare %dx.types.Handle @dx.op.createHandle(
+      i32,                  ; opcode
+      i8,                   ; resource class: SRV=0, UAV=1, CBV=2, Sampler=3
+      i32,                  ; resource range ID (constant)
+      i32,                  ; index into the range
+      i1)                   ; non-uniform resource index: false or true
+
+Resource class is a constant that indicates which metadata list (SRV, UAV, CBV, Sampler) to use for property queries.
+
+Resource range ID is an i32 constant, which is the position of the metadata record in the corresponding metadata list. Range IDs start with 0 and are contiguous within each list.
+
+Index is an i32 value that may be a constant or a value computed by the shader.
+
+CBufferLoadLegacy
+~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+  
+  ; overloads: SM5.1: f32|i32|f64,  future SM: possibly deprecated
+  %dx.types.CBufRet.f32 = type { float, float, float, float }
+  declare %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; resource handle
+      i32)                  ; 0-based row index (row = 16-byte DXBC register)
+
+Valid resource types: ConstantBuffer. Valid shader model: SM5.1 and earlier.
+
+The operation loads four 32-bit values from a constant buffer, which has legacy, 16-byte layout. Values are extracted via "extractvalue" instruction; unused values may be optimized away by the driver compiler. The operation respects SM5.1 and earlier OOB behavior for cbuffers.
+
+CBufferLoad
+~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32|i32|f64,  SM6.0: f16|f32|f64|i16|i32|i64
+  declare float @dx.op.cbufferLoad.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; resource handle
+      i32,                  ; byte offset from the start of the buffer memory
+      i32)                  ; read alignment
+
+Valid resource types: ConstantBuffer.
+
+The operation loads a value from a constant buffer, which has linear layout, using 1D index: byte offset from the beginning of the buffer memory. The operation respects SM5.1 and earlier OOB behavior for cbuffers.
+
+Read alignment is a constant value identifying what the byte offset alignment is. If the actual byte offset does not have this alignment, the results of this operation are undefined.
+
+GetDimensions
+~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  declare %dx.types.Dimensions @dx.op.getDimensions(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; resource handle
+      i32)                  ; MIP level
+
+This table describes the return component meanings for each resource type { c0, c1, c2, c3 }.
+
+==================== ===== ========== ========== ==========
+Valid resource types c0    c1         c2         c3
+==================== ===== ========== ========== ==========
+[RW]Texture1D        width undef      undef      MIP levels
+[RW]Texture1DArray   width array size undef      MIP levels
+[RW]Texture2D        width height     undef      MIP levels
+[RW]Texture2DArray   width height     array size MIP levels
+[RW]Texture3D        width height     depth      MIP levels
+[RW]Texture2DMS      width height     undef      samples
+[RW]Texture2DMSArray width height     array size samples
+TextureCUBE          width height     undef      MIP levels
+TextureCUBEArray     width height     array size MIP levels
+[RW]TypedBuffer      width undef      undef      undef
+[RW]RawBuffer        width undef      undef      undef
+[RW]StructuredBuffer width undef      undef      undef
+==================== ===== ========== ========== ==========
+
+MIP levels is always undef for RW resources.  Undef means the component will not be used.  The validator will verify this.
+There is no GetDimensions that returns float values.
+
+Sample
+~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32,  SM6.0: f16|f32
+  declare %dx.types.ResRet.f32 @dx.op.sample.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32,                  ; offset o2
+      float)                ; clamp
+
+=================== ================================ ===================
+Valid resource type # of active coordinates          # of active offsets
+=================== ================================ ===================
+Texture1D           1 (c0)                           1 (o0)
+Texture1DArray      2 (c0, c1 = array slice)         1 (o0)
+Texture2D           2 (c0, c1)                       2 (o0, o1)
+Texture2DArray      3 (c0, c1, c2 = array slice)     2 (o0, o1)
+Texture3D           3 (c0, c1, c2)                   3 (o0, o1, o2)
+TextureCUBE         3 (c0, c1, c2)                   3 (o0, o1, o2)
+TextureCUBEArray    4 (c0, c1, c2, c3 = array slice) 3 (o0, o1, o2)
+=================== ================================ ===================
+
+SampleBias
+~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32,  SM6.0: f16|f32
+  declare %dx.types.ResRet.f32 @dx.op.sampleBias.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32,                  ; offset o2
+      float,                ; bias: in [-16.f,15.99f]
+      float)                ; clamp
+
+Valid resource types and active components/offsets are the same as for the sample operation.
+
+SampleLevel
+~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32,  SM6.0: f16|f32
+  declare %dx.types.ResRet.f32 @dx.op.sampleLevel.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32,                  ; offset o2
+      float)                ; LOD
+
+Valid resource types and active components/offsets are the same as for the sample operation.
+
+SampleGrad
+~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32,  SM6.0: f16|f32
+  declare %dx.types.ResRet.f32 @dx.op.sampleGrad.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32,                  ; offset o2
+      float,                ; ddx0
+      float,                ; ddx1
+      float,                ; ddx2
+      float,                ; ddy0
+      float,                ; ddy1
+      float,                ; ddy2
+      float)                ; clamp
+
+Valid resource types and active components and offsets are the same as for the sample operation. Valid active ddx and ddy are the same as offsets.
+
+SampleCmp
+~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32,  SM6.0: f16|f32
+  declare %dx.types.ResRet.f32 @dx.op.sampleCmp.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32,                  ; offset o2
+      float,                ; compare value
+      float)                ; clamp
+
+=================== ================================ ===================
+Valid resource type # of active coordinates          # of active offsets
+=================== ================================ ===================
+Texture1D           1 (c0)                           1 (o0)
+Texture1DArray      2 (c0, c1 = array slice)         1 (o0)
+Texture2D           2 (c0, c1)                       2 (o0, o1)
+Texture2DArray      3 (c0, c1, c2 = array slice)     2 (o0, o1)
+TextureCUBE         3 (c0, c1, c2)                   3 (o0, o1, o2)
+TextureCUBEArray    4 (c0, c1, c2, c3 = array slice) 3 (o0, o1, o2)
+=================== ================================ ===================
+
+SampleCmpLevelZero
+~~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32,  SM6.0: f16|f32
+  declare %dx.types.ResRet.f32 @dx.op.sampleCmpLevelZero.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32,                  ; offset o2
+      float)                ; compare value
+
+Valid resource types and active components/offsets are the same as for the sampleCmp operation.
+
+TextureLoad
+~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
+  declare %dx.types.ResRet.f32 @dx.op.textureLoad.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      i32,                  ; MIP level; sample for Texture2DMS
+      i32,                  ; coordinate c0
+      i32,                  ; coordinate c1
+      i32,                  ; coordinate c2
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32)                  ; offset o2
+
+=================== ========= ============================ ===================
+Valid resource type MIP level # of active coordinates      # of active offsets
+=================== ========= ============================ ===================
+Texture1D           yes       1 (c0)                       1 (o0)
+RWTexture1D         undef     1 (c0)                       undef
+Texture1DArray      yes       2 (c0, c1 = array slice)     1 (o0)
+RWTexture1DArray    undef     2 (c0, c1 = array slice)     undef
+Texture2D           yes       2 (c0, c1)                   2 (o0, o1)
+RWTexture2D         undef     2 (c0, c1)                   undef
+Texture2DArray      yes       3 (c0, c1, c2 = array slice) 2 (o0, o1)
+RWTexture2DArray    undef     3 (c0, c1, c2 = array slice) undef
+Texture3D           yes       3 (c0, c1, c2)               3 (o0, o1, o2)
+RWTexture3D         undef     3 (c0, c1, c2)               undef
+=================== ========= ============================ ===================
+
+For Texture2DMS:
+
+=================== ============ =================================
+Valid resource type Sample index # of active coordinate components
+=================== ============ =================================
+Texture2DMS         yes          2 (c0, c1)
+Texture2DMSArray    yes          3 (c0, c1, c2 = array slice)
+=================== ============ =================================
+
+TextureStore
+~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
+  ; returns: status
+  declare void @dx.op.textureStore.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      i32,                  ; coordinate c0
+      i32,                  ; coordinate c1
+      i32,                  ; coordinate c2
+      float,                ; value v0
+      float,                ; value v1
+      float,                ; value v2
+      float,                ; value v3
+      i8)                   ; write mask
+
+The write mask indicates which components are written (x - 1, y - 2, z - 4, w - 8), similar to DXBC. The mask must cover all resource components.
+
+=================== =================================
+Valid resource type # of active coordinate components
+=================== =================================
+RWTexture1D         1 (c0)
+RWTexture1DArray    2 (c0, c1 = array slice)
+RWTexture2D         2 (c0, c1)
+RWTexture2DArray    3 (c0, c1, c2 = array slice)
+RWTexture3D         3 (c0, c1, c2)
+=================== =================================
+
+CalculateLOD
+~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; returns: LOD
+  declare float @dx.op.calculateLOD.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0, [0.0, 1.0]
+      float,                ; coordinate c1, [0.0, 1.0]
+      float,                ; coordinate c2, [0.0, 1.0]
+      i1)                   ; true - clamped; false - unclamped
+
+============================= =======================
+Valid resource type           # of active coordinates
+============================= =======================
+Texture1D, Texture1DArray     1 (c0)
+Texture2D, Texture2DArray     2 (c0, c1)
+Texture3D                     3 (c0, c1, c2)
+TextureCUBE, TextureCUBEArray 3 (c0, c1, c2)
+============================= =======================
+
+TextureGather
+~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
+  declare %dx.types.ResRet.f32 @dx.op.textureGather.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32)                  ; channel, constant in {0=red,1=green,2=blue,3=alpha}
+
+=================== ================================ ===================
+Valid resource type # of active coordinates          # of active offsets
+=================== ================================ ===================
+Texture2D           2 (c0, c1)                       2 (o0, o1)
+Texture2DArray      3 (c0, c1, c2 = array slice)     2 (o0, o1)
+TextureCUBE         3 (c0, c1, c2)                   0
+TextureCUBEArray    4 (c0, c1, c2, c3 = array slice) 0
+=================== ================================ ===================
+
+TextureGatherCmp
+~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32|i32,  SM6.0: f16|f32|i16|i32
+  declare %dx.types.ResRet.f32 @dx.op.textureGatherCmp.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      %dx.types.Handle,     ; sampler handle
+      float,                ; coordinate c0
+      float,                ; coordinate c1
+      float,                ; coordinate c2
+      float,                ; coordinate c3
+      i32,                  ; offset o0
+      i32,                  ; offset o1
+      i32,                  ; channel, constant in {0=red,1=green,2=blue,3=alpha}
+      float)                ; compare value
+
+Valid resource types and active components/offsets are the same as for the textureGather operation.
+
+Texture2DMSGetSamplePosition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  declare %dx.types.SamplePos @dx.op.texture2DMSGetSamplePosition(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; texture handle
+      i32)                  ; sample ID
+
+Returns sample position of a texture.
+
+RenderTargetGetSamplePosition
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  declare %dx.types.SamplePos @dx.op.renderTargetGetSamplePosition(
+      i32,                  ; opcode
+      i32)                  ; sample ID
+
+Returns sample position of a render target.
+
+RenderTargetGetSampleCount
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  declare i32 @dx.op.renderTargetGetSampleCount(
+      i32)                  ; opcode
+
+Returns sample count of a render target.
+
+BufferLoad
+~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32|i32,  SM6.0: f32|i32
+  declare %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; resource handle
+      i32,                  ; coordinate c0
+      i32)                  ; coordinate c1
+
+The call respects SM5.1 OOB and alignment rules.
+
+==================== =====================================================
+Valid resource type  # of active coordinates
+==================== =====================================================
+[RW]TypedBuffer      1 (c0 in elements)
+[RW]RawBuffer        1 (c0 in bytes)
+[RW]StructuredBuffer 2 (c0 in elements, c1 = byte offset into the element)
+==================== =====================================================
+
+BufferStore
+~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: f32|i32,  SM6.0: f32|i32
+  ; returns: status
+  declare void @dx.op.bufferStore.f32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; resource handle
+      i32,                  ; coordinate c0
+      i32,                  ; coordinate c1
+      float,                ; value v0
+      float,                ; value v1
+      float,                ; value v2
+      float,                ; value v3
+      i8)                   ; write mask
+
+The call respects SM5.1 OOB and alignment rules.
+
+The write mask indicates which components are written (x - 1, y - 2, z - 4, w - 8), similar to DXBC. For RWTypedBuffer, the mask must cover all resource components. For RWRawBuffer and RWStructuredBuffer, valid masks are: x, xy, xyz, xyzw.
+
+=================== =====================================================
+Valid resource type # of active coordinates
+=================== =====================================================
+RWTypedBuffer       1 (c0 in elements)
+RWRawBuffer         1 (c0 in bytes)
+RWStructuredBuffer  2 (c0 in elements, c1 = byte offset into the element)
+=================== =====================================================
+
+BufferUpdateCounter
+~~~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; opcodes: bufferUpdateCounter
+  declare void @dx.op.bufferUpdateCounter(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; buffer handle
+      i8)                   ; 1 - increment, -1 - decrement
+
+Valid resource type: RWRawBuffer.
+
+AtomicBinOp
+~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: i32,  SM6.0: i32
+  ; returns: original value in memory before the operation
+  declare i32 @dx.op.atomicBinOp.i32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; resource handle
+      i32,                  ; binary operation code: EXCHANGE, IADD, AND, OR, XOR, IMIN, IMAX, UMIN, UMAX
+      i32,                  ; coordinate c0
+      i32,                  ; coordinate c1
+      i32,                  ; coordinate c2
+      i32)                  ; new value
+
+The call respects SM5.1 OOB and alignment rules.
+
+=================== =====================================================
+Valid resource type # of active coordinates
+=================== =====================================================
+RWTexture1D         1 (c0)
+RWTexture1DArray    2 (c0, c1 = array slice)
+RWTexture2D         2 (c0, c1)
+RWTexture2DArray    3 (c0, c1, c2 = array slice)
+RWTexture3D         3 (c0, c1, c2)
+RWTypedBuffer       1 (c0 in elements)
+RWRawBuffer         1 (c0 in bytes)
+RWStructuredBuffer  2 (c0 in elements, c1 - byte offset into the element)
+=================== =====================================================
+
+AtomicBinOp subsumes corresponding DXBC atomic operations that do not return the old value in memory. The driver compiler is free to specialize the corresponding GPU instruction if the return value is unused.
+
+AtomicCompareExchange
+~~~~~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; overloads: SM5.1: i32,  SM6.0: i32
+  ; returns: original value in memory before the operation
+  declare i32 @dx.op.atomicCompareExchange.i32(
+      i32,                  ; opcode
+      %dx.types.Handle,     ; resource handle
+      i32,                  ; coordinate c0
+      i32,                  ; coordinate c1
+      i32,                  ; coordinate c2
+      i32,                  ; comparison value
+      i32)                  ; new value
+
+The call respects SM5.1 OOB and alignment rules.
+
+=================== =====================================================
+Valid resource type # of active coordinates
+=================== =====================================================
+RWTexture1D         1 (c0)
+RWTexture1DArray    2 (c0, c1 = array slice)
+RWTexture2D         2 (c0, c1)
+RWTexture2DArray    3 (c0, c1, c2 = array slice)
+RWTexture3D         3 (c0, c1, c2)
+RWTypedBuffer       1 (c0 in elements)
+RWRawBuffer         1 (c0 in bytes)
+RWStructuredBuffer  2 (c0 in elements, c1 - byte offset into the element)
+=================== =====================================================
+
+AtomicCompareExchange subsumes DXBC's atomic compare store. The driver compiler is free to specialize the corresponding GPU instruction if the return value is unused.
+
+GetBufferBasePtr (SM6.0)
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following signature shows the operation syntax::
+
+  ; Returns i8* pointer to the base of [RW]RawBuffer instance.
+  declare i8 addrspace(ASmemory) * @dx.op.getBufferBasePtr.pASmemory(
+      i32,                ; opcode
+      %dx.types.Handle)   ; resource handle
+  ; Returns i8* pointer to the base of ConstantBuffer instance.
+  declare i8 addrspace(AScbuffer) * @dx.op.getBufferBasePtr.pAScbuffer(
+      i32,                ; opcode
+      %dx.types.Handle)   ; resource handle
+
+Given SM5.1 resource handle, return base pointer to perform pointer-based accesses to the resource memory.
+
+Note: the functionality is requested for SM6.0 to support pointer-based accesses to SM5.1 resources with raw linear memory (raw buffer and cbuffer) in HLSL next. This would be one of the ways in which a valid pointer is produced in the shader, and would let new-style, pointer-based code access SM5.1 resources with a linear memory view.
+
+Atomic operations via pointer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Groupshared memory atomic operations are done via LLVM atomic instructions atomicrmw and cmpxchg. The instructions accept only i32 addrspace(ASgs) * pointers, where ASgs is the addrspace number of groupshared variables. Atomicrmw instruction does not support 'sub' and 'nand' operations. These constraints may be revisited in the future. OOB behavior is undefined.
+SM6.0 will enable a similar mechanism for atomic operations performed on device memory (raw buffer).
+
+Samplers
+--------
+
+There are no intrinsics for samplers. Sampler reflection data is represented similarly to other resources.
+
+Immediate Constant Buffer
+-------------------------
+There is no immediate constant buffer in DXIL. Instead, indexable constants are represented via LLVM global initialized constants in address space ASicb.
+
+Texture Buffers
+---------------
+A texture buffer is mapped to RawBuffer. Texture buffer variable declarations are present for reflection purposes only.
+
+Groupshared memory
+------------------
+Groupshared memory (DXBC g-registers) is linear in DXIL. Groupshared variables are declared via global variables in addrspace(ASgs). The optimizer will not group variables; the driver compiler can do this if desired. Accesses to groupshared variables occur via pointer load/store instructions (see below).
+
+Indexable threadlocal memory
+----------------------------
+Indexable threadlocal memory (DXBC x-registers) is linear in DXIL. Threadlocal variables are "declared" via alloca instructions. Threadlocal variables are assumed to reside in addrspace(0). The variables are not allocated into some memory pool; the driver compiler can do this, if desired. Accesses to threadlocal variables occur via pointer load/store instructions (see below).
+
+Load/Store/Atomics via pointer in future SM
+-------------------------------------------
+HLSL offers several abstractions with linear memory: buffers, cbuffers, groupshared and indexable threadlocal memory, that are conceptually similar, but have different HLSL syntax and some differences in behavior, which are exposed to HLSL developers. The plan is to introduce pointers into HLSL to unify access syntax to such linear-memory resources such that they appear conceptually the same to HLSL programmers.
+
+Each resource memory type is expressed by a unique LLVM address space. The following table shows memory types and their address spaces:
+
+========================================= =====================================
+Memory type                               Address space number n - addrspace(n)
+========================================= =====================================
+code, local, indexable threadlocal memory AS_default = 0
+device memory ([RW]RawBuffer)             AS_memory = 1
+cbuffer-like memory (ConstantBuffer)      AS_cbuffer = 2
+groupshared memory                        AS_groupshared = 3
+========================================= =====================================
+
+Pointers can be produced in the shader in a variety of ways (see Memory accesses section). Note that if GetBufferBasePtr was used on [RW]RawBuffer or ConstantBuffer to produce a pointer, the base pointer is stateless; i.e., it "loses its connection" to the underlying resource and is treated as a stateless pointer into a particular memory type.
+
+Additional resource properties
+------------------------------
+TODO: enumerate all additional resource range properties, e.g., ROV, Texture2DMS, globally coherent, UAV counter, sampler mode, CB: immediate/dynamic indexed.
+
+Operations
+==========
+DXIL operations are represented in two ways: using LLVM instructions and using LLVM external functions. The reference list of operations as well as their overloads can be found in the attached Excel spreadsheet "DXIL Operations".
+
+Operations via instructions
+---------------------------
+
+DXIL uses a subset of core LLVM IR instructions that make sense for HLSL, where the meaning of the LLVM IR operation matches the meaning of the HLSL operation.
+
+The following LLVM instructions are valid in a DXIL program, with the specified operand types where applicable. The legend for overload types is: (v)oid, (h)alf, (f)loat, (d)ouble, (1)-bit, (8)-bit, (w)ord, (i)nt, (l)ong.
+
+
+.. <py>import hctdb_instrhelp</py>
+.. <py::lines('INSTR-RST')>hctdb_instrhelp.get_instrs_rst()</py>
+.. INSTR-RST:BEGIN
+
+============= ======================================================================= =================
+Instruction   Action                                                                  Operand overloads
+============= ======================================================================= =================
+Ret           returns a value (possibly void), from a function.                       vhfd1wil
+Br            branches (conditional or unconditional)
+Switch        performs a multiway switch
+Add           returns the sum of its two operands                                     wil
+FAdd          returns the sum of its two operands                                     hfd
+Sub           returns the difference of its two operands                              wil
+FSub          returns the difference of its two operands                              hfd
+Mul           returns the product of its two operands                                 wil
+FMul          returns the product of its two operands                                 hfd
+UDiv          returns the quotient of its two unsigned operands                       wil
+SDiv          returns the quotient of its two signed operands                         wil
+FDiv          returns the quotient of its two operands                                hfd
+URem          returns the remainder from the unsigned division of its two operands    wil
+SRem          returns the remainder from the signed division of its two operands      wil
+FRem          returns the remainder from the division of its two operands             hfd
+Shl           shifts left (logical)                                                   wil
+LShr          shifts right (logical), with zero bit fill                              wil
+AShr          shifts right (arithmetic), with 'a' operand sign bit fill               wil
+And           returns a bitwise logical and of its two operands                       1wil
+Or            returns a bitwise logical or of its two operands                        1wil
+Xor           returns a bitwise logical xor of its two operands                       1wil
+Alloca        allocates memory on the stack frame of the currently executing function
+Load          reads from memory
+Store         writes to memory
+GetElementPtr gets the address of a subelement of an aggregate value
+AtomicCmpXchg atomically modifies memory
+AtomicRMW     atomically modifies memory
+Trunc         truncates an integer                                                    1wil
+ZExt          zero extends an integer                                                 1wil
+SExt          sign extends an integer                                                 1wil
+FPToUI        converts a floating point to UInt                                       hfd1wil
+FPToSI        converts a floating point to SInt                                       hfd1wil
+UIToFP        converts a UInt to floating point                                       hfd1wil
+SIToFP        converts a SInt to floating point                                       hfd1wil
+FPTrunc       truncates a floating point                                              hfd
+FPExt         extends a floating point                                                hfd
+BitCast       performs a bit-preserving type cast                                     hfd1wil
+AddrSpaceCast casts a value addrspace
+ICmp          compares integers                                                       1wil
+FCmp          compares floating points                                                hfd
+PHI           is a PHI node instruction
+Call          calls a function
+Select        selects an instruction
+ExtractValue  extracts from aggregate
+============= ======================================================================= =================
+
+
+.. INSTR-RST:END
+
+Operations via external functions
+---------------------------------
+Operations missing in core LLVM IR, such as abs, fma, discard, etc., are represented by external functions, whose name is prefixed with dx.op.
+
+The very first parameter of each such external function is the opcode of the operation, which is an i32 constant. For example, dx.op.unary computes a unary function T res = opcode(T input). Opcode defines which unary function to perform.
+
+Opcodes are defined on a dense range and will be provided as enum in a header file. The opcode parameter is introduced for efficiency reasons: grouping of operations to reduce the total number of overloads and more efficient property lookup, e.g., via an array of operation properties rather than a hash table.
+
+.. <py::lines('OPCODES-RST')>hctdb_instrhelp.get_opcodes_rst()</py>
+.. OPCODES-RST:BEGIN
+
+=== ============================= =================================================================================================================
+ID  Name                          Description
+=== ============================= =================================================================================================================
+0   TempRegLoad_                  Helper load operation
+1   TempRegStore_                 Helper store operation
+2   MinPrecXRegLoad_              Helper load operation for minprecision
+3   MinPrecXRegStore_             Helper store operation for minprecision
+4   LoadInput_                    Loads the value from shader input
+5   StoreOutput_                  Stores the value to shader output
+6   FAbs_                         returns the absolute value of the input value.
+7   Saturate_                     clamps the result of a single or double precision floating point value to [0.0f...1.0f]
+8   IsNaN_                        Returns true if x is NAN or QNAN, false otherwise.
+9   IsInf_                        Returns true if x is +INF or -INF, false otherwise.
+10  IsFinite_                     Returns true if x is finite, false otherwise.
+11  IsNormal_                     returns IsNormal
+12  Cos_                          returns cosine(theta) for theta in radians.
+13  Sin_                          returns sine(theta) for theta in radians.
+14  Tan_                          returns tan(theta) for theta in radians.
+15  Acos_                         Returns the arccosine of the specified value. Input should be a floating-point value within the range of -1 to 1.
+16  Asin_                         Returns the arcsine of the specified value. Input should be a floating-point value within the range of -1 to 1.
+17  Atan_                         Returns the arctangent of the specified value. The return value is within the range of -PI/2 to PI/2.
+18  Hcos_                         returns the hyperbolic cosine of the specified value.
+19  Hsin_                         returns the hyperbolic sine of the specified value.
+20  Htan_                         returns the hyperbolic tangent of the specified value.
+21  Exp_                          returns 2^exponent
+22  Frc_                          extract fractional component.
+23  Log_                          returns log base 2.
+24  Sqrt_                         returns square root
+25  Rsqrt_                        returns reciprocal square root (1 / sqrt(src))
+26  Round_ne_                     floating-point round to integral float.
+27  Round_ni_                     floating-point round to integral float.
+28  Round_pi_                     floating-point round to integral float.
+29  Round_z_                      floating-point round to integral float.
+30  Bfrev_                        Reverses the order of the bits.
+31  Countbits_                    Counts the number of bits in the input integer.
+32  FirstbitLo_                   Returns the location of the first set bit starting from the lowest order bit and working upward.
+33  FirstbitHi_                   Returns the location of the first set bit starting from the highest order bit and working downward.
+34  FirstbitSHi_                  Returns the location of the first set bit from the highest order bit based on the sign.
+35  FMax_                         returns a if a >= b, else b
+36  FMin_                         returns a if a < b, else b
+37  IMax_                         IMax(a,b) returns a if a > b, else b
+38  IMin_                         IMin(a,b) returns a if a < b, else b
+39  UMax_                         unsigned integer maximum. UMax(a,b) = a > b ? a : b
+40  UMin_                         unsigned integer minimum. UMin(a,b) = a < b ? a : b
+41  IMul_                         multiply of 32-bit operands to produce the correct full 64-bit result.
+42  UMul_                         multiply of 32-bit operands to produce the correct full 64-bit result.
+43  UDiv_                         unsigned divide of the 32-bit operand src0 by the 32-bit operand src1.
+44  UAddc_                        unsigned add of 32-bit operand with the carry
+45  USubb_                        unsigned subtract of 32-bit operands with the borrow
+46  FMad_                         floating point multiply & add
+47  Fma_                          fused multiply-add
+48  IMad_                         Signed integer multiply & add
+49  UMad_                         Unsigned integer multiply & add
+50  Msad_                         masked Sum of Absolute Differences.
+51  Ibfe_                         Integer bitfield extract
+52  Ubfe_                         Unsigned integer bitfield extract
+53  Bfi_                          Given a bit range from the LSB of a number, places that number of bits in another number at any offset
+54  Dot2_                         Two-dimensional vector dot-product
+55  Dot3_                         Three-dimensional vector dot-product
+56  Dot4_                         Four-dimensional vector dot-product
+57  CreateHandle                  creates the handle to a resource
+58  CBufferLoad                   loads a value from a constant buffer resource
+59  CBufferLoadLegacy             loads a value from a constant buffer resource
+60  Sample                        samples a texture
+61  SampleBias                    samples a texture after applying the input bias to the mipmap level
+62  SampleLevel                   samples a texture using a mipmap-level offset
+63  SampleGrad                    samples a texture using a gradient to influence the way the sample location is calculated
+64  SampleCmp                     samples a texture and compares a single component against the specified comparison value
+65  SampleCmpLevelZero            samples a texture and compares a single component against the specified comparison value
+66  TextureLoad                   reads texel data without any filtering or sampling
+67  TextureStore                  writes texel data without any filtering or sampling
+68  BufferLoad                    reads from a TypedBuffer
+69  BufferStore                   writes to a RWTypedBuffer
+70  BufferUpdateCounter           atomically increments/decrements the hidden 32-bit counter stored with a Count or Append UAV
+71  CheckAccessFullyMapped        determines whether all values from a Sample, Gather, or Load operation accessed mapped tiles in a tiled resource
+72  GetDimensions                 gets texture size information
+73  TextureGather                 gathers the four texels that would be used in a bi-linear filtering operation
+74  TextureGatherCmp              same as TextureGather, except this instruction performs comparison on texels, similar to SampleCmp
+75  Texture2DMSGetSamplePosition  gets the position of the specified sample
+76  RenderTargetGetSamplePosition gets the position of the specified sample
+77  RenderTargetGetSampleCount    gets the number of samples for a render target
+78  AtomicBinOp                   performs an atomic operation on two operands
+79  AtomicCompareExchange         atomic compare and exchange to memory
+80  Barrier                       inserts a memory barrier in the shader
+81  CalculateLOD                  calculates the level of detail
+82  Discard                       discard the current pixel
+83  DerivCoarseX_                 computes the rate of change per stamp in x direction.
+84  DerivCoarseY_                 computes the rate of change per stamp in y direction.
+85  DerivFineX_                   computes the rate of change per pixel in x direction.
+86  DerivFineY_                   computes the rate of change per pixel in y direction.
+87  EvalSnapped                   evaluates an input attribute at pixel center with an offset
+88  EvalSampleIndex               evaluates an input attribute at a sample location
+89  EvalCentroid                  evaluates an input attribute at pixel center
+90  SampleIndex                   returns the sample index in a sample-frequency pixel shader
+91  Coverage                      returns the coverage mask input in a pixel shader
+92  InnerCoverage                 returns underestimated coverage input from conservative rasterization in a pixel shader
+93  ThreadId                      reads the thread ID
+94  GroupId                       reads the group ID (SV_GroupID)
+95  ThreadIdInGroup               reads the thread ID within the group (SV_GroupThreadID)
+96  FlattenedThreadIdInGroup      provides a flattened index for a given thread within a given group (SV_GroupIndex)
+97  EmitStream                    emits a vertex to a given stream
+98  CutStream                     completes the current primitive topology at the specified stream
+99  EmitThenCutStream             equivalent to an EmitStream followed by a CutStream
+100 GSInstanceID                  GSInstanceID
+101 MakeDouble                    creates a double value
+102 SplitDouble                   splits a double into low and high parts
+103 LoadOutputControlPoint        LoadOutputControlPoint
+104 LoadPatchConstant             LoadPatchConstant
+105 DomainLocation                DomainLocation
+106 StorePatchConstant            StorePatchConstant
+107 OutputControlPointID          OutputControlPointID
+108 PrimitiveID                   PrimitiveID
+109 CycleCounterLegacy            CycleCounterLegacy
+110 WaveIsFirstLane               returns 1 for the first lane in the wave
+111 WaveGetLaneIndex              returns the index of the current lane in the wave
+112 WaveGetLaneCount              returns the number of lanes in the wave
+113 WaveAnyTrue                   returns 1 if any of the lane evaluates the value to true
+114 WaveAllTrue                   returns 1 if all the lanes evaluate the value to true
+115 WaveActiveAllEqual            returns 1 if all the lanes have the same value
+116 WaveActiveBallot              returns a struct with a bit set for each lane where the condition is true
+117 WaveReadLaneAt                returns the value from the specified lane
+118 WaveReadLaneFirst             returns the value from the first lane
+119 WaveActiveOp                  returns the result of the operation across waves
+120 WaveActiveBit                 returns the result of the operation across all lanes
+121 WavePrefixOp                  returns the result of the operation on prior lanes
+122 QuadReadLaneAt                reads from a lane in the quad
+123 QuadOp                        returns the result of a quad-level operation
+124 BitcastI16toF16               bitcast between different sizes
+125 BitcastF16toI16               bitcast between different sizes
+126 BitcastI32toF32               bitcast between different sizes
+127 BitcastF32toI32               bitcast between different sizes
+128 BitcastI64toF64               bitcast between different sizes
+129 BitcastF64toI64               bitcast between different sizes
+130 LegacyF32ToF16                legacy function to convert float (f32) to half (f16) (this is not related to min-precision)
+131 LegacyF16ToF32                legacy function to convert half (f16) to float (f32) (this is not related to min-precision)
+132 LegacyDoubleToFloat           legacy function to convert double to float
+133 LegacyDoubleToSInt32          legacy function to convert double to int32
+134 LegacyDoubleToUInt32          legacy function to convert double to uint32
+135 WaveAllBitCount               returns the count of bits set to 1 across the wave
+136 WavePrefixBitCount            returns the count of bits set to 1 on prior lanes
+137 AttributeAtVertex_            returns the values of the attributes at the vertex.
+138 ViewID                        returns the view index
+=== ============================= =================================================================================================================
+
+
+Acos
+~~~~
+
+The return value is within the range of 0 to PI.
+
++----------+------+--------------+---------+------+------+---------+------+-----+
+| src      | -inf | [-1,1]       | -denorm | -0   | +0   | +denorm | +inf | NaN |
++----------+------+--------------+---------+------+------+---------+------+-----+
+| acos(src)|  NaN | [0,PI]       |    PI/2 | PI/2 | PI/2 |    PI/2 |  NaN | NaN |
++----------+------+--------------+---------+------+------+---------+------+-----+
+
+Asin
+~~~~
+
+The return value is within the range of -PI/2 to PI/2.
+
++----------+------+--------------+---------+------+------+---------+------+-----+
+| src      | -inf | [-1,1]       | -denorm | -0   | +0   | +denorm | +inf | NaN |
++----------+------+--------------+---------+------+------+---------+------+-----+
+| asin(src)|  NaN | (-PI/2,+PI/2)|    0    |  0   |  0   |    0    |  NaN | NaN |
++----------+------+--------------+---------+------+------+---------+------+-----+
+
+Atan
+~~~~
+
++----------+------+--------------+---------+------+------+---------+---------------+-----+-----+
+| src      | -inf | -F           | -denorm | -0   | +0   | +denorm | +F            |+inf | NaN |
++----------+------+--------------+---------+------+------+---------+---------------+-----+-----+
+| atan(src)| -PI/2| (-PI/2,+PI/2)|    0    |  0   |  0   |    0    | (-PI/2,+PI/2) |PI/2 | NaN |
++----------+------+--------------+---------+------+------+---------+---------------+-----+-----+
+
+Returns the arctangent of the specified value. The return value is within the range of -PI/2 to PI/2.
+
+AttributeAtVertex
+~~~~~~~~~~~~~~~~~
+
+Returns the values of the attributes at the vertex. VertexID ranges from 0 to 2.
+
+Bfi
+~~~
+
+Given a bit range from the LSB of a number, place that number of bits in another number at any offset.
+
+dst = Bfi(src0, src1, src2, src3);
+
+The LSB 5 bits of src0 provide the bitfield width (0-31) to take from src2.
+The LSB 5 bits of src1 provide the bitfield offset (0-31) to start replacing bits in the number read from src3.
+Given width, offset: bitmask = (((1 << width)-1) << offset) & 0xffffffff, dest = ((src2 << offset) & bitmask) | (src3 & ~bitmask)
+
+Bfrev
+~~~~~
+
+Reverses the order of the bits. For example, given 0x12345678 the result would be 0x1e6a2c48.
+
+Cos
+~~~
+
+Theta values can be any IEEE 32-bit floating point values.
+
+The maximum absolute error is 0.0008 in the interval from -100*Pi to +100*Pi.
+
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| cos(src) |  NaN | [-1 to +1] |      +1 | +1 | +1 |      +1 | [-1 to +1] |  NaN | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+
+Countbits
+~~~~~~~~~
+
+Counts the number of bits in the input integer.
+
+DerivCoarseX
+~~~~~~~~~~~~
+
+dst = DerivCoarseX(src);
+
+Computes the rate of change per stamp in x direction. Only a single x derivative pair is computed for each 2x2 stamp of pixels.
+The data in the current Pixel Shader invocation may or may not participate in the calculation of the requested derivative, given the derivative will be calculated only once per 2x2 quad:
+As an example, the x derivative could be a delta from the top row of pixels.
+The exact calculation is up to the hardware vendor. There is also no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
+
+DerivCoarseY
+~~~~~~~~~~~~
+
+dst = DerivCoarseY(src);
+
+Computes the rate of change per stamp in y direction. Only a single y derivative pair is computed for each 2x2 stamp of pixels.
+The data in the current Pixel Shader invocation may or may not participate in the calculation of the requested derivative, given the derivative will be calculated only once per 2x2 quad:
+As an example, the y derivative could be a delta from the left column of pixels.
+The exact calculation is up to the hardware vendor. There is also no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
+
+DerivFineX
+~~~~~~~~~~
+
+dst = DerivFineX(src);
+
+Computes the rate of change per pixel in x direction. Each pixel in the 2x2 stamp gets a unique pair of x derivative calculations.
+The data in the current Pixel Shader invocation always participates in the calculation of the requested derivative.
+There is no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
+
+DerivFineY
+~~~~~~~~~~
+
+dst = DerivFineY(src);
+
+Computes the rate of change per pixel in y direction. Each pixel in the 2x2 stamp gets a unique pair of y derivative calculations.
+The data in the current Pixel Shader invocation always participates in the calculation of the requested derivative.
+There is no specification dictating how the 2x2 quads will be aligned/tiled over a primitive.
+
+Dot2
+~~~~
+
+Two-dimensional vector dot-product
+
+Dot3
+~~~~
+
+Three-dimensional vector dot-product
+
+Dot4
+~~~~
+
+Four-dimensional vector dot-product
+
+Exp
+~~~
+
+Returns 2^exponent. Note that the HLSL exp intrinsic returns the base-e exponential. Maximum relative error is e^-21.
+
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| exp(src) |  0   | +F         |    1    |  1 |  1 |       1 | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+
+FAbs
+~~~~
+
+The FAbs instruction simply forces the sign of the number(s) in the source operand to positive, including on INF and denorm values.
+Applying FAbs on NaN preserves NaN, although the particular NaN bit pattern that results is not defined.
+
+FMad
+~~~~
+
+Floating point multiply & add. This operation is not fused for "precise" operations.
+FMad(a,b,c) = a * b + c
+
+FMax
+~~~~
+
+>= is used instead of > so that if min(x,y) = x then max(x,y) = y.
+
+NaN has special handling: If one source operand is NaN, then the other source operand is returned.
+If both are NaN, any NaN representation is returned.
+This conforms to new IEEE 754R rules.
+
+Denorms are flushed (sign preserved) before comparison, however the result written to dest may or may not be denorm flushed.
+
++------+-----------------------------+
+| a    | b                           |
+|      +------+--------+------+------+
+|      | -inf | F      | +inf | NaN  |
++------+------+--------+------+------+
+| -inf | -inf | b      | +inf | -inf |
++------+------+--------+------+------+
+| F    | a    | a or b | +inf | a    |
++------+------+--------+------+------+
+| +inf | +inf | +inf   | +inf | +inf |
++------+------+--------+------+------+
+| NaN  | -inf | b      | +inf | NaN  |
++------+------+--------+------+------+
+
+FMin
+~~~~
+
+NaN has special handling: If one source operand is NaN, then the other source operand is returned.
+If both are NaN, any NaN representation is returned.
+This conforms to new IEEE 754R rules.
+
+Denorms are flushed (sign preserved) before comparison, however the result written to dest may or may not be denorm flushed.
+
++------+-----------------------------+
+| a    | b                           |
+|      +------+--------+------+------+
+|      | -inf | F      | +inf | NaN  |
++------+------+--------+------+------+
+| -inf | -inf | -inf   | -inf | -inf |
++------+------+--------+------+------+
+| F    | -inf | a or b |    a |    a |
++------+------+--------+------+------+
+| +inf | -inf | b      | +inf | +inf |
++------+------+--------+------+------+
+| NaN  | -inf | b      | +inf | NaN  |
++------+------+--------+------+------+
+
+FirstbitHi
+~~~~~~~~~~
+
+Returns the integer position of the first bit set in the 32-bit input starting from the MSB. For example, 0x10000000 would return 3. Returns 0xffffffff if no match was found.
+
+FirstbitLo
+~~~~~~~~~~
+
+Returns the integer position of the first bit set in the 32-bit input starting from the LSB. For example, 0x00000002 would return 1. Returns 0xffffffff if no match was found.
+
+FirstbitSHi
+~~~~~~~~~~~
+
+Returns the first 0 from the MSB if the number is negative, else the first 1 from the MSB. Returns 0xffffffff if no match was found.
+
+Fma
+~~~
+
+Fused multiply-add. This operation is only defined in double precision.
+Fma(a,b,c) = a * b + c
+
+Frc
+~~~
+
++--------------+------+------+---------+----+----+---------+--------+------+-----+
+| src          | -inf | -F   | -denorm | -0 | +0 | +denorm | +F     | +inf | NaN |
++--------------+------+------+---------+----+----+---------+--------+------+-----+
+| frc(src)     | NaN  |[+0,1)| +0      | +0 | +0 | +0      | [+0,1) | NaN  | NaN |
++--------------+------+------+---------+----+----+---------+--------+------+-----+
+
+Hcos
+~~~~
+
+Returns the hyperbolic cosine of the specified value.
+
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| hcos(src)| +inf | (1, +inf)  |      +1 | +1 | +1 |      +1 | (1, +inf)  | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+
+Hsin
+~~~~
+
+Returns the hyperbolic sine of the specified value.
+
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| hsin(src)| -inf | -F         |       0 |  0 |  0 |       0 | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+
+Htan
+~~~~
+
+Returns the hyperbolic tangent of the specified value.
+
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| htan(src)| -1   | -F         |       0 |  0 |  0 |       0 | +F         | +1   | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+
+IMad
+~~~~
+
+Signed integer multiply & add
+
+IMad(a,b,c) = a * b + c
+
+IMax
+~~~~
+
+IMax(a,b) returns a if a > b, else b. Optional negate modifier on source operands takes 2's complement before performing operation.
+
+IMin
+~~~~
+
+IMin(a,b) returns a if a < b, else b. Optional negate modifier on source operands takes 2's complement before performing operation.
+
+IMul
+~~~~
+
+IMul(src0, src1) = destHi, destLo
+multiply of 32-bit operands src0 and src1 (note they are signed), producing the correct full 64-bit result.
+The low 32 bits are placed in destLO. The high 32 bits are placed in destHI.
+
+Either of destHI or destLO may be specified as NULL instead of specifying a register, in the case high or low 32 bits of the 64-bit result are not needed.
+
+Optional negate modifier on source operands takes 2's complement before performing arithmetic operation.
+
+Ibfe
+~~~~
+
+dest = Ibfe(src0, src1, src2)
+
+Given a range of bits in a number, shift those bits to the LSB and sign extend the MSB of the range.
+
+width : The LSB 5 bits of src0 (0-31).
+
+offset: The LSB 5 bits of src1 (0-31)
+
+.. code:: c
+
+    if( width == 0 )
+    {
+        dest = 0
+    }
+    else if( width + offset < 32 )
+    {
+        shl dest, src2, 32-(width+offset)
+        ishr dest, dest, 32-width
+    }
+    else
+    {
+        ishr dest, src2, offset
+    }
+
+IsFinite
+~~~~~~~~
+
+Returns true if x is finite, false otherwise.
+
+IsInf
+~~~~~
+
+Returns true if x is +INF or -INF, false otherwise.
+
+IsNaN
+~~~~~
+
+Returns true if x is NAN or QNAN, false otherwise.
+
+IsNormal
+~~~~~~~~
+
+Returns IsNormal.
+
+LoadInput
+~~~~~~~~~
+
+Loads the value from shader input
+
+Log
+~~~
+
+Returns log base 2. Note that hlsl log intrinsic returns natural log.
+
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| log(src) |  NaN | NaN        |    -inf |-inf|-inf|    -inf |  F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+
+MinPrecXRegLoad
+~~~~~~~~~~~~~~~
+
+Helper load operation for minprecision
+
+MinPrecXRegStore
+~~~~~~~~~~~~~~~~
+
+Helper store operation for minprecision
+
+Msad
+~~~~
+
+Returns the masked Sum of Absolute Differences.
+
+dest = msad(ref, src, accum)
+
+ref: contains 4 packed 8-bit unsigned integers in 32 bits.
+
+src: contains 4 packed 8-bit unsigned integers in 32 bits.
+
+accum: a 32-bit unsigned integer, providing an existing accumulation.
+
+dest receives the result of the masked SAD operation added to the accumulation value.
+
+.. code:: c
+
+    UINT msad( UINT ref, UINT src, UINT accum )
+    {
+        for (UINT i = 0; i < 4; i++)
+        {
+            BYTE refByte, srcByte, absDiff;
+
+            refByte = (BYTE)(ref >> (i * 8));
+            if (!refByte)
+            {
+                continue;
+            }
+
+            srcByte = (BYTE)(src >> (i * 8));
+            if (refByte >= srcByte)
+            {
+                absDiff = refByte - srcByte;
+            }
+            else
+            {
+                absDiff = srcByte - refByte;
+            }
+
+            // The recommended overflow behavior for MSAD is
+            // to do a 32-bit saturate. This is not
+            // required, however, and wrapping is allowed.
+            // So from an application point of view,
+            // overflow behavior is undefined.
+            if (UINT_MAX - accum < absDiff)
+            {
+                accum = UINT_MAX;
+                break;
+            }
+
+            accum += absDiff;
+        }
+
+        return accum;
+    }
+
+Round_ne
+~~~~~~~~
+
+Floating-point round of the values in src,
+writing integral floating-point values to dest.
+
+round_ne rounds to the nearest integral value. Halfway cases are rounded to the nearest even value.
+
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| round_ne(src)| -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+
+Round_ni
+~~~~~~~~
+
+Floating-point round of the values in src,
+writing integral floating-point values to dest.
+
+round_ni rounds towards -INF, commonly known as floor().
+
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| round_ni(src)| -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+
+Round_pi
+~~~~~~~~
+
+Floating-point round of the values in src,
+writing integral floating-point values to dest.
+
+round_pi rounds towards +INF, commonly known as ceil().
+
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| round_pi(src)| -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+
+Round_z
+~~~~~~~
+
+Floating-point round of the values in src,
+writing integral floating-point values to dest.
+
+round_z rounds towards zero.
+
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| round_z(src) | -inf | -F | -0      | -0 | +0 | +0      | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+
+Rsqrt
+~~~~~
+
+Maximum relative error is 2^-21.
+
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| rsqrt(src)   | NaN  | NaN| -inf    |-inf|+inf| +inf    | +F | +0   | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+
+Saturate
+~~~~~~~~
+
+The Saturate instruction performs the following operation on its input value:
+
+min(1.0f, max(0.0f, value))
+
+where min() and max() in the above expression behave in the way Min and Max behave.
+
+Saturate(NaN) returns 0, by the rules for min and max.
+
+Sin
+~~~
+
+Theta values can be any IEEE 32-bit floating point values.
+
+The maximum absolute error is 0.0008 in the interval from -100*Pi to +100*Pi.
+
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| src      | -inf | -F         | -denorm | -0 | +0 | +denorm | +F         | +inf | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+| sin(src) |  NaN | [-1 to +1] |      -0 | -0 | +0 |      +0 | [-1 to +1] |  NaN | NaN |
++----------+------+------------+---------+----+----+---------+------------+------+-----+
+
+Sqrt
+~~~~
+
+Precision is 1 ulp.
+
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| src          | -inf | -F | -denorm | -0 | +0 | +denorm | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+| sqrt(src)    | NaN  | NaN| -0      | -0 | +0 | +0      | +F | +inf | NaN |
++--------------+------+----+---------+----+----+---------+----+------+-----+
+
+StoreOutput
+~~~~~~~~~~~
+
+Stores the value to shader output
+
+Tan
+~~~
+
+Theta values can be any IEEE 32-bit floating point values.
+
++----------+----------+----------------+---------+----+----+---------+----------------+------+-----+
+| src      | -inf     | -F             | -denorm | -0 | +0 | +denorm | +F             | +inf | NaN |
++----------+----------+----------------+---------+----+----+---------+----------------+------+-----+
+| tan(src) | NaN      | [-inf to +inf] | -0      | -0 | +0 | +0      | [-inf to +inf] | NaN  | NaN |
++----------+----------+----------------+---------+----+----+---------+----------------+------+-----+
+
+TempRegLoad
+~~~~~~~~~~~
+
+Helper load operation
+
+TempRegStore
+~~~~~~~~~~~~
+
+Helper store operation
+
+UAddc
+~~~~~
+
+dest0, dest1 = UAddc(src0, src1)
+
+unsigned add of 32-bit operands src0 and src1, placing the LSB part of the 32-bit result in dest0.
+dest1 is written with: 1 if a carry is produced, 0 otherwise. Dest1 can be NULL if the carry is not needed
+
+UDiv
+~~~~
+
+destQUOT, destREM = UDiv(src0, src1);
+
+unsigned divide of the 32-bit operand src0 by the 32-bit operand src1.
+
+The results of the divides are the 32-bit quotients (placed in destQUOT) and 32-bit remainders (placed in destREM).
+
+Divide by zero returns 0xffffffff for both quotient and remainder.
+
+Either destQUOT or destREM may be specified as NULL instead of specifying a register, in the case the quotient or remainder are not needed.
+
+Unsigned subtract of 32-bit operands src1 from src0, placing the LSB part of the 32-bit result in dest0.
+dest1 is written with: 1 if a borrow is produced, 0 otherwise. Dest1 can be NULL if the borrow is not needed
+
+UMad
+~~~~
+
+Unsigned integer multiply & add.
+
+Umad(a,b,c) = a * b + c
+
+UMax
+~~~~
+
+unsigned integer maximum. UMax(a,b) = a > b ? a : b
+
+UMin
+~~~~
+
+unsigned integer minimum. UMin(a,b) = a < b ? a : b
+
+UMul
+~~~~
+
+multiply of 32-bit operands src0 and src1 (note they are unsigned), producing the correct full 64-bit result.
+The low 32 bits are placed in destLO. The high 32 bits are placed in destHI.
+Either of destHI or destLO may be specified as NULL instead of specifying a register, in the case high or low 32 bits of the 64-bit result are not needed
+
+USubb
+~~~~~
+
+dest0, dest1 = USubb(src0, src1)
+
+Ubfe
+~~~~
+
+dest = ubfe(src0, src1, src2)
+
+Given a range of bits in a number, shift those bits to the LSB and set remaining bits to 0.
+
+width : The LSB 5 bits of src0 (0-31).
+
+offset: The LSB 5 bits of src1 (0-31).
+
+Given width, offset:
+
+.. code:: c
+
+    if( width == 0 )
+    {
+        dest = 0
+    }
+    else if( width + offset < 32 )
+    {
+        shl dest, src2, 32-(width+offset)
+        ushr dest, dest, 32-width
+    }
+    else
+    {
+        ushr dest, src2, offset
+    }
+
+.. OPCODES-RST:END
+
+
+Custom instructions
+-------------------
+Instructions for third-party extensions will be specially-prefixed external function calls, identified by a declared extension-set-prefix. Additional metadata will be included to provide hints about uniformity, pure or const guarantees, alignment, etc.
+
+Validation Rules
+================
+
+The following rules are verified by the *Validator* component and thus can be relied upon by downstream consumers.
+
+The set of validation rules that are known to hold for a DXIL program is identified by the 'dx.valver' named metadata node, which consists of a two-element tuple of constant int values, a major and minor version. Minor version numbers are incremented as rules are added to a prior table or as the implementation fixes issues.
+
+.. <py::lines('VALRULES-RST')>hctdb_instrhelp.get_valrules_rst()</py>
+.. VALRULES-RST:BEGIN
+
+====================================== =======================================================================================================================================================================================================================================================================================================
+Rule Code                              Description
+====================================== =======================================================================================================================================================================================================================================================================================================
+BITCODE.VALID                          TODO - Module must be bitcode-valid
+CONTAINER.PARTINVALID                  DXIL Container must not contain unknown parts
+CONTAINER.PARTMATCHES                  DXIL Container Parts must match Module
+CONTAINER.PARTMISSING                  DXIL Container requires certain parts, corresponding to module
+CONTAINER.PARTREPEATED                 DXIL Container must have only one of each part type
+CONTAINER.ROOTSIGNATUREINCOMPATIBLE    Root Signature in DXIL Container must be compatible with shader
+DECL.DXILFNEXTERN                      External function must be a DXIL function
+DECL.DXILNSRESERVED                    The DXIL reserved prefixes must only be used by built-in functions and types
+DECL.FNFLATTENPARAM                    Function parameters must not use struct types
+DECL.FNISCALLED                        Functions can only be used by call instructions
+DECL.NOTUSEDEXTERNAL                   External declaration should not be used
+DECL.USEDEXTERNALFUNCTION              External function must be used
+DECL.USEDINTERNAL                      Internal declaration must be used
+FLOW.DEADLOOP                          Loop must have break
+FLOW.FUNCTIONCALL                      Function with parameter is not permitted
+FLOW.NORECUSION                        Recursion is not permitted
+FLOW.REDUCIBLE                         Execution flow must be reducible
+INSTR.ALLOWED                          Instructions must be of an allowed type
+INSTR.ATTRIBUTEATVERTEXNOINTERPOLATION Attribute %0 must have nointerpolation mode in order to use GetAttributeAtVertex function.
+INSTR.BARRIERMODEFORNONCS              sync in a non-Compute Shader must only sync UAV (sync_uglobal)
+INSTR.BARRIERMODENOMEMORY              sync must include some form of memory barrier - _u (UAV) and/or _g (Thread Group Shared Memory).  Only _t (thread group sync) is optional.
+INSTR.BARRIERMODEUSELESSUGROUP         sync can't specify both _ugroup and _uglobal. If both are needed, just specify _uglobal.
+INSTR.BUFFERUPDATECOUNTERONUAV         BufferUpdateCounter valid only on UAV
+INSTR.CALLOLOAD                        Call to DXIL intrinsic must match overload signature
+INSTR.CANNOTPULLPOSITION               pull-model evaluation of position disallowed
+INSTR.CBUFFERCLASSFORCBUFFERHANDLE     Expect Cbuffer for CBufferLoad handle
+INSTR.CBUFFEROUTOFBOUND                Cbuffer access out of bound
+INSTR.COORDINATECOUNTFORRAWTYPEDBUF    raw/typed buffer don't need 2 coordinates
+INSTR.COORDINATECOUNTFORSTRUCTBUF      structured buffer require 2 coordinates
+INSTR.DXILSTRUCTUSER                   Dxil struct types should only used by ExtractValue
+INSTR.DXILSTRUCTUSEROUTOFBOUND         Index out of bound when extract value from dxil struct types
+INSTR.EVALINTERPOLATIONMODE            Interpolation mode on %0 used with eval_* instruction must be linear, linear_centroid, linear_noperspective, linear_noperspective_centroid, linear_sample or linear_noperspective_sample
+INSTR.EXTRACTVALUE                     ExtractValue should only be used on dxil struct types and cmpxchg
+INSTR.FAILTORESLOVETGSMPOINTER         TGSM pointers must originate from an unambiguous TGSM global variable.
+INSTR.HANDLENOTFROMCREATEHANDLE        Resource handle should returned by createHandle
+INSTR.IMMBIASFORSAMPLEB                bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate
+INSTR.INBOUNDSACCESS                   Access to out-of-bounds memory is disallowed
+INSTR.MINPRECISIONNOTPRECISE           Instructions marked precise may not refer to minprecision values
+INSTR.MINPRECISONBITCAST               Bitcast on minprecison types is not allowed
+INSTR.MIPLEVELFORGETDIMENSION          Use mip level on buffer when GetDimensions
+INSTR.MIPONUAVLOAD                     uav load don't support mipLevel/sampleIndex
+INSTR.NOGENERICPTRADDRSPACECAST        Address space cast between pointer types must have one part to be generic address space
+INSTR.NOIDIVBYZERO                     No signed integer division by zero
+INSTR.NOINDEFINITEACOS                 No indefinite arccosine
+INSTR.NOINDEFINITEASIN                 No indefinite arcsine
+INSTR.NOINDEFINITEDSXY                 No indefinite derivative calculation
+INSTR.NOINDEFINITELOG                  No indefinite logarithm
+INSTR.NOREADINGUNINITIALIZED           Instructions should not read uninitialized value
+INSTR.NOUDIVBYZERO                     No unsigned integer division by zero
+INSTR.OFFSETONUAVLOAD                  uav load don't support offset
+INSTR.OLOAD                            DXIL intrinsic overload must be valid
+INSTR.ONLYONEALLOCCONSUME              RWStructuredBuffers may increment or decrement their counters, but not both.
+INSTR.OPCODERESERVED                   Instructions must not reference reserved opcodes
+INSTR.OPCONST                          DXIL intrinsic requires an immediate constant operand
+INSTR.OPCONSTRANGE                     Constant values must be in-range for operation
+INSTR.OPERANDRANGE                     DXIL intrinsic operand must be within defined range
+INSTR.PTRBITCAST                       Pointer type bitcast must be have same size
+INSTR.RESOURCECLASSFORLOAD             load can only run on UAV/SRV resource
+INSTR.RESOURCECLASSFORSAMPLERGATHER    sample, lod and gather should on srv resource.
+INSTR.RESOURCECLASSFORUAVSTORE         store should on uav resource.
+INSTR.RESOURCECOORDINATEMISS           coord uninitialized
+INSTR.RESOURCECOORDINATETOOMANY        out of bound coord must be undef
+INSTR.RESOURCEKINDFORBUFFERLOADSTORE   buffer load/store only works on Raw/Typed/StructuredBuffer
+INSTR.RESOURCEKINDFORCALCLOD           lod requires resource declared as texture1D/2D/3D/Cube/CubeArray/1DArray/2DArray
+INSTR.RESOURCEKINDFORGATHER            gather requires resource declared as texture/2D/Cube/2DArray/CubeArray
+INSTR.RESOURCEKINDFORGETDIM            Invalid resource kind on GetDimensions
+INSTR.RESOURCEKINDFORSAMPLE            sample/_l/_d requires resource declared as texture1D/2D/3D/Cube/1DArray/2DArray/CubeArray
+INSTR.RESOURCEKINDFORSAMPLEC           samplec requires resource declared as texture1D/2D/Cube/1DArray/2DArray/CubeArray
+INSTR.RESOURCEKINDFORTEXTURELOAD       texture load only works on Texture1D/1DArray/2D/2DArray/3D/MS2D/MS2DArray
+INSTR.RESOURCEKINDFORTEXTURESTORE      texture store only works on Texture1D/1DArray/2D/2DArray/3D
+INSTR.RESOURCEOFFSETMISS               offset uninitialized
+INSTR.RESOURCEOFFSETTOOMANY            out of bound offset must be undef
+INSTR.SAMPLECOMPTYPE                   sample_* instructions require resource to be declared to return UNORM, SNORM or FLOAT.
+INSTR.SAMPLEINDEXFORLOAD2DMS           load on Texture2DMS/2DMSArray require sampleIndex
+INSTR.SAMPLERMODEFORLOD                lod instruction requires sampler declared in default mode
+INSTR.SAMPLERMODEFORSAMPLE             sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode
+INSTR.SAMPLERMODEFORSAMPLEC            sample_c_*/gather_c instructions require sampler declared in comparison mode
+INSTR.STRUCTBITCAST                    Bitcast on struct types is not allowed
+INSTR.TEXTUREOFFSET                    offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7
+INSTR.TGSMRACECOND                     Race condition writing to shared memory detected, consider making this write conditional
+INSTR.UNDEFRESULTFORGETDIMENSION       GetDimensions used undef dimension %0 on %1
+INSTR.WRITEMASKFORTYPEDUAVSTORE        store on typed uav must write to all four components of the UAV
+INSTR.WRITEMASKMATCHVALUEFORUAVSTORE   uav store write mask must match store value mask, write mask is %0 and store value mask is %1
+META.BARYCENTRICSFLOAT3                only 'float3' type is allowed for SV_Barycentrics.
+META.BARYCENTRICSINTERPOLATION         SV_Barycentrics cannot be used with 'nointerpolation' type
+META.BARYCENTRICSTWOPERSPECTIVES       There can only be up to two input attributes of SV_Barycentrics with different perspective interpolation mode.
+META.BRANCHFLATTEN                     Can't use branch and flatten attributes together
+META.CLIPCULLMAXCOMPONENTS             Combined elements of SV_ClipDistance and SV_CullDistance must fit in 8 components
+META.CLIPCULLMAXROWS                   Combined elements of SV_ClipDistance and SV_CullDistance must fit in two rows.
+META.CONTROLFLOWHINTNOTONCONTROLFLOW   Control flow hint only works on control flow inst
+META.DENSERESIDS                       Resource identifiers must be zero-based and dense
+META.DUPLICATESYSVALUE                 System value may only appear once in signature
+META.ENTRYFUNCTION                     entrypoint not found
+META.FLAGSUSAGE                        Flags must match usage
+META.FORCECASEONSWITCH                 Attribute forcecase only works for switch
+META.FUNCTIONANNOTATION                Cannot find function annotation for %0
+META.GLCNOTONAPPENDCONSUME             globallycoherent cannot be used with append/consume buffers
+META.INTEGERINTERPMODE                 Interpolation mode on integer must be Constant
+META.INTERPMODEINONEROW                Interpolation mode must be identical for all elements packed into the same row.
+META.INTERPMODEVALID                   Interpolation mode must be valid
+META.INVALIDCONTROLFLOWHINT            Invalid control flow hint
+META.KNOWN                             Named metadata should be known
+META.MAXTESSFACTOR                     Hull Shader MaxTessFactor must be [%0..%1].  %2 specified
+META.NOSEMANTICOVERLAP                 Semantics must not overlap
+META.REQUIRED                          TODO - Required metadata missing
+META.SEMAKINDMATCHESNAME               Semantic name must match system value, when defined.
+META.SEMAKINDVALID                     Semantic kind must be valid
+META.SEMANTICCOMPTYPE                  %0 must be %1
+META.SEMANTICINDEXMAX                  System value semantics have a maximum valid semantic index
+META.SEMANTICLEN                       Semantic length must be at least 1 and at most 64
+META.SEMANTICSHOULDBEALLOCATED         Semantic should have a valid packing location
+META.SEMANTICSHOULDNOTBEALLOCATED      Semantic should have a packing location of -1
+META.SIGNATURECOMPTYPE                 signature %0 specifies unrecognized or invalid component type
+META.SIGNATUREILLEGALCOMPONENTORDER    Component ordering for packed elements must be: arbitrary < system value < system generated value
+META.SIGNATUREINDEXCONFLICT            Only elements with compatible indexing rules may be packed together
+META.SIGNATUREOUTOFRANGE               Signature elements must fit within maximum signature size
+META.SIGNATUREOVERLAP                  Signature elements may not overlap in packing location.
+META.STRUCTBUFALIGNMENT                StructuredBuffer stride not aligned
+META.STRUCTBUFALIGNMENTOUTOFBOUND      StructuredBuffer stride out of bounds
+META.SYSTEMVALUEROWS                   System value may only have 1 row
+META.TARGET                            Target triple must be 'dxil-ms-dx'
+META.TESSELLATOROUTPUTPRIMITIVE        Invalid Tessellator Output Primitive specified. Must be point, line, triangleCW or triangleCCW.
+META.TESSELLATORPARTITION              Invalid Tessellator Partitioning specified. Must be integer, pow2, fractional_odd or fractional_even.
+META.TEXTURETYPE                       elements of typed buffers and textures must fit in four 32-bit quantities
+META.USED                              All metadata must be used by dxil
+META.VALIDSAMPLERMODE                  Invalid sampler mode on sampler
+META.VALUERANGE                        Metadata value must be within range
+META.WELLFORMED                        TODO - Metadata must be well-formed in operand count and types
+SM.APPENDANDCONSUMEONSAMEUAV           BufferUpdateCounter inc and dec on a given UAV (%d) cannot both be in the same shader for shader model less than 5.1.
+SM.CBUFFERELEMENTOVERFLOW              CBuffer elements must not overflow
+SM.CBUFFEROFFSETOVERLAP                CBuffer offsets must not overlap
+SM.CBUFFERTEMPLATETYPEMUSTBESTRUCT     D3D12 constant/texture buffer template element can only be a struct
+SM.COMPLETEPOSITION                    Not all elements of SV_Position were written
+SM.COUNTERONLYONSTRUCTBUF              BufferUpdateCounter valid only on structured buffers
+SM.CSNORETURN                          Compute shaders can't return values, outputs must be written in writable resources (UAVs).
+SM.DOMAINLOCATIONIDXOOB                DomainLocation component index out of bounds for the domain.
+SM.DSINPUTCONTROLPOINTCOUNTRANGE       DS input control point count must be [0..%0].  %1 specified
+SM.DXILVERSION                         Target shader model requires specific Dxil Version
+SM.GSINSTANCECOUNTRANGE                GS instance count must be [1..%0].  %1 specified
+SM.GSOUTPUTVERTEXCOUNTRANGE            GS output vertex count must be [0..%0].  %1 specified
+SM.GSTOTALOUTPUTVERTEXDATARANGE        Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3
+SM.GSVALIDINPUTPRIMITIVE               GS input primitive unrecognized
+SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY      GS output primitive topology unrecognized
+SM.HSINPUTCONTROLPOINTCOUNTRANGE       HS input control point count must be [0..%0].  %1 specified
+SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH  For pass thru hull shader, input control point count must match output control point count
+SM.INSIDETESSFACTORSIZEMATCHDOMAIN     InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
+SM.INVALIDRESOURCECOMPTYPE             Invalid resource return type
+SM.INVALIDRESOURCEKIND                 Invalid resources kind
+SM.INVALIDTEXTUREKINDONUAV             Texture2DMS[Array] or TextureCube[Array] resources are not supported with UAVs
+SM.ISOLINEOUTPUTPRIMITIVEMISMATCH      Hull Shader declared with IsoLine Domain must specify output primitive point or line. Triangle_cw or triangle_ccw output are not compatible with the IsoLine Domain.
+SM.MAXTGSMSIZE                         Total Thread Group Shared Memory storage is %0, exceeded %1
+SM.MAXTHEADGROUP                       Declared Thread Group Count %0 (X*Y*Z) is beyond the valid maximum of %1
+SM.MULTISTREAMMUSTBEPOINT              When multiple GS output streams are used they must be pointlists
+SM.NAME                                Target shader model name must be known
+SM.NOINTERPMODE                        Interpolation mode must be undefined for VS input/PS output/patch constant.
+SM.NOPSOUTPUTIDX                       Pixel shader output registers are not indexable.
+SM.OPCODE                              Opcode must be defined in target shader model
+SM.OPCODEININVALIDFUNCTION             Invalid DXIL opcode usage like StorePatchConstant in patch constant function
+SM.OPERAND                             Operand must be defined in target shader model
+SM.OUTPUTCONTROLPOINTCOUNTRANGE        output control point count must be [0..%0].  %1 specified
+SM.OUTPUTCONTROLPOINTSTOTALSCALARS     Total number of scalars across all HS output control points must not exceed
+SM.PATCHCONSTANTONLYFORHSDS            patch constant signature only valid in HS and DS
+SM.PSCONSISTENTINTERP                  Interpolation mode for PS input position must be linear_noperspective_centroid or linear_noperspective_sample when outputting oDepthGE or oDepthLE and not running at sample frequency (which is forced by inputting SV_SampleIndex or declaring an input linear_sample or linear_noperspective_sample)
+SM.PSCOVERAGEANDINNERCOVERAGE          InnerCoverage and Coverage are mutually exclusive.
+SM.PSMULTIPLEDEPTHSEMANTIC             Pixel Shader only allows one type of depth semantic to be declared
+SM.PSOUTPUTSEMANTIC                    Pixel Shader allows output semantics to be SV_Target, SV_Depth, SV_DepthGreaterEqual, SV_DepthLessEqual, SV_Coverage or SV_StencilRef, %0 found
+SM.PSTARGETCOL0                        SV_Target packed location must start at column 0
+SM.PSTARGETINDEXMATCHESROW             SV_Target semantic index must match packed row location
+SM.RESOURCERANGEOVERLAP                Resource ranges must not overlap
+SM.ROVONLYINPS                         RasterizerOrdered objects are only allowed in 5.0+ pixel shaders
+SM.SAMPLECOUNTONLYON2DMS               Only Texture2DMS/2DMSArray could has sample count
+SM.SEMANTIC                            Semantic must be defined in target shader model
+SM.STREAMINDEXRANGE                    Stream index (%0) must between 0 and %1
+SM.TESSFACTORFORDOMAIN                 Required TessFactor for domain not found declared anywhere in Patch Constant data
+SM.TESSFACTORSIZEMATCHDOMAIN           TessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
+SM.THREADGROUPCHANNELRANGE             Declared Thread Group %0 size %1 outside valid range [%2..%3]
+SM.TRIOUTPUTPRIMITIVEMISMATCH          Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain
+SM.UNDEFINEDOUTPUT                     Not all elements of output %0 were written
+SM.VALIDDOMAIN                         Invalid Tessellator Domain specified. Must be isoline, tri or quad
+SM.VIEWIDNEEDSSLOT                     ViewID requires compatible space in pixel shader input signature
+SM.ZEROHSINPUTCONTROLPOINTWITHINPUT    When HS input control point count is 0, no input signature should exist
+TYPES.DEFINED                          Type must be defined based on DXIL primitives
+TYPES.I8                               I8 can only used as immediate value for intrinsic
+TYPES.INTWIDTH                         Int type must be of valid width
+TYPES.NOMULTIDIM                       Only one dimension allowed for array type
+TYPES.NOVECTOR                         Vector types must not be present
+UNI.NOWAVESENSITIVEGRADIENT            Gradient operations are not affected by wave-sensitive data or control flow.
+====================================== =======================================================================================================================================================================================================================================================================================================
+
+.. VALRULES-RST:END
+
+
+Modules and Linking
+===================
+
+HLSL has linking capabilities to enable third-party libraries. The linking step happens before shader DXIL is given to the driver compilers.
+Experimental library generation is added in DXIL 1.1. A library can be created by compiling with the lib_6_1 profile.
+A library is a DXIL container like the compile result of other shader profiles. The difference is that a library keeps information needed for linking, such as resource link info and entry function signatures.
+Library support is not part of the DXIL spec. The only requirement is that the linked shader must be valid DXIL.
+
+
+Additional Notes
+================
+
+These additional notes are not normative for DXIL, and are included for the convenience of implementers.
+
+Other Versioned Components
+--------------------------
+
+In addition to shader model, DXIL and bitcode representation versions, two other interesting versioned components are discussed: the supporting operating system and runtime, and the HLSL language.
+
+Support is provided in the Microsoft Windows family of operating systems, when running on the D3D12 runtime.
+
+The HLSL language is versioned independently of DXIL, and currently follows an 'HLSL <year>' naming scheme. HLSL 2015 is the dialect supported by the d3dcompiler_47 library; a limited form of support is provided in the open source HLSL on LLVM project. HLSL 2016 is the version supported by the current HLSL on LLVM project, which removes some features (primarily effect framework syntax, backquote operator) and adds new ones (wave intrinsics and basic i64 support).
+
+.. _dxil_container_format:
+
+DXIL Container Format
+---------------------
+
+DXIL is typically encapsulated in a DXIL container. A DXIL container is composed of a header, a sequence of part offsets, and a sequence of parts.
+
+The following C declaration describes this structure::
+
+  struct DxilContainerHeader {
+    uint32_t  HeaderFourCC;
+    uint8_t   Digest[DxilContainerHashSize];
+    uint16_t  MajorVersion;
+    uint16_t  MinorVersion;
+    uint32_t  ContainerSizeInBytes; // From start of this header
+    uint32_t  PartCount;
+    // Structure is followed by uint32_t PartOffset[PartCount];
+    // The offset is to a DxilPartHeader.
+  };
+
+Each part has a standard header, followed by a part-specific body::
+
+  struct DxilPartHeader {
+    uint32_t  PartFourCC; // Four char code for part type.
+    uint32_t  PartSize;   // Byte count for PartData.
+    // Structure is followed by uint8_t PartData[PartSize].
+  };
+
+The DXIL program is found in a part with the following body::
+
+  struct DxilProgramHeader {
+    uint32_t          ProgramVersion;   /// Major and minor version of shader, including type.
+    uint32_t          SizeInUint32;     /// Size in uint32_t units including this header.
+    uint32_t DxilMagic;       // 0x4C495844, ASCII "DXIL".
+    uint32_t DxilVersion;     // DXIL version.
+    uint32_t BitcodeOffset;   // Offset to LLVM bitcode (from DxilMagic).
+    uint32_t BitcodeSize;     // Size of LLVM bitcode.
+    // Followed by uint8_t[BitcodeHeader.BitcodeSize] after possible gap from BitcodeOffset
+  };
+
+The bitcode payload is defined as per bitcode encoding.
+
+Future Directions
+-----------------
+
+This section provides background on future directions for DXIL that may or may not materialize. They imply a new version of DXIL.
+
+It's desirable to support generic pointers, which can point to any of the other kinds of pointers. If the compiler fails to disambiguate, memory access is done via a generic pointer; the HLSL compiler will warn the user about each access that it cannot disambiguate. Not supported for SM6.
+
+HLSL will eventually support more primitive types such as i8, i16, i32, i64, half, float, double, as well as declspec(align(n)) and #pragma pack(n) directives. SM6.0 will eventually require byte-granularity access support in hardware, especially writes. Not supported for SM6.
+
+There will be a Requires32BitAlignedAccesses CAP flag. If absent, this would indicate that the shader requires writes that (1) do not write full four bytes, or (2) are not aligned on four-byte boundary. If hardware does not natively support these, the shader is rejected. Programmers can work around this hardware limitation by manually aligning smaller data on four-byte boundary in HLSL.
+
+When libraries are supported as first-class DXIL constructs, "lib_*" shader models can specify more than one entry point per module; the other shader models must specify exactly one entry point.
+
+The target machine specification for HLSL might specify a 64-bit pointer size with 64-bit offsets.
+
+Hardware support for generic pointers is essential for HLSL next, as a fallback mechanism for cases when the compiler cannot disambiguate a pointer's address space.
+
+Future DXIL will change how half and i16 are treated:
+* i16 will have to be supported natively either in hardware or via emulation,
+* half's behavior will depend on the value of RequiresHardwareHalf CAP; if it's not set, half can be treated as min-precision type (min16float); i.e., computation may be done with values implicitly promoted to floats; if it's set and hardware does not support half type natively, the driver compiler can either emulate exact IEEE half behavior or fail shader creation.
+
+Pending Specification Work
+==========================
+
+The following work on this specification is still pending:
+
+* Consider moving some additional tables and lists into hctdb and cross-reference.
+* Complete the extended documentation for instructions.
+
diff --git a/lib/HLSL/DxilOutputColorBecomesConstant.cpp b/lib/HLSL/DxilOutputColorBecomesConstant.cpp
index 6ff132fc1..95139c8c6 100644
--- a/lib/HLSL/DxilOutputColorBecomesConstant.cpp
+++ b/lib/HLSL/DxilOutputColorBecomesConstant.cpp
@@ -1,176 +1,176 @@
-///////////////////////////////////////////////////////////////////////////////
-//                                                                           //
-// DxilOutputColorBecomesConstant.cpp                                        //
-// Copyright (C) Microsoft Corporation. All rights reserved.                 //
-// This file is distributed under the University of Illinois Open Source     //
-// License. See LICENSE.TXT for details.                                     //
-//                                                                           //
-// Provides a pass to stomp a pixel shader's output color to a given         //
-// constant value                                                            //
-//                                                                           //
-///////////////////////////////////////////////////////////////////////////////
-
-#include "dxc/HLSL/DxilGenerationPass.h"
-#include "dxc/HLSL/DxilOperations.h"
-#include "dxc/HLSL/DxilSignatureElement.h"
-#include "dxc/HLSL/DxilModule.h"
-#include "dxc/Support/Global.h"
-#include "dxc/HLSL/DxilTypeSystem.h"
-#include "dxc/HLSL/DxilInstructions.h"
-#include "dxc/HLSL/DxilSpanAllocator.h"
-
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <memory>
-#include <unordered_set>
-
-using namespace llvm;
-using namespace hlsl;
-
-class DxilOutputColorBecomesConstant : public ModulePass {
-
-  enum VisualizerInstrumentationMode
-  {
-    PRESERVE_ORIGINAL_INSTRUCTIONS,
-    REMOVE_DISCARDS_AND_OPTIONALLY_OTHER_INSTRUCTIONS
-  };
-
-  float Red = 1.f;
-  float Green = 1.f;
-  float Blue = 1.f;
-  float Alpha = 1.f;
-  VisualizerInstrumentationMode Mode;
-
-  bool convertTarget0ToConstantValue(Function * OutputFunction, const hlsl::DxilSignature &OutputSignature, OP * HlslOP, float * color);
-
-public:
-  static char ID; // Pass identification, replacement for typeid
-  explicit DxilOutputColorBecomesConstant() : ModulePass(ID) {}
-  const char *getPassName() const override { return "DXIL Constant Color Mod"; }
-  void applyOptions(PassOptions O) override;
-  bool runOnModule(Module &M) override;
-};
-
-void DxilOutputColorBecomesConstant::applyOptions(PassOptions O)
-{
-  for (const auto & option : O)
-  {
-    if (0 == option.first.compare("constant-red"))
-    {
-      Red = atof(option.second.data());
-    }
-    else if (0 == option.first.compare("constant-green"))
-    {
-      Green = atof(option.second.data());
-    }
-    else if (0 == option.first.compare("constant-blue"))
-    {
-      Blue = atof(option.second.data());
-    }
-    else if (0 == option.first.compare("constant-alpha"))
-    {
-      Alpha = atof(option.second.data());
-    }
-    else if (0 == option.first.compare("mod-mode"))
-    {
-      Mode = static_cast<VisualizerInstrumentationMode>(atoi(option.second.data()));
-    }
-  }
-}
-
-bool DxilOutputColorBecomesConstant::convertTarget0ToConstantValue(
-  Function * OutputFunction, 
-  const hlsl::DxilSignature &OutputSignature, 
-  OP * HlslOP, 
-  float * color) {
-
-  bool Modified = false;
-    auto OutputFunctionUses = OutputFunction->uses();
-
-    for (Use &FunctionUse : OutputFunctionUses) {
-      iterator_range<Value::user_iterator> FunctionUsers = FunctionUse->users();
-      for (User * FunctionUser : FunctionUsers) {
-        if (isa<Instruction>(FunctionUser)) {
-          auto CallInstruction = cast<CallInst>(FunctionUser);
-
-          // Check if the instruction writes to a render target (as opposed to a system-value, such as RenderTargetArrayIndex)
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilOutputColorBecomesConstant.cpp                                        //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides a pass to stomp a pixel shader's output color to a given         //
+// constant value                                                            //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/HLSL/DxilGenerationPass.h"
+#include "dxc/HLSL/DxilOperations.h"
+#include "dxc/HLSL/DxilSignatureElement.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/Support/Global.h"
+#include "dxc/HLSL/DxilTypeSystem.h"
+#include "dxc/HLSL/DxilInstructions.h"
+#include "dxc/HLSL/DxilSpanAllocator.h"
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <memory>
+#include <unordered_set>
+
+using namespace llvm;
+using namespace hlsl;
+
+class DxilOutputColorBecomesConstant : public ModulePass {
+
+  enum VisualizerInstrumentationMode
+  {
+    PRESERVE_ORIGINAL_INSTRUCTIONS,
+    REMOVE_DISCARDS_AND_OPTIONALLY_OTHER_INSTRUCTIONS
+  };
+
+  float Red = 1.f;
+  float Green = 1.f;
+  float Blue = 1.f;
+  float Alpha = 1.f;
+  VisualizerInstrumentationMode Mode;
+
+  bool convertTarget0ToConstantValue(Function * OutputFunction, const hlsl::DxilSignature &OutputSignature, OP * HlslOP, float * color);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilOutputColorBecomesConstant() : ModulePass(ID) {}
+  const char *getPassName() const override { return "DXIL Constant Color Mod"; }
+  void applyOptions(PassOptions O) override;
+  bool runOnModule(Module &M) override;
+};
+
+void DxilOutputColorBecomesConstant::applyOptions(PassOptions O)
+{
+  for (const auto & option : O)
+  {
+    if (0 == option.first.compare("constant-red"))
+    {
+      Red = atof(option.second.data());
+    }
+    else if (0 == option.first.compare("constant-green"))
+    {
+      Green = atof(option.second.data());
+    }
+    else if (0 == option.first.compare("constant-blue"))
+    {
+      Blue = atof(option.second.data());
+    }
+    else if (0 == option.first.compare("constant-alpha"))
+    {
+      Alpha = atof(option.second.data());
+    }
+    else if (0 == option.first.compare("mod-mode"))
+    {
+      Mode = static_cast<VisualizerInstrumentationMode>(atoi(option.second.data()));
+    }
+  }
+}
+
+bool DxilOutputColorBecomesConstant::convertTarget0ToConstantValue(
+  Function * OutputFunction, 
+  const hlsl::DxilSignature &OutputSignature, 
+  OP * HlslOP, 
+  float * color) {
+
+  bool Modified = false;
+    auto OutputFunctionUses = OutputFunction->uses();
+
+    for (Use &FunctionUse : OutputFunctionUses) {
+      iterator_range<Value::user_iterator> FunctionUsers = FunctionUse->users();
+      for (User * FunctionUser : FunctionUsers) {
+        if (isa<Instruction>(FunctionUser)) {
+          auto CallInstruction = cast<CallInst>(FunctionUser);
+
+          // Check if the instruction writes to a render target (as opposed to a system-value, such as RenderTargetArrayIndex)
           Value *OutputID = CallInstruction->getArgOperand(DXIL::OperandIndex::kStoreOutputIDOpIdx);
-          unsigned SignatureElementIndex = cast<ConstantInt>(OutputID)->getLimitedValue();
+          unsigned SignatureElementIndex = cast<ConstantInt>(OutputID)->getLimitedValue();
           const DxilSignatureElement &SignatureElement = OutputSignature.GetElement(SignatureElementIndex);
-
-        // We only modify the output color for RTV0
-        if (SignatureElement.GetSemantic()->GetKind() == DXIL::SemanticKind::Target &&
-          SignatureElement.GetSemanticStartIndex() == 0)
-          {
-            // The output column is the channel (red, green, blue or alpha) within the output pixel
-            Value * OutputColumnOperand = CallInstruction->getOperand(hlsl::DXIL::OperandIndex::kStoreOutputColOpIdx);
-            ConstantInt * OutputColumnConstant = cast<ConstantInt>(OutputColumnOperand);
-            APInt OutputColumn = OutputColumnConstant->getValue();
-
-            Value * OutputValueOperand = CallInstruction->getOperand(hlsl::DXIL::OperandIndex::kStoreOutputValOpIdx);
-
-            // Replace the source operand with the appropriate constant literal value
-          if (OutputValueOperand->getType()->isFloatingPointTy())
-            {
-            Modified = true;
-              Constant * FloatConstant = HlslOP->GetFloatConst(color[*OutputColumn.getRawData()]);
-              CallInstruction->setOperand(hlsl::DXIL::OperandIndex::kStoreOutputValOpIdx, FloatConstant);
-            }
-          else if (OutputValueOperand->getType()->isIntegerTy())
-            {
-            Modified = true;
-              Constant * pIntegerConstant = HlslOP->GetI32Const(static_cast<int>(color[*OutputColumn.getRawData()]));
-              CallInstruction->setOperand(hlsl::DXIL::OperandIndex::kStoreOutputValOpIdx, pIntegerConstant);
-            }
-          }
-        }
-      }
-    }
-  return Modified;
-}
-
-bool DxilOutputColorBecomesConstant::runOnModule(Module &M)
-{
-  // This pass finds all users of the "StoreOutput" function, and replaces their source operands with a constant
-  // value. 
-
-  float color[4] = { Red, Green, Blue, Alpha };
-
-  DxilModule &DM = M.GetOrCreateDxilModule();
-
-  LLVMContext & Ctx = M.getContext();
-
-  OP *HlslOP = DM.GetOP();
-
-  const hlsl::DxilSignature & OutputSignature = DM.GetOutputSignature();
-
-  bool Modified = false;
-
-  // The StoreOutput function can store either a float or an integer, depending on the intended output
-  // render-target resource view.
-  Function * FloatOutputFunction = HlslOP->GetOpFunc(DXIL::OpCode::StoreOutput, Type::getFloatTy(Ctx));
-  if (FloatOutputFunction->getNumUses() != 0) {
-    Modified = convertTarget0ToConstantValue(FloatOutputFunction, OutputSignature, HlslOP, color);
-  }
-
-  Function * IntOutputFunction = HlslOP->GetOpFunc(DXIL::OpCode::StoreOutput, Type::getInt32Ty(Ctx));
-  if (IntOutputFunction->getNumUses() != 0) {
-    Modified = convertTarget0ToConstantValue(IntOutputFunction, OutputSignature, HlslOP, color);
-  }
-
-  return Modified;
-}
-
-char DxilOutputColorBecomesConstant::ID = 0;
-
-ModulePass *llvm::createDxilOutputColorBecomesConstantPass() {
-  return new DxilOutputColorBecomesConstant();
-}
-
-INITIALIZE_PASS(DxilOutputColorBecomesConstant, "hlsl-dxil-constantColor", "DXIL Constant Color Mod", false, false)
+
+        // We only modify the output color for RTV0
+        if (SignatureElement.GetSemantic()->GetKind() == DXIL::SemanticKind::Target &&
+          SignatureElement.GetSemanticStartIndex() == 0)
+          {
+            // The output column is the channel (red, green, blue or alpha) within the output pixel
+            Value * OutputColumnOperand = CallInstruction->getOperand(hlsl::DXIL::OperandIndex::kStoreOutputColOpIdx);
+            ConstantInt * OutputColumnConstant = cast<ConstantInt>(OutputColumnOperand);
+            APInt OutputColumn = OutputColumnConstant->getValue();
+
+            Value * OutputValueOperand = CallInstruction->getOperand(hlsl::DXIL::OperandIndex::kStoreOutputValOpIdx);
+
+            // Replace the source operand with the appropriate constant literal value
+          if (OutputValueOperand->getType()->isFloatingPointTy())
+            {
+            Modified = true;
+              Constant * FloatConstant = HlslOP->GetFloatConst(color[*OutputColumn.getRawData()]);
+              CallInstruction->setOperand(hlsl::DXIL::OperandIndex::kStoreOutputValOpIdx, FloatConstant);
+            }
+          else if (OutputValueOperand->getType()->isIntegerTy())
+            {
+            Modified = true;
+              Constant * pIntegerConstant = HlslOP->GetI32Const(static_cast<int>(color[*OutputColumn.getRawData()]));
+              CallInstruction->setOperand(hlsl::DXIL::OperandIndex::kStoreOutputValOpIdx, pIntegerConstant);
+            }
+          }
+        }
+      }
+    }
+  return Modified;
+}
+
+bool DxilOutputColorBecomesConstant::runOnModule(Module &M)
+{
+  // This pass finds all users of the "StoreOutput" function, and replaces their source operands with a constant
+  // value. 
+
+  float color[4] = { Red, Green, Blue, Alpha };
+
+  DxilModule &DM = M.GetOrCreateDxilModule();
+
+  LLVMContext & Ctx = M.getContext();
+
+  OP *HlslOP = DM.GetOP();
+
+  const hlsl::DxilSignature & OutputSignature = DM.GetOutputSignature();
+
+  bool Modified = false;
+
+  // The StoreOutput function can store either a float or an integer, depending on the intended output
+  // render-target resource view.
+  Function * FloatOutputFunction = HlslOP->GetOpFunc(DXIL::OpCode::StoreOutput, Type::getFloatTy(Ctx));
+  if (FloatOutputFunction->getNumUses() != 0) {
+    Modified = convertTarget0ToConstantValue(FloatOutputFunction, OutputSignature, HlslOP, color);
+  }
+
+  Function * IntOutputFunction = HlslOP->GetOpFunc(DXIL::OpCode::StoreOutput, Type::getInt32Ty(Ctx));
+  if (IntOutputFunction->getNumUses() != 0) {
+    Modified = convertTarget0ToConstantValue(IntOutputFunction, OutputSignature, HlslOP, color);
+  }
+
+  return Modified;
+}
+
+char DxilOutputColorBecomesConstant::ID = 0;
+
+ModulePass *llvm::createDxilOutputColorBecomesConstantPass() {
+  return new DxilOutputColorBecomesConstant();
+}
+
+INITIALIZE_PASS(DxilOutputColorBecomesConstant, "hlsl-dxil-constantColor", "DXIL Constant Color Mod", false, false)
diff --git a/tools/clang/test/HLSL/pix/constantcolorMRT.hlsl b/tools/clang/test/HLSL/pix/constantcolorMRT.hlsl
index 32ab2ad87..612647dcc 100644
--- a/tools/clang/test/HLSL/pix/constantcolorMRT.hlsl
+++ b/tools/clang/test/HLSL/pix/constantcolorMRT.hlsl
@@ -1,13 +1,13 @@
 // RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-constantColor | %FileCheck %s
 
-// Check the write to the integer part was replaced (since it is RTV0):
-// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 1)
-
+// Check the write to the integer part was replaced (since it is RTV0):
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 1)
+
 // Check color in RTV1 is unaffected:
-// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float 0.000000e+00)
-// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 1, float 0.000000e+00)
-// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 2, float 0.000000e+00)
-// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 3, float 0.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float 0.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 1, float 0.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 2, float 0.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 3, float 0.000000e+00)
 
 struct RTOut
 {
@@ -17,8 +17,8 @@ struct RTOut
 
 [RootSignature("")]
 RTOut main()  {
-  RTOut rtOut;
-  rtOut.i = 8;
-  rtOut.c = float4(0.f, 0.f, 0.f, 0.f);
+  RTOut rtOut;
+  rtOut.i = 8;
+  rtOut.c = float4(0.f, 0.f, 0.f, 0.f);
   return rtOut;
 }
diff --git a/tools/clang/test/HLSL/pix/constantcolorOtherSIVs.hlsl b/tools/clang/test/HLSL/pix/constantcolorOtherSIVs.hlsl
index 6694c197c..24a658b66 100644
--- a/tools/clang/test/HLSL/pix/constantcolorOtherSIVs.hlsl
+++ b/tools/clang/test/HLSL/pix/constantcolorOtherSIVs.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-constantColor,constant-red=1 | %FileCheck %s
 
 // Check that we overrode output color (0.0 becomes 1.0):
-// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)
 
 // Check output depth wasn't affected (0.0 stays 0.0):
 // CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 1, i32 0, i8 0, float 0.000000e+00)
diff --git a/tools/clang/test/HLSL/pix/constantcolorUAVs.hlsl b/tools/clang/test/HLSL/pix/constantcolorUAVs.hlsl
index 4e988a5b8..e01f38f30 100644
--- a/tools/clang/test/HLSL/pix/constantcolorUAVs.hlsl
+++ b/tools/clang/test/HLSL/pix/constantcolorUAVs.hlsl
@@ -1,22 +1,22 @@
 // RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-constantColor | %FileCheck %s
 
-// Check the write to the UAVs were unaffected:
-// CHECK: %floatRWUAV_UAV_structbuf = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 1, i32 1, i1 false)
-// CHECK: %uav0_UAV_2d = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
-// CHECK: call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %uav0_UAV_2d, i32 0, i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, i8 15)
-
+// Check the write to the UAVs were unaffected:
+// CHECK: %floatRWUAV_UAV_structbuf = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 1, i32 1, i1 false)
+// CHECK: %uav0_UAV_2d = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 0, i1 false)
+// CHECK: call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %uav0_UAV_2d, i32 0, i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, i8 15)
+
 // Added override output color:
-// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)
+// CHECK: call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)
 
-RWTexture2D<float4> uav0 : register(u0);
-RWStructuredBuffer<float> floatRWUAV: register(u1);
+RWTexture2D<float4> uav0 : register(u0);
+RWStructuredBuffer<float> floatRWUAV: register(u1);
 
-[RootSignature(
+[RootSignature(
   "DescriptorTable(UAV(u0, numDescriptors = 1, space = 0, offset = DESCRIPTOR_RANGE_OFFSET_APPEND)), "
   "UAV(u1)"
 )]
 float main() : SV_Target {
   floatRWUAV[0] = 3.5;
-  uav0[uint2(0,0)] = float4(1,2,3,4);
+  uav0[uint2(0,0)] = float4(1,2,3,4);
   return 2.5;
 }
diff --git a/tools/clang/test/HLSL/pix/constantcolorint.hlsl b/tools/clang/test/HLSL/pix/constantcolorint.hlsl
index b33f71147..7fd9b33ad 100644
--- a/tools/clang/test/HLSL/pix/constantcolorint.hlsl
+++ b/tools/clang/test/HLSL/pix/constantcolorint.hlsl
@@ -1,7 +1,7 @@
 // RUN: %dxc -Emain -Tps_6_0 %s | %opt -S -hlsl-dxil-constantColor,constant-red=2.5,constant-green=8,constant-blue=7,constant-alpha=6 | %FileCheck %s
 
 // Added override output color:
-// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 2)
+// CHECK: call void @dx.op.storeOutput.i32(i32 5, i32 0, i32 0, i8 0, i32 2)
 
 [RootSignature("")]
 unsigned int main() : SV_Target {
diff --git a/tools/clang/unittests/HLSL/ExecutionTest.cpp b/tools/clang/unittests/HLSL/ExecutionTest.cpp
index ce50f6d89..166313e42 100644
--- a/tools/clang/unittests/HLSL/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSL/ExecutionTest.cpp
@@ -1,4320 +1,4320 @@
-///////////////////////////////////////////////////////////////////////////////
-//                                                                           //
-// ExecutionTest.cpp                                                         //
-// Copyright (C) Microsoft Corporation. All rights reserved.                 //
-// This file is distributed under the University of Illinois Open Source     //
-// License. See LICENSE.TXT for details.                                     //
-//                                                                           //
-// These tests run by executing compiled programs, and thus involve more     //
-// moving parts, like the runtime and drivers.                               //
-//                                                                           //
-///////////////////////////////////////////////////////////////////////////////
-
-#include <algorithm>
-#include <memory>
-#include <vector>
-#include <string>
-#include <map>
-#include <unordered_set>
-#include <strstream>
-#include <iomanip>
-#include "CompilationResult.h"
-#include "HLSLTestData.h"
-#include <Shlwapi.h>
-#include <atlcoll.h>
-#include <locale>
-#include <algorithm>
-
-#undef _read
-#include "WexTestClass.h"
-#include "HlslTestUtils.h"
-#include "DxcTestUtils.h"
-#include "dxc/Support/Global.h"
-#include "dxc/Support/WinIncludes.h"
-#include "dxc/Support/FileIOHelper.h"
-#include "dxc/Support/Unicode.h"
-
-//
-// d3d12.h and dxgi1_4.h are included in the Windows 10 SDK
-// https://msdn.microsoft.com/en-us/library/windows/desktop/dn899120(v=vs.85).aspx
-// https://developer.microsoft.com/en-US/windows/downloads/windows-10-sdk
-//
-#include <d3d12.h>
-#include <dxgi1_4.h>
-#include <DXGIDebug.h>
-#include <D3dx12.h>
-#include <DirectXMath.h>
-#include <strsafe.h>
-#include <d3dcompiler.h>
-#include <wincodec.h>
-#include "ShaderOpTest.h"
-
-#pragma comment(lib, "d3dcompiler.lib")
-#pragma comment(lib, "windowscodecs.lib")
-#pragma comment(lib, "dxguid.lib")
-#pragma comment(lib, "version.lib")
-
-// A more recent Windows SDK than currently required is needed for these.
-typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)(
-  UINT                                    NumFeatures,
-  __in_ecount(NumFeatures) const IID*     pIIDs,
-  __in_ecount_opt(NumFeatures) void*      pConfigurationStructs,
-  __in_ecount_opt(NumFeatures) UINT*      pConfigurationStructSizes);
-
-static const GUID D3D12ExperimentalShaderModelsID = { /* 76f5573e-f13a-40f5-b297-81ce9e18933f */
-  0x76f5573e,
-  0xf13a,
-  0x40f5,
-  { 0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f }
-};
-
-using namespace DirectX;
-using namespace hlsl_test;
-
-template <typename TSequence, typename T>
-static bool contains(TSequence s, const T &val) {
-  return std::cend(s) != std::find(std::cbegin(s), std::cend(s), val);
-}
-
-template <typename InputIterator, typename T>
-static bool contains(InputIterator b, InputIterator e, const T &val) {
-  return e != std::find(b, e, val);
-}
-
-static HRESULT EnableExperimentalShaderModels() {
-  HMODULE hRuntime = LoadLibraryW(L"d3d12.dll");
-  if (hRuntime == NULL) {
-    return HRESULT_FROM_WIN32(GetLastError());
-  }
-
-  D3D12EnableExperimentalFeaturesFn pD3D12EnableExperimentalFeatures =
-    (D3D12EnableExperimentalFeaturesFn)GetProcAddress(hRuntime, "D3D12EnableExperimentalFeatures");
-  if (pD3D12EnableExperimentalFeatures == nullptr) {
-    FreeLibrary(hRuntime);
-    return HRESULT_FROM_WIN32(GetLastError());
-  }
-
-  HRESULT hr = pD3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModelsID, nullptr, nullptr);
-  FreeLibrary(hRuntime);
-  return hr;
-}
-
-static HRESULT ReportLiveObjects() {
-  CComPtr<IDXGIDebug1> pDebug;
-  IFR(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&pDebug)));
-  IFR(pDebug->ReportLiveObjects(DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_ALL));
-  return S_OK;
-}
-
-static void WriteInfoQueueMessages(void *pStrCtx, st::OutputStringFn pOutputStrFn, ID3D12InfoQueue *pInfoQueue) {
-  bool allMessagesOK = true;
-  UINT64 count = pInfoQueue->GetNumStoredMessages();
-  CAtlArray<BYTE> message;
-  for (UINT64 i = 0; i < count; ++i) {
-    // 'GetMessageA' rather than 'GetMessage' is an artifact of user32 headers.
-    SIZE_T msgLen = 0;
-    if (FAILED(pInfoQueue->GetMessageA(i, nullptr, &msgLen))) {
-      allMessagesOK = false;
-      continue;
-    }
-    if (message.GetCount() < msgLen) {
-      if (!message.SetCount(msgLen)) {
-        allMessagesOK = false;
-        continue;
-      }
-    }
-    D3D12_MESSAGE *pMessage = (D3D12_MESSAGE *)message.GetData();
-    if (FAILED(pInfoQueue->GetMessageA(i, pMessage, &msgLen))) {
-      allMessagesOK = false;
-      continue;
-    }
-    CA2W msgW(pMessage->pDescription, CP_ACP);
-    pOutputStrFn(pStrCtx, msgW.m_psz);
-    pOutputStrFn(pStrCtx, L"\r\n");
-  }
-  if (!allMessagesOK) {
-    pOutputStrFn(pStrCtx, L"Failed to retrieve some messages.\r\n");
-  }
-}
-
-class CComContext {
-private:
-  bool m_init;
-public:
-  CComContext() : m_init(false) {}
-  ~CComContext() { Dispose(); }
-  void Dispose() { if (!m_init) return; m_init = false; CoUninitialize(); }
-  HRESULT Init() { HRESULT hr = CoInitializeEx(0, COINIT_MULTITHREADED); if (SUCCEEDED(hr)) { m_init = true; } return hr; }
-};
-
-static void SavePixelsToFile(LPCVOID pPixels, DXGI_FORMAT format, UINT32 m_width, UINT32 m_height, LPCWSTR pFileName) {
-  CComContext ctx;
-  CComPtr<IWICImagingFactory> pFactory;
-  CComPtr<IWICBitmap> pBitmap;
-  CComPtr<IWICBitmapEncoder> pEncoder;
-  CComPtr<IWICBitmapFrameEncode> pFrameEncode;
-  CComPtr<hlsl::AbstractMemoryStream> pStream;
-  CComPtr<IMalloc> pMalloc;
-
-  struct PF {
-    DXGI_FORMAT Format;
-    GUID PixelFormat;
-    UINT32 PixelSize;
-    bool operator==(DXGI_FORMAT F) const {
-      return F == Format;
-    }
-  } Vals[] = {
-    // Add more pixel format mappings as needed.
-    { DXGI_FORMAT_R8G8B8A8_UNORM, GUID_WICPixelFormat32bppRGBA, 4 }
-  };
-  PF *pFormat = std::find(Vals, Vals + _countof(Vals), format);
-
-  VERIFY_SUCCEEDED(ctx.Init());
-  VERIFY_SUCCEEDED(CoCreateInstance(CLSID_WICImagingFactory, NULL, CLSCTX_INPROC_SERVER, IID_IWICImagingFactory, (LPVOID*)&pFactory));
-  VERIFY_SUCCEEDED(CoGetMalloc(1, &pMalloc));
-  VERIFY_SUCCEEDED(hlsl::CreateMemoryStream(pMalloc, &pStream));
-  VERIFY_ARE_NOT_EQUAL(pFormat, Vals + _countof(Vals));
-  VERIFY_SUCCEEDED(pFactory->CreateBitmapFromMemory(m_width, m_height, pFormat->PixelFormat, m_width * pFormat->PixelSize, m_width * m_height * pFormat->PixelSize, (BYTE *)pPixels, &pBitmap));
-  VERIFY_SUCCEEDED(pFactory->CreateEncoder(GUID_ContainerFormatBmp, nullptr, &pEncoder));
-  VERIFY_SUCCEEDED(pEncoder->Initialize(pStream, WICBitmapEncoderNoCache));
-  VERIFY_SUCCEEDED(pEncoder->CreateNewFrame(&pFrameEncode, nullptr));
-  VERIFY_SUCCEEDED(pFrameEncode->Initialize(nullptr));
-  VERIFY_SUCCEEDED(pFrameEncode->WriteSource(pBitmap, nullptr));
-  VERIFY_SUCCEEDED(pFrameEncode->Commit());
-  VERIFY_SUCCEEDED(pEncoder->Commit());
-  hlsl::WriteBinaryFile(pFileName, pStream->GetPtr(), pStream->GetPtrSize());
-}
-
-// Setup for wave intrinsics tests
-enum class ShaderOpKind {
-  WaveSum,
-  WaveProduct,
-  WaveActiveMax,
-  WaveActiveMin,
-  WaveCountBits,
-  WaveActiveAllEqual,
-  WaveActiveAnyTrue,
-  WaveActiveAllTrue,
-  WaveActiveBitOr,
-  WaveActiveBitAnd,
-  WaveActiveBitXor,
-  ShaderOpInvalid
-};
-
-struct ShaderOpKindPair {
-  LPCWSTR name;
-  ShaderOpKind kind;
-};
-
-static ShaderOpKindPair ShaderOpKindTable[] = {
-  { L"WaveActiveSum", ShaderOpKind::WaveSum },
-  { L"WaveActiveUSum", ShaderOpKind::WaveSum },
-  { L"WaveActiveProduct", ShaderOpKind::WaveProduct },
-  { L"WaveActiveUProduct", ShaderOpKind::WaveProduct },
-  { L"WaveActiveMax", ShaderOpKind::WaveActiveMax },
-  { L"WaveActiveUMax", ShaderOpKind::WaveActiveMax },
-  { L"WaveActiveMin", ShaderOpKind::WaveActiveMin },
-  { L"WaveActiveUMin", ShaderOpKind::WaveActiveMin },
-  { L"WaveActiveCountBits", ShaderOpKind::WaveCountBits },
-  { L"WaveActiveAllEqual", ShaderOpKind::WaveActiveAllEqual },
-  { L"WaveActiveAnyTrue", ShaderOpKind::WaveActiveAnyTrue },
-  { L"WaveActiveAllTrue", ShaderOpKind::WaveActiveAllTrue },
-  { L"WaveActiveBitOr", ShaderOpKind::WaveActiveBitOr },
-  { L"WaveActiveBitAnd", ShaderOpKind::WaveActiveBitAnd },
-  { L"WaveActiveBitXor", ShaderOpKind::WaveActiveBitXor },
-  { L"WavePrefixSum", ShaderOpKind::WaveSum },
-  { L"WavePrefixUSum", ShaderOpKind::WaveSum },
-  { L"WavePrefixProduct", ShaderOpKind::WaveProduct },
-  { L"WavePrefixUProduct", ShaderOpKind::WaveProduct },
-  { L"WavePrefixMax", ShaderOpKind::WaveActiveMax },
-  { L"WavePrefixUMax", ShaderOpKind::WaveActiveMax },
-  { L"WavePrefixMin", ShaderOpKind::WaveActiveMin },
-  { L"WavePrefixUMin", ShaderOpKind::WaveActiveMin },
-  { L"WavePrefixCountBits", ShaderOpKind::WaveCountBits }
-};
-
-ShaderOpKind GetShaderOpKind(LPCWSTR str) {
-  for (size_t i = 0; i < sizeof(ShaderOpKindTable)/sizeof(ShaderOpKindPair); ++i) {
-    if (_wcsicmp(ShaderOpKindTable[i].name, str) == 0) {
-      return ShaderOpKindTable[i].kind;
-    }
-  }
-  DXASSERT(false, "Invalid ShaderOp name: %s", str);
-  return ShaderOpKind::ShaderOpInvalid;
-}
-
-// Virtual class to compute the expected result given a set of inputs
-struct TableParameter;
-
-template <typename InType, typename OutType, ShaderOpKind kind>
-struct computeExpected {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    return 0;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveSum> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType sum = 0;
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue) {
-        sum += inputs.at(i);
-      }
-    }
-    return sum;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveProduct> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType prod = 1;
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue) {
-        prod *= inputs.at(i);
-      }
-    }
-    return prod;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType maximum = std::numeric_limits<OutType>::min();
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue && inputs.at(i) > maximum)
-        maximum = inputs.at(i);
-    }
-    return maximum;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType minimum = std::numeric_limits<OutType>::max();
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue && inputs.at(i) < minimum)
-        minimum = inputs.at(i);
-    }
-    return minimum;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveCountBits> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType count = 0;
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue && inputs.at(i) > 3) {
-        count++;
-      }
-    }
-    return count;
-  }
-};
-
-// In HLSL, boolean is represented in a 4 byte (uint32) format,
-// So we cannot use c++ bool type to represent bool in HLSL
-// HLSL returns 0 for false and 1 for true
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue && inputs.at(i) != 0) {
-        return 1;
-      }
-    }
-    return 0;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue && inputs.at(i) == 0) {
-        return 0;
-      }
-    }
-    return 1;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    const InType *val = nullptr;
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue) {
-        if (val && *val != inputs.at(i)) {
-          return 0;
-        }
-        val = &inputs.at(i);
-      }
-    }
-    return 1;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType bits = 0x00000000;
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue) {
-        bits |= inputs.at(i);
-      }
-    }
-    return bits;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType bits = 0xffffffff;
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue) {
-        bits &= inputs.at(i);
-      }
-    }
-    return bits;
-  }
-};
-
-template <typename InType, typename OutType>
-struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor> {
-  OutType operator()(const std::vector<InType> &inputs,
-                     const std::vector<int> &masks, int maskValue,
-                     unsigned int index) {
-    OutType bits = 0x00000000;
-    for (size_t i = 0; i < index; ++i) {
-      if (masks.at(i) == maskValue) {
-        bits ^= inputs.at(i);
-      }
-    }
-    return bits;
-  }
-};
-
-// Mask functions used to control active lanes
-static int MaskAll(int i) {
-  return 1;
-}
-
-static int MaskEveryOther(int i) {
-  return i % 2 == 0 ? 1 : 0;
-}
-
-static int MaskEveryThird(int i) {
-  return i % 3 == 0 ? 1 : 0;
-}
-// TODO: It seems there is an issue with WARP with controlling active lanes
-// Add more masks when this is resolved
-typedef int(*MaskFunction)(int);
-static MaskFunction MaskFunctionTable[] = {
-  MaskAll, MaskEveryOther, MaskEveryThird
-};
-
-template <typename InType, typename OutType>
-static OutType computeExpectedWithShaderOp(const std::vector<InType> &inputs,
-                                           const std::vector<int> &masks,
-                                           int maskValue, unsigned int index,
-                                           LPCWSTR str) {
-  ShaderOpKind kind = GetShaderOpKind(str);
-  switch (kind) {
-  case ShaderOpKind::WaveSum:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveSum>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveProduct:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveProduct>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveMax:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveMin:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveCountBits:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveCountBits>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveBitOr:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveBitAnd:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveBitXor:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveAnyTrue:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveAllTrue:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue>()(inputs, masks, maskValue, index);
-  case ShaderOpKind::WaveActiveAllEqual:
-    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual>()(inputs, masks, maskValue, index);
-  default:
-    DXASSERT(false, "Invalid ShaderOp Name: %s", str);
-    return (OutType) 0;
-  }
-};
-
-
-// Checks if the given warp version supports the given operation.
-bool IsValidWarpDllVersion(unsigned int minBuildNumber) {
-    HMODULE pLibrary = LoadLibrary("D3D10Warp.dll");
-    if (pLibrary) {
-        char path[MAX_PATH];
-        DWORD length = GetModuleFileName(pLibrary, path, MAX_PATH);
-        if (length) {
-            DWORD dwVerHnd = 0;
-            DWORD dwVersionInfoSize = GetFileVersionInfoSize(path, &dwVerHnd);
-            std::unique_ptr<int[]> VffInfo(new int[dwVersionInfoSize]);
-            if (GetFileVersionInfo(path, NULL, dwVersionInfoSize, VffInfo.get())) {
-                LPVOID versionInfo;
-                UINT size;
-                if (VerQueryValue(VffInfo.get(), "\\", &versionInfo, &size)) {
-                    if (size) {
-                        VS_FIXEDFILEINFO *verInfo = (VS_FIXEDFILEINFO *)versionInfo;
-                        unsigned int warpBuildNumber = verInfo->dwFileVersionLS >> 16 & 0xffff;
-                        if (verInfo->dwSignature == 0xFEEF04BD && warpBuildNumber >= minBuildNumber) {
-                            return true;
-                        }
-                    }
-                }
-            }
-        }
-        FreeLibrary(pLibrary);
-    }
-    return false;
-}
-
-
-class ExecutionTest {
-public:
-  // By default, ignore these tests, which require a recent build to run properly.
-  BEGIN_TEST_CLASS(ExecutionTest)
-    TEST_CLASS_PROPERTY(L"Parallel", L"true")
-    TEST_CLASS_PROPERTY(L"Ignore", L"true")
-    TEST_METHOD_PROPERTY(L"Priority", L"0")
-  END_TEST_CLASS()
-  TEST_CLASS_SETUP(ExecutionTestClassSetup)
-
-  TEST_METHOD(BasicComputeTest);
-  TEST_METHOD(BasicTriangleTest);
-  TEST_METHOD(BasicTriangleOpTest);
-  TEST_METHOD(OutOfBoundsTest);
-  TEST_METHOD(SaturateTest);
-  TEST_METHOD(SignTest);
-  TEST_METHOD(Int64Test);
-  TEST_METHOD(WaveIntrinsicsTest);
-  TEST_METHOD(WaveIntrinsicsInPSTest);
-  TEST_METHOD(PartialDerivTest);
-
-  // TODO: Change the priority to 0 once there is a driver that fixes the issue with WaveActive operations
-  BEGIN_TEST_METHOD(WaveIntrinsicsActiveIntTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveIntTable")
-  END_TEST_METHOD()
-
-  BEGIN_TEST_METHOD(WaveIntrinsicsActiveUintTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveUintTable")
-  END_TEST_METHOD()
-
-  BEGIN_TEST_METHOD(WaveIntrinsicsPrefixIntTest)
-  TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixIntTable")
-  END_TEST_METHOD()
-
-  BEGIN_TEST_METHOD(WaveIntrinsicsPrefixUintTest)
-  TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixUintTable")
-  END_TEST_METHOD()
-  // TAEF data-driven tests.
-  BEGIN_TEST_METHOD(UnaryFloatOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryFloatOpTable")
-  END_TEST_METHOD()
-  BEGIN_TEST_METHOD(BinaryFloatOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryFloatOpTable")
-  END_TEST_METHOD()
-  BEGIN_TEST_METHOD(TertiaryFloatOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryFloatOpTable")
-  END_TEST_METHOD()
-
-  BEGIN_TEST_METHOD(UnaryIntOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryIntOpTable")
-  END_TEST_METHOD()
-  BEGIN_TEST_METHOD(BinaryIntOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryIntOpTable")
-  END_TEST_METHOD()
-  BEGIN_TEST_METHOD(TertiaryIntOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryIntOpTable")
-  END_TEST_METHOD()
-
-  BEGIN_TEST_METHOD(UnaryUintOpTest)
-     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryUintOpTable")
-  END_TEST_METHOD()
-  BEGIN_TEST_METHOD(BinaryUintOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryUintOpTable")
-  END_TEST_METHOD()
-  BEGIN_TEST_METHOD(TertiaryUintOpTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUintOpTable")
-  END_TEST_METHOD()
-
-  BEGIN_TEST_METHOD(DotTest)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DotOpTable")
-  END_TEST_METHOD()
-
-  BEGIN_TEST_METHOD(Msad4Test)
-    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Msad4Table")
-  END_TEST_METHOD()
-
-  dxc::DxcDllSupport m_support;
-  bool m_ExperimentalModeEnabled = false;
-  static const float ClearColor[4];
-
-  template <class T1, class T2>
-  void WaveIntrinsicsActivePrefixTest(
-    TableParameter *pParameterList, size_t numParameter, bool isPrefix);
-
-  bool UseDxbc() {
-    return GetTestParamBool(L"DXBC");
-  }
-
-  bool UseDebugIfaces() {
-    return true;
-  }
-
-  bool SaveImages() {
-    return GetTestParamBool(L"SaveImages");
-  }
-
-  void CompileFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob) {
-    VERIFY_SUCCEEDED(m_support.Initialize());
-    CComPtr<IDxcCompiler> pCompiler;
-    CComPtr<IDxcLibrary> pLibrary;
-    CComPtr<IDxcBlobEncoding> pTextBlob;
-    CComPtr<IDxcOperationResult> pResult;
-    HRESULT resultCode;
-    VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcCompiler, &pCompiler));
-    VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));
-    VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned((LPBYTE)pText, strlen(pText), CP_UTF8, &pTextBlob));
-    VERIFY_SUCCEEDED(pCompiler->Compile(pTextBlob, L"hlsl.hlsl", pEntryPoint, pTargetProfile, nullptr, 0, nullptr, 0, nullptr, &pResult));
-    VERIFY_SUCCEEDED(pResult->GetStatus(&resultCode));
-    if (FAILED(resultCode)) {
-      CComPtr<IDxcBlobEncoding> errors;
-      VERIFY_SUCCEEDED(pResult->GetErrorBuffer(&errors));
-      LogCommentFmt(L"Failed to compile shader: %s", BlobToUtf16(errors).data());
-    }
-    VERIFY_SUCCEEDED(resultCode);
-    VERIFY_SUCCEEDED(pResult->GetResult((IDxcBlob **)ppBlob));
-  }
-
-  void CreateComputeCommandQueue(ID3D12Device *pDevice, LPCWSTR pName, ID3D12CommandQueue **ppCommandQueue) {
-    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
-    queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
-    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
-    VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
-    VERIFY_SUCCEEDED((*ppCommandQueue)->SetName(pName));
-  }
-
-  void CreateComputePSO(ID3D12Device *pDevice, ID3D12RootSignature *pRootSignature, LPCSTR pShader, ID3D12PipelineState **ppComputeState) {
-    CComPtr<ID3DBlob> pComputeShader;
-
-    // Load and compile shaders.
-    if (UseDxbc()) {
-      DXBCFromText(pShader, L"main", L"cs_6_0", &pComputeShader);
-    }
-    else {
-      CompileFromText(pShader, L"main", L"cs_6_0", &pComputeShader);
-    }
-
-    // Describe and create the compute pipeline state object (PSO).
-    D3D12_COMPUTE_PIPELINE_STATE_DESC computePsoDesc = {};
-    computePsoDesc.pRootSignature = pRootSignature;
-    computePsoDesc.CS = CD3DX12_SHADER_BYTECODE(pComputeShader);
-
-    VERIFY_SUCCEEDED(pDevice->CreateComputePipelineState(&computePsoDesc, IID_PPV_ARGS(ppComputeState)));
-  }
-
-  bool CreateDevice(_COM_Outptr_ ID3D12Device **ppDevice) {
-    const D3D_FEATURE_LEVEL FeatureLevelRequired = D3D_FEATURE_LEVEL_11_0;
-    CComPtr<IDXGIFactory4> factory;
-    CComPtr<ID3D12Device> pDevice;
-
-    *ppDevice = nullptr;
-
-    VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)));
-    if (GetTestParamUseWARP(true)) {
-      CComPtr<IDXGIAdapter> warpAdapter;
-      VERIFY_SUCCEEDED(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter)));
-      HRESULT createHR = D3D12CreateDevice(warpAdapter, FeatureLevelRequired,
-                                           IID_PPV_ARGS(&pDevice));
-      if (FAILED(createHR)) {
-        LogCommentFmt(L"The available version of WARP does not support d3d12.");
-        WEX::Logging::Log::Result(WEX::Logging::TestResults::Blocked);
-        return false;
-      }
-    } else {
-      CComPtr<IDXGIAdapter1> hardwareAdapter;
-      WEX::Common::String AdapterValue;
-      IFT(WEX::TestExecution::RuntimeParameters::TryGetValue(L"Adapter",
-                                                             AdapterValue));
-      GetHardwareAdapter(factory, AdapterValue, &hardwareAdapter);
-      if (hardwareAdapter == nullptr) {
-        WEX::Logging::Log::Error(
-            L"Unable to find hardware adapter with D3D12 support.");
-        return false;
-      }
-      VERIFY_SUCCEEDED(D3D12CreateDevice(hardwareAdapter, FeatureLevelRequired,
-                                         IID_PPV_ARGS(&pDevice)));
-      DXGI_ADAPTER_DESC1 AdapterDesc;
-      VERIFY_SUCCEEDED(hardwareAdapter->GetDesc1(&AdapterDesc));
-      LogCommentFmt(L"Using Adapter: %s", AdapterDesc.Description);
-    }
-    if (pDevice == nullptr)
-      return false;
-
-    if (!UseDxbc()) {
-      // Check for DXIL support.
-      // This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we only
-      // require the Windows 10 SDK.
-      typedef enum D3D_SHADER_MODEL {
-        D3D_SHADER_MODEL_5_1 = 0x51,
-        D3D_SHADER_MODEL_6_0 = 0x60
-      } D3D_SHADER_MODEL;
-      typedef struct D3D12_FEATURE_DATA_SHADER_MODEL {
-        _Inout_ D3D_SHADER_MODEL HighestShaderModel;
-      } D3D12_FEATURE_DATA_SHADER_MODEL;
-      const UINT D3D12_FEATURE_SHADER_MODEL = 7;
-      D3D12_FEATURE_DATA_SHADER_MODEL SMData;
-      SMData.HighestShaderModel = D3D_SHADER_MODEL_6_0;
-      VERIFY_SUCCEEDED(pDevice->CheckFeatureSupport(
-        (D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL, &SMData, sizeof(SMData)));
-      if (SMData.HighestShaderModel != D3D_SHADER_MODEL_6_0) {
-        LogCommentFmt(L"The selected device does not support "
-                      L"shader model 6 (required for DXIL).");
-        WEX::Logging::Log::Result(WEX::Logging::TestResults::Blocked);
-        return false;
-      }
-    }
-
-    if (UseDebugIfaces()) {
-      CComPtr<ID3D12InfoQueue> pInfoQueue;
-      if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
-        pInfoQueue->SetMuteDebugOutput(FALSE);
-      }
-    }
-
-    *ppDevice = pDevice.Detach();
-    return true;
-  }
-
-  void CreateGraphicsCommandQueue(ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue) {
-    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
-    queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
-    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;;
-    VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
-  }
-
-  void CreateGraphicsCommandQueueAndList(
-      ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue,
-      ID3D12CommandAllocator **ppAllocator,
-      ID3D12GraphicsCommandList **ppCommandList, ID3D12PipelineState *pPSO) {
-    CreateGraphicsCommandQueue(pDevice, ppCommandQueue);
-    VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(
-        D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(ppAllocator)));
-    VERIFY_SUCCEEDED(pDevice->CreateCommandList(
-        0, D3D12_COMMAND_LIST_TYPE_DIRECT, *ppAllocator, pPSO,
-        IID_PPV_ARGS(ppCommandList)));
-  }
-
-  void CreateGraphicsPSO(ID3D12Device *pDevice,
-                         D3D12_INPUT_LAYOUT_DESC *pInputLayout,
-                         ID3D12RootSignature *pRootSignature, LPCSTR pShaders,
-                         ID3D12PipelineState **ppPSO) {
-    CComPtr<ID3DBlob> vertexShader;
-    CComPtr<ID3DBlob> pixelShader;
-
-    if (UseDxbc()) {
-      DXBCFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader);
-      DXBCFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader);
-    } else {
-      CompileFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader);
-      CompileFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader);
-    }
-
-    // Describe and create the graphics pipeline state object (PSO).
-    D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};
-    psoDesc.InputLayout = *pInputLayout;
-    psoDesc.pRootSignature = pRootSignature;
-    psoDesc.VS = CD3DX12_SHADER_BYTECODE(vertexShader);
-    psoDesc.PS = CD3DX12_SHADER_BYTECODE(pixelShader);
-    psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
-    psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
-    psoDesc.DepthStencilState.DepthEnable = FALSE;
-    psoDesc.DepthStencilState.StencilEnable = FALSE;
-    psoDesc.SampleMask = UINT_MAX;
-    psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
-    psoDesc.NumRenderTargets = 1;
-    psoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
-    psoDesc.SampleDesc.Count = 1;
-    VERIFY_SUCCEEDED(
-        pDevice->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(ppPSO)));
-  }
-
-  void CreateRenderTargetAndReadback(ID3D12Device *pDevice,
-                                     ID3D12DescriptorHeap *pHeap, UINT width,
-                                     UINT height,
-                                     ID3D12Resource **ppRenderTarget,
-                                     ID3D12Resource **ppBuffer) {
-    const DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM;
-    const size_t formatElementSize = 4;
-    CComPtr<ID3D12Resource> pRenderTarget;
-    CComPtr<ID3D12Resource> pBuffer;
-
-    CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle(
-        pHeap->GetCPUDescriptorHandleForHeapStart());
-    CD3DX12_HEAP_PROPERTIES rtHeap(D3D12_HEAP_TYPE_DEFAULT);
-    CD3DX12_RESOURCE_DESC rtDesc(
-        CD3DX12_RESOURCE_DESC::Tex2D(format, width, height));
-    CD3DX12_CLEAR_VALUE rtClearVal(format, ClearColor);
-    rtDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
-    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
-        &rtHeap, D3D12_HEAP_FLAG_NONE, &rtDesc, D3D12_RESOURCE_STATE_COPY_DEST,
-        &rtClearVal, IID_PPV_ARGS(&pRenderTarget)));
-    pDevice->CreateRenderTargetView(pRenderTarget, nullptr, rtvHandle);
-    // rtvHandle.Offset(1, rtvDescriptorSize);  // Not needed for a single
-    // resource.
-
-    CD3DX12_HEAP_PROPERTIES readHeap(D3D12_HEAP_TYPE_READBACK);
-    CD3DX12_RESOURCE_DESC readDesc(
-        CD3DX12_RESOURCE_DESC::Buffer(width * height * formatElementSize));
-    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
-        &readHeap, D3D12_HEAP_FLAG_NONE, &readDesc,
-        D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&pBuffer)));
-
-    *ppRenderTarget = pRenderTarget.Detach();
-    *ppBuffer = pBuffer.Detach();
-  }
-
-  void CreateRootSignatureFromDesc(ID3D12Device *pDevice,
-                                   const D3D12_ROOT_SIGNATURE_DESC *pDesc,
-                                   ID3D12RootSignature **pRootSig) {
-    CComPtr<ID3DBlob> signature;
-    CComPtr<ID3DBlob> error;
-    VERIFY_SUCCEEDED(D3D12SerializeRootSignature(pDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error));
-    VERIFY_SUCCEEDED(pDevice->CreateRootSignature(
-        0, signature->GetBufferPointer(), signature->GetBufferSize(),
-        IID_PPV_ARGS(pRootSig)));
-  }
-
-  void CreateRtvDescriptorHeap(ID3D12Device *pDevice, UINT numDescriptors,
-                               ID3D12DescriptorHeap **pRtvHeap, UINT *rtvDescriptorSize) {
-    D3D12_DESCRIPTOR_HEAP_DESC rtvHeapDesc = {};
-    rtvHeapDesc.NumDescriptors = numDescriptors;
-    rtvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV;
-    rtvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
-    VERIFY_SUCCEEDED(
-        pDevice->CreateDescriptorHeap(&rtvHeapDesc, IID_PPV_ARGS(pRtvHeap)));
-
-    if (rtvDescriptorSize != nullptr) {
-      *rtvDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(
-          D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
-    }
-  }
-
-  void CreateTestUavs(ID3D12Device *pDevice,
-                      ID3D12GraphicsCommandList *pCommandList, LPCVOID values,
-                      UINT32 valueSizeInBytes, ID3D12Resource **ppUavResource,
-                      ID3D12Resource **ppReadBuffer,
-                      ID3D12Resource **ppUploadResource) {
-    CComPtr<ID3D12Resource> pUavResource;
-    CComPtr<ID3D12Resource> pReadBuffer;
-    CComPtr<ID3D12Resource> pUploadResource;
-    D3D12_SUBRESOURCE_DATA transferData;
-    D3D12_HEAP_PROPERTIES defaultHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT);
-    D3D12_HEAP_PROPERTIES uploadHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD);
-    D3D12_RESOURCE_DESC bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
-    D3D12_RESOURCE_DESC uploadBufferDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes);
-    CD3DX12_HEAP_PROPERTIES readHeap(D3D12_HEAP_TYPE_READBACK);
-    CD3DX12_RESOURCE_DESC readDesc(CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes));
-
-    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
-      &defaultHeapProperties,
-      D3D12_HEAP_FLAG_NONE,
-      &bufferDesc,
-      D3D12_RESOURCE_STATE_COPY_DEST,
-      nullptr,
-      IID_PPV_ARGS(&pUavResource)));
-
-    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
-      &uploadHeapProperties,
-      D3D12_HEAP_FLAG_NONE,
-      &uploadBufferDesc,
-      D3D12_RESOURCE_STATE_GENERIC_READ,
-      nullptr,
-      IID_PPV_ARGS(&pUploadResource)));
-
-    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
-      &readHeap, D3D12_HEAP_FLAG_NONE, &readDesc,
-      D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&pReadBuffer)));
-
-    transferData.pData = values;
-    transferData.RowPitch = valueSizeInBytes;
-    transferData.SlicePitch = transferData.RowPitch;
-
-    UpdateSubresources<1>(pCommandList, pUavResource.p, pUploadResource.p, 0, 0, 1, &transferData);
-    RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
-
-    *ppUavResource = pUavResource.Detach();
-    *ppReadBuffer = pReadBuffer.Detach();
-    *ppUploadResource = pUploadResource.Detach();
-  }
-
-  template <typename TVertex, int len>
-  void CreateVertexBuffer(ID3D12Device *pDevice, TVertex(&vertices)[len],
-                          ID3D12Resource **ppVertexBuffer,
-                          D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView) {
-    size_t vertexBufferSize = sizeof(vertices);
-    CComPtr<ID3D12Resource> pVertexBuffer;
-    CD3DX12_HEAP_PROPERTIES heapProps(D3D12_HEAP_TYPE_UPLOAD);
-    CD3DX12_RESOURCE_DESC bufferDesc(
-        CD3DX12_RESOURCE_DESC::Buffer(vertexBufferSize));
-    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
-        &heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc,
-        D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
-        IID_PPV_ARGS(&pVertexBuffer)));
-
-    UINT8 *pVertexDataBegin;
-    CD3DX12_RANGE readRange(0, 0);
-    VERIFY_SUCCEEDED(pVertexBuffer->Map(
-        0, &readRange, reinterpret_cast<void **>(&pVertexDataBegin)));
-    memcpy(pVertexDataBegin, vertices, vertexBufferSize);
-    pVertexBuffer->Unmap(0, nullptr);
-
-    // Initialize the vertex buffer view.
-    pVertexBufferView->BufferLocation = pVertexBuffer->GetGPUVirtualAddress();
-    pVertexBufferView->StrideInBytes = sizeof(TVertex);
-    pVertexBufferView->SizeInBytes = vertexBufferSize;
-
-    *ppVertexBuffer = pVertexBuffer.Detach();
-  }
-
-  // Requires Anniversary Edition headers, so simplifying things for current setup.
-  const UINT D3D12_FEATURE_D3D12_OPTIONS1 = 8;
-  struct D3D12_FEATURE_DATA_D3D12_OPTIONS1 {
-    BOOL WaveOps;
-    UINT WaveLaneCountMin;
-    UINT WaveLaneCountMax;
-    UINT TotalLaneCount;
-    BOOL ExpandedComputeResourceStates;
-    BOOL Int64ShaderOps;
-  };
-
-  bool DoesDeviceSupportInt64(ID3D12Device *pDevice) {
-    D3D12_FEATURE_DATA_D3D12_OPTIONS1 O;
-    if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &O, sizeof(O))))
-      return false;
-    return O.Int64ShaderOps != FALSE;
-  }
-
-  bool DoesDeviceSupportWaveOps(ID3D12Device *pDevice) {
-    D3D12_FEATURE_DATA_D3D12_OPTIONS1 O;
-    if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &O, sizeof(O))))
-      return false;
-    return O.WaveOps != FALSE;
-  }
-
-  void DXBCFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob) {
-    CW2A pEntryPointA(pEntryPoint, CP_UTF8);
-    CW2A pTargetProfileA(pTargetProfile, CP_UTF8);
-    CComPtr<ID3DBlob> pErrors;
-    D3D_SHADER_MACRO d3dMacro[2];
-    ZeroMemory(d3dMacro, sizeof(d3dMacro));
-    d3dMacro[0].Definition = "1";
-    d3dMacro[0].Name = "USING_DXBC";
-    HRESULT hr = D3DCompile(pText, strlen(pText), "hlsl.hlsl", d3dMacro, nullptr, pEntryPointA, pTargetProfileA, 0, 0, ppBlob, &pErrors);
-    if (pErrors != nullptr) {
-      CA2W errors((char *)pErrors->GetBufferPointer(), CP_ACP);
-      LogCommentFmt(L"Compilation failure: %s", errors.m_szBuffer);
-    }
-    VERIFY_SUCCEEDED(hr);
-  }
-
-  HRESULT EnableDebugLayer() {
-    // The debug layer does net yet validate DXIL programs that require rewriting,
-    // but basic logging should work properly.
-    HRESULT hr = S_FALSE;
-    if (UseDebugIfaces()) {
-      CComPtr<ID3D12Debug> debugController;
-      hr = D3D12GetDebugInterface(IID_PPV_ARGS(&debugController));
-      if (SUCCEEDED(hr)) {
-        debugController->EnableDebugLayer();
-        hr = S_OK;
-      }
-    }
-    return hr;
-  }
-
-  HRESULT EnableExperimentalMode() {
-    if (m_ExperimentalModeEnabled) {
-      return S_OK;
-    }
-    if (!GetTestParamBool(L"ExperimentalShaders")) {
-      return S_FALSE;
-    }
-    HRESULT hr = EnableExperimentalShaderModels();
-    if (SUCCEEDED(hr)) {
-      m_ExperimentalModeEnabled = true;
-    }
-    return hr;
-  }
-
-  struct FenceObj {
-    HANDLE m_fenceEvent = NULL;
-    CComPtr<ID3D12Fence> m_fence;
-    UINT64 m_fenceValue;
-    ~FenceObj() {
-      if (m_fenceEvent) CloseHandle(m_fenceEvent);
-    }
-  };
-
-  void InitFenceObj(ID3D12Device *pDevice, FenceObj *pObj) {
-    pObj->m_fenceValue = 1;
-    VERIFY_SUCCEEDED(pDevice->CreateFence(0, D3D12_FENCE_FLAG_NONE,
-                                          IID_PPV_ARGS(&pObj->m_fence)));
-    // Create an event handle to use for frame synchronization.
-    pObj->m_fenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
-    if (pObj->m_fenceEvent == nullptr) {
-      VERIFY_SUCCEEDED(HRESULT_FROM_WIN32(GetLastError()));
-    }
-  }
-
-  void ReadHlslDataIntoNewStream(LPCWSTR relativePath, IStream **ppStream) {
-    VERIFY_SUCCEEDED(m_support.Initialize());
-    CComPtr<IDxcLibrary> pLibrary;
-    CComPtr<IDxcBlobEncoding> pBlob;
-    CComPtr<IStream> pStream;
-    std::wstring path = GetPathToHlslDataFile(relativePath);
-    VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));
-    VERIFY_SUCCEEDED(pLibrary->CreateBlobFromFile(path.c_str(), nullptr, &pBlob));
-    VERIFY_SUCCEEDED(pLibrary->CreateStreamFromBlobReadOnly(pBlob, &pStream));
-    *ppStream = pStream.Detach();
-  }
-
-  void RecordRenderAndReadback(ID3D12GraphicsCommandList *pList,
-                               ID3D12DescriptorHeap *pRtvHeap,
-                               UINT rtvDescriptorSize,
-                               UINT instanceCount,
-                               D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView,
-                               ID3D12RootSignature *pRootSig,
-                               ID3D12Resource *pRenderTarget,
-                               ID3D12Resource *pReadBuffer) {
-    D3D12_RESOURCE_DESC rtDesc = pRenderTarget->GetDesc();
-    D3D12_VIEWPORT viewport;
-    D3D12_RECT scissorRect;
-
-    memset(&viewport, 0, sizeof(viewport));
-    viewport.Height = rtDesc.Height;
-    viewport.Width = rtDesc.Width;
-    viewport.MaxDepth = 1.0f;
-    memset(&scissorRect, 0, sizeof(scissorRect));
-    scissorRect.right = rtDesc.Width;
-    scissorRect.bottom = rtDesc.Height;
-    if (pRootSig != nullptr) {
-      pList->SetGraphicsRootSignature(pRootSig);
-    }
-    pList->RSSetViewports(1, &viewport);
-    pList->RSSetScissorRects(1, &scissorRect);
-
-    // Indicate that the buffer will be used as a render target.
-    RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_RENDER_TARGET);
-
-    CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle(pRtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, rtvDescriptorSize);
-    pList->OMSetRenderTargets(1, &rtvHandle, FALSE, nullptr);
-
-    pList->ClearRenderTargetView(rtvHandle, ClearColor, 0, nullptr);
-    pList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
-    pList->IASetVertexBuffers(0, 1, pVertexBufferView);
-    pList->DrawInstanced(3, instanceCount, 0, 0);
-
-    // Transition to copy source and copy into read-back buffer.
-    RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_COPY_SOURCE);
-
-    // Copy into read-back buffer.
-    UINT rowPitch = rtDesc.Width * 4;
-    if (rowPitch % D3D12_TEXTURE_DATA_PITCH_ALIGNMENT)
-      rowPitch += D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - (rowPitch % D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
-    D3D12_PLACED_SUBRESOURCE_FOOTPRINT Footprint;
-    Footprint.Offset = 0;
-    Footprint.Footprint = CD3DX12_SUBRESOURCE_FOOTPRINT(DXGI_FORMAT_R8G8B8A8_UNORM, rtDesc.Width, rtDesc.Height, 1, rowPitch);
-    CD3DX12_TEXTURE_COPY_LOCATION DstLoc(pReadBuffer, Footprint);
-    CD3DX12_TEXTURE_COPY_LOCATION SrcLoc(pRenderTarget, 0);
-    pList->CopyTextureRegion(&DstLoc, 0, 0, 0, &SrcLoc, nullptr);
-  }
-
-  void RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR shader, std::vector<uint32_t> &values);
-
-  void SetDescriptorHeap(ID3D12GraphicsCommandList *pCommandList, ID3D12DescriptorHeap *pHeap) {
-    ID3D12DescriptorHeap *const pHeaps[1] = { pHeap };
-    pCommandList->SetDescriptorHeaps(1, pHeaps);
-  }
-
-  void WaitForSignal(ID3D12CommandQueue *pCQ, FenceObj &FO) {
-    ::WaitForSignal(pCQ, FO.m_fence, FO.m_fenceEvent, FO.m_fenceValue++);
-  }
-};
-
-const float ExecutionTest::ClearColor[4] = { 0.0f, 0.2f, 0.4f, 1.0f };
-
-#define WAVE_INTRINSIC_DXBC_GUARD \
-  "#ifdef USING_DXBC\r\n" \
-  "uint WaveGetLaneIndex() { return 1; }\r\n" \
-  "uint WaveReadLaneFirst(uint u) { return u; }\r\n" \
-  "bool WaveIsFirstLane() { return true; }\r\n" \
-  "uint WaveGetLaneCount() { return 1; }\r\n" \
-  "uint WaveReadLaneAt(uint n, uint u) { return u; }\r\n" \
-  "bool WaveActiveAnyTrue(bool b) { return b; }\r\n" \
-  "bool WaveActiveAllTrue(bool b) { return false; }\r\n" \
-  "uint WaveActiveAllEqual(uint u) { return u; }\r\n" \
-  "uint4 WaveActiveBallot(bool b) { return 1; }\r\n" \
-  "uint WaveActiveCountBits(uint u) { return 1; }\r\n" \
-  "uint WaveActiveSum(uint u) { return 1; }\r\n" \
-  "uint WaveActiveProduct(uint u) { return 1; }\r\n" \
-  "uint WaveActiveBitAnd(uint u) { return 1; }\r\n" \
-  "uint WaveActiveBitOr(uint u) { return 1; }\r\n" \
-  "uint WaveActiveBitXor(uint u) { return 1; }\r\n" \
-  "uint WaveActiveMin(uint u) { return 1; }\r\n" \
-  "uint WaveActiveMax(uint u) { return 1; }\r\n" \
-  "uint WavePrefixCountBits(uint u) { return 1; }\r\n" \
-  "uint WavePrefixSum(uint u) { return 1; }\r\n" \
-  "uint WavePrefixProduct(uint u) { return 1; }\r\n" \
-  "uint QuadReadLaneAt(uint a, uint u) { return 1; }\r\n" \
-  "uint QuadReadAcrossX(uint u) { return 1; }\r\n" \
-  "uint QuadReadAcrossY(uint u) { return 1; }\r\n" \
-  "uint QuadReadAcrossDiagonal(uint u) { return 1; }\r\n" \
-  "#endif\r\n"
-
-
-static void SetupComputeValuePattern(std::vector<uint32_t> &values, size_t count) {
-  values.resize(count); // one element per dispatch group, in bytes
-  for (size_t i = 0; i < count; ++i) {
-    values[i] = i;
-  }
-}
-
-bool ExecutionTest::ExecutionTestClassSetup() {
-  HRESULT hr = EnableExperimentalMode();
-  if (FAILED(hr)) {
-    LogCommentFmt(L"Unable to enable shader experimental mode - 0x%08x.", hr);
-  }
-  else if (hr == S_FALSE) {
-    LogCommentFmt(L"Experimental mode not enabled.");
-  }
-  else {
-    LogCommentFmt(L"Experimental mode enabled.");
-  }
-  hr = EnableDebugLayer();
-  if (FAILED(hr)) {
-    LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", hr);
-  }
-  else {
-    LogCommentFmt(L"Debug layer enabled.");
-  }
-  return true;
-}
-
-void ExecutionTest::RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR pShader, std::vector<uint32_t> &values) {
-  static const int DispatchGroupX = 1;
-  static const int DispatchGroupY = 1;
-  static const int DispatchGroupZ = 1;
-
-  CComPtr<ID3D12GraphicsCommandList> pCommandList;
-  CComPtr<ID3D12CommandQueue> pCommandQueue;
-  CComPtr<ID3D12DescriptorHeap> pUavHeap;
-  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
-  UINT uavDescriptorSize;
-  FenceObj FO;
-
-  const size_t valueSizeInBytes = values.size() * sizeof(uint32_t);
-  CreateComputeCommandQueue(pDevice, L"RunRWByteBufferComputeTest Command Queue", &pCommandQueue);
-  InitFenceObj(pDevice, &FO);
-
-  // Describe and create a UAV descriptor heap.
-  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
-  heapDesc.NumDescriptors = 1;
-  heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
-  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
-  VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
-  uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);
-
-  // Create root signature.
-  CComPtr<ID3D12RootSignature> pRootSignature;
-  {
-    CD3DX12_DESCRIPTOR_RANGE ranges[1];
-    ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);
-
-    CD3DX12_ROOT_PARAMETER rootParameters[1];
-    rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
-
-    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
-    rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
-
-    CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
-  }
-
-  // Create pipeline state object.
-  CComPtr<ID3D12PipelineState> pComputeState;
-  CreateComputePSO(pDevice, pRootSignature, pShader, &pComputeState);
-
-  // Create a command allocator and list for compute.
-  VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
-  VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
-  pCommandList->SetName(L"ExecutionTest::RunRWByteButterComputeTest Command List");
-
-  // Set up UAV resource.
-  CComPtr<ID3D12Resource> pUavResource;
-  CComPtr<ID3D12Resource> pReadBuffer;
-  CComPtr<ID3D12Resource> pUploadResource;
-  CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pReadBuffer, &pUploadResource);
-  VERIFY_SUCCEEDED(pUavResource->SetName(L"RunRWByteBufferComputeText UAV"));
-  VERIFY_SUCCEEDED(pReadBuffer->SetName(L"RunRWByteBufferComputeText UAV Read Buffer"));
-  VERIFY_SUCCEEDED(pUploadResource->SetName(L"RunRWByteBufferComputeText UAV Upload Buffer"));
-
-  // Close the command list and execute it to perform the GPU setup.
-  pCommandList->Close();
-  ExecuteCommandList(pCommandQueue, pCommandList);
-  WaitForSignal(pCommandQueue, FO);
-  VERIFY_SUCCEEDED(pCommandAllocator->Reset());
-  VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
-
-  // Run the compute shader and copy the results back to readable memory.
-  {
-    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
-    uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
-    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
-    uavDesc.Buffer.FirstElement = 0;
-    uavDesc.Buffer.NumElements = values.size();
-    uavDesc.Buffer.StructureByteStride = 0;
-    uavDesc.Buffer.CounterOffsetInBytes = 0;
-    uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
-    CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
-    CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
-    pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
-    SetDescriptorHeap(pCommandList, pUavHeap);
-    pCommandList->SetComputeRootSignature(pRootSignature);
-    pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
-  }
-  pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
-  RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
-  pCommandList->CopyResource(pReadBuffer, pUavResource);
-  pCommandList->Close();
-  ExecuteCommandList(pCommandQueue, pCommandList);
-  WaitForSignal(pCommandQueue, FO);
-  {
-    MappedData mappedData(pReadBuffer, valueSizeInBytes);
-    uint32_t *pData = (uint32_t *)mappedData.data();
-    memcpy(values.data(), pData, valueSizeInBytes);
-  }
-  WaitForSignal(pCommandQueue, FO);
-}
-
-TEST_F(ExecutionTest, BasicComputeTest) {
-  //
-  // BasicComputeTest is a simple compute shader that can be used as the basis
-  // for more interesting compute execution tests.
-  // The HLSL is compatible with shader models <=5.1 to allow using the DXBC
-  // rendering code paths for comparison.
-  //
-  static const char pShader[] =
-    "RWByteAddressBuffer g_bab : register(u0);\r\n"
-    "[numthreads(8,8,1)]\r\n"
-    "void main(uint GI : SV_GroupIndex) {"
-    "  uint addr = GI * 4;\r\n"
-    "  uint val = g_bab.Load(addr);\r\n"
-    "  DeviceMemoryBarrierWithGroupSync();\r\n"
-    "  g_bab.Store(addr, val + 1);\r\n"
-    "}";
-  static const int NumThreadsX = 8;
-  static const int NumThreadsY = 8;
-  static const int NumThreadsZ = 1;
-  static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
-  static const int DispatchGroupCount = 1;
-
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-    return;
-
-  std::vector<uint32_t> values;
-  SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);
-  VERIFY_ARE_EQUAL(values[0], 0);
-  RunRWByteBufferComputeTest(pDevice, pShader, values);
-  VERIFY_ARE_EQUAL(values[0], 1);
-}
-
-TEST_F(ExecutionTest, BasicTriangleTest) {
-  static const UINT FrameCount = 2;
-  static const UINT m_width = 320;
-  static const UINT m_height = 200;
-  static const float m_aspectRatio = static_cast<float>(m_width) / static_cast<float>(m_height);
-
-  struct Vertex {
-    XMFLOAT3 position;
-    XMFLOAT4 color;
-  };
-
-  // Pipeline objects.
-  CComPtr<ID3D12Device> pDevice;
-  CComPtr<ID3D12Resource> pRenderTarget;
-  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
-  CComPtr<ID3D12CommandQueue> pCommandQueue;
-  CComPtr<ID3D12RootSignature> pRootSig;
-  CComPtr<ID3D12DescriptorHeap> pRtvHeap;
-  CComPtr<ID3D12PipelineState> pPipelineState;
-  CComPtr<ID3D12GraphicsCommandList> pCommandList;
-  CComPtr<ID3D12Resource> pReadBuffer;
-  UINT rtvDescriptorSize;
-
-  CComPtr<ID3D12Resource> pVertexBuffer;
-  D3D12_VERTEX_BUFFER_VIEW vertexBufferView;
-
-  // Synchronization objects.
-  FenceObj FO;
-
-  // Shaders.
-  static const char pShaders[] =
-    "struct PSInput {\r\n"
-    "  float4 position : SV_POSITION;\r\n"
-    "  float4 color : COLOR;\r\n"
-    "};\r\n\r\n"
-    "PSInput VSMain(float4 position : POSITION, float4 color : COLOR) {\r\n"
-    "  PSInput result;\r\n"
-    "\r\n"
-    "  result.position = position;\r\n"
-    "  result.color = color;\r\n"
-    "  return result;\r\n"
-    "}\r\n\r\n"
-    "float4 PSMain(PSInput input) : SV_TARGET {\r\n"
-    "  return 1; //input.color;\r\n"
-    "};\r\n";
-
-  if (!CreateDevice(&pDevice))
-    return;
-
-  struct BasicTestChecker {
-    CComPtr<ID3D12Device> m_pDevice;
-    CComPtr<ID3D12InfoQueue> m_pInfoQueue;
-    bool m_OK = false;
-    void SetOK(bool value) { m_OK = value; }
-    BasicTestChecker(ID3D12Device *pDevice) : m_pDevice(pDevice) {
-      if (FAILED(m_pDevice.QueryInterface(&m_pInfoQueue)))
-        return;
-      m_pInfoQueue->PushEmptyStorageFilter();
-      m_pInfoQueue->PushEmptyRetrievalFilter();
-    }
-    ~BasicTestChecker() {
-      if (!m_OK && m_pInfoQueue != nullptr) {
-        UINT64 count = m_pInfoQueue->GetNumStoredMessages();
-        bool invalidBytecodeFound = false;
-        CAtlArray<BYTE> m_pBytes;
-        for (UINT64 i = 0; i < count; ++i) {
-          SIZE_T len = 0;
-          if (FAILED(m_pInfoQueue->GetMessageA(i, nullptr, &len)))
-            continue;
-          if (m_pBytes.GetCount() < len && !m_pBytes.SetCount(len))
-            continue;
-          D3D12_MESSAGE *pMsg = (D3D12_MESSAGE *)m_pBytes.GetData();
-          if (FAILED(m_pInfoQueue->GetMessageA(i, pMsg, &len)))
-            continue;
-          if (pMsg->ID == D3D12_MESSAGE_ID_CREATEVERTEXSHADER_INVALIDSHADERBYTECODE ||
-              pMsg->ID == D3D12_MESSAGE_ID_CREATEPIXELSHADER_INVALIDSHADERBYTECODE) {
-            invalidBytecodeFound = true;
-            break;
-          }
-        }
-        if (invalidBytecodeFound) {
-          LogCommentFmt(L"%s", L"Found an invalid bytecode message. This "
-            L"typically indicates that experimental mode "
-            L"is not set up properly.");
-          if (!GetTestParamBool(L"ExperimentalShaders")) {
-            LogCommentFmt(L"Note that the ExperimentalShaders test parameter isn't set.");
-          }
-        }
-        else {
-          LogCommentFmt(L"Did not find corrupt pixel or vertex shaders in "
-                        L"queue - dumping complete queue.");
-          WriteInfoQueueMessages(nullptr, OutputFn, m_pInfoQueue);
-        }
-      }
-    }
-    static void __stdcall OutputFn(void *pCtx, const wchar_t *pMsg) {
-      LogCommentFmt(L"%s", pMsg);
-    }
-  };
-  BasicTestChecker BTC(pDevice);
-  {
-    InitFenceObj(pDevice, &FO);
-    CreateRtvDescriptorHeap(pDevice, FrameCount, &pRtvHeap, &rtvDescriptorSize);
-    CreateRenderTargetAndReadback(pDevice, pRtvHeap, m_width, m_height, &pRenderTarget, &pReadBuffer);
-
-    // Create an empty root signature.
-    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
-    rootSignatureDesc.Init(
-      0, nullptr, 0, nullptr,
-      D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);
-    CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSig);
-
-    // Create the pipeline state, which includes compiling and loading shaders.
-    // Define the vertex input layout.
-    D3D12_INPUT_ELEMENT_DESC inputElementDescs[] = {
-        {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0,
-         D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
-        {"COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12,
-         D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}};
-    D3D12_INPUT_LAYOUT_DESC InputLayout = { inputElementDescs, _countof(inputElementDescs) };
-    CreateGraphicsPSO(pDevice, &InputLayout, pRootSig, pShaders, &pPipelineState);
-
-    CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue,
-                                      &pCommandAllocator, &pCommandList,
-                                      pPipelineState);
-
-    // Define the geometry for a triangle.
-    Vertex triangleVertices[] = {
-      { { 0.0f, 0.25f * m_aspectRatio, 0.0f },{ 1.0f, 0.0f, 0.0f, 1.0f } },
-      { { 0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 1.0f, 0.0f, 1.0f } },
-      { { -0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 0.0f, 1.0f, 1.0f } } };
-
-    CreateVertexBuffer(pDevice, triangleVertices, &pVertexBuffer, &vertexBufferView);
-    WaitForSignal(pCommandQueue, FO);
-  }
-
-  // Render and execute the command list.
-  RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, 1,
-                          &vertexBufferView, pRootSig, pRenderTarget,
-                          pReadBuffer);
-  VERIFY_SUCCEEDED(pCommandList->Close());
-  ExecuteCommandList(pCommandQueue, pCommandList);
-
-  // Wait for previous frame.
-  WaitForSignal(pCommandQueue, FO);
-
-  // At this point, we've verified that execution succeeded with DXIL.
-  BTC.SetOK(true);
-
-  // Read back to CPU and examine contents.
-  {
-    MappedData data(pReadBuffer, m_width * m_height * 4);
-    const uint32_t *pPixels = (uint32_t *)data.data();
-    if (SaveImages()) {
-      SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, m_width, m_height, L"basic.bmp");
-    }
-    uint32_t top = pPixels[m_width / 2]; // Top center.
-    uint32_t mid = pPixels[m_width / 2 + m_width * (m_height / 2)]; // Middle center.
-    VERIFY_ARE_EQUAL(0xff663300, top); // clear color
-    VERIFY_ARE_EQUAL(0xffffffff, mid); // white
-  }
-}
-
-TEST_F(ExecutionTest, Int64Test) {
-  static const char pShader[] =
-    "RWByteAddressBuffer g_bab : register(u0);\r\n"
-    "[numthreads(8,8,1)]\r\n"
-    "void main(uint GI : SV_GroupIndex) {"
-    "  uint addr = GI * 4;\r\n"
-    "  uint val = g_bab.Load(addr);\r\n"
-    "  uint64_t u64 = val;\r\n"
-    "  u64 *= val;\r\n"
-    "  g_bab.Store(addr, (uint)(u64 >> 32));\r\n"
-    "}";
-  static const int NumThreadsX = 8;
-  static const int NumThreadsY = 8;
-  static const int NumThreadsZ = 1;
-  static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
-  static const int DispatchGroupCount = 1;
-
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-    return;
-
-  if (!DoesDeviceSupportInt64(pDevice)) {
-    // Optional feature, so it's correct to not support it if declared as such.
-    WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
-    return;
-  }
-  std::vector<uint32_t> values;
-  SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);
-  VERIFY_ARE_EQUAL(values[0], 0);
-  RunRWByteBufferComputeTest(pDevice, pShader, values);
-  VERIFY_ARE_EQUAL(values[0], 0);
-}
-
-TEST_F(ExecutionTest, SignTest) {
-  static const char pShader[] =
-    "RWByteAddressBuffer g_bab : register(u0);\r\n"
-    "[numthreads(8,1,1)]\r\n"
-    "void main(uint GI : SV_GroupIndex) {"
-    "  uint addr = GI * 4;\r\n"
-    "  int val = g_bab.Load(addr);\r\n"
-    "  g_bab.Store(addr, (uint)(sign(val)));\r\n"
-    "}";
-  static const int NumThreadsX = 8;
-  static const int NumThreadsY = 1;
-  static const int NumThreadsZ = 1;
-  static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
-  static const int DispatchGroupCount = 1;
-
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-    return;
-
-  std::vector<uint32_t> values = { (uint32_t)-3, (uint32_t)-2, (uint32_t)-1, 0, 1, 2, 3, 4};
-  RunRWByteBufferComputeTest(pDevice, pShader, values);
-  VERIFY_ARE_EQUAL(values[0], -1);
-  VERIFY_ARE_EQUAL(values[1], -1);
-  VERIFY_ARE_EQUAL(values[2], -1);
-  VERIFY_ARE_EQUAL(values[3], 0);
-  VERIFY_ARE_EQUAL(values[4], 1);
-  VERIFY_ARE_EQUAL(values[5], 1);
-  VERIFY_ARE_EQUAL(values[6], 1);
-  VERIFY_ARE_EQUAL(values[7], 1);
-}
-
-TEST_F(ExecutionTest, WaveIntrinsicsTest) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-
-  struct PerThreadData {
-    uint32_t id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;
-    uint32_t allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;
-    uint32_t pfBC, pfSum, pfProd;
-    uint32_t ballot[4];
-    uint32_t diver;   // divergent value, used in calculation
-    int32_t i_diver;  // divergent value, used in calculation
-    int32_t i_allMax, i_allMin, i_allSum, i_allProd;
-    int32_t i_pfSum, i_pfProd;
-  };
-  static const char pShader[] =
-    WAVE_INTRINSIC_DXBC_GUARD
-    "struct PerThreadData {\r\n"
-    " uint id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;\r\n"
-    " uint allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;\r\n"
-    " uint pfBC, pfSum, pfProd;\r\n"
-    " uint4 ballot;\r\n"
-    " uint diver;\r\n"
-    " int i_diver;\r\n"
-    " int i_allMax, i_allMin, i_allSum, i_allProd;\r\n"
-    " int i_pfSum, i_pfProd;\r\n"
-    "};\r\n"
-    "RWStructuredBuffer<PerThreadData> g_sb : register(u0);\r\n"
-    "[numthreads(8,8,1)]\r\n"
-    "void main(uint GI : SV_GroupIndex, uint3 GTID : SV_GroupThreadID) {"
-    "  PerThreadData pts = g_sb[GI];\r\n"
-    "  uint diver = GTID.x + 2;\r\n"
-    "  pts.diver = diver;\r\n"
-    "  pts.flags = 0;\r\n"
-    "  pts.preds = 0;\r\n"
-    "  if (WaveIsFirstLane()) pts.flags |= 1;\r\n"
-    "  pts.laneIndex = WaveGetLaneIndex();\r\n"
-    "  pts.laneCount = WaveGetLaneCount();\r\n"
-    "  pts.firstLaneId = WaveReadLaneFirst(pts.id);\r\n"
-    "  pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);\r\n"
-    "  pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);\r\n"
-    "  pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);\r\n"
-    "  pts.preds |= ((WaveActiveAllEqual(GTID.z) ? 1 : 0) << 3);\r\n"
-    "  pts.preds |= ((WaveActiveAllEqual(WaveReadLaneFirst(diver)) ? 1 : 0) << 4);\r\n"
-    "  pts.ballot = WaveActiveBallot(diver > 3);\r\n"
-    "  pts.firstlaneX = WaveReadLaneFirst(GTID.x);\r\n"
-    "  pts.lane1X = WaveReadLaneAt(GTID.x, 1);\r\n"
-    "\r\n"
-    "  pts.allBC = WaveActiveCountBits(diver > 3);\r\n"
-    "  pts.allSum = WaveActiveSum(diver);\r\n"
-    "  pts.allProd = WaveActiveProduct(diver);\r\n"
-    "  pts.allAND = WaveActiveBitAnd(diver);\r\n"
-    "  pts.allOR = WaveActiveBitOr(diver);\r\n"
-    "  pts.allXOR = WaveActiveBitXor(diver);\r\n"
-    "  pts.allMin = WaveActiveMin(diver);\r\n"
-    "  pts.allMax = WaveActiveMax(diver);\r\n"
-    "\r\n"
-    "  pts.pfBC = WavePrefixCountBits(diver > 3);\r\n"
-    "  pts.pfSum = WavePrefixSum(diver);\r\n"
-    "  pts.pfProd = WavePrefixProduct(diver);\r\n"
-    "\r\n"
-    "  int i_diver = pts.i_diver;\r\n"
-    "  pts.i_allMax = WaveActiveMax(i_diver);\r\n"
-    "  pts.i_allMin = WaveActiveMin(i_diver);\r\n"
-    "  pts.i_allSum = WaveActiveSum(i_diver);\r\n"
-    "  pts.i_allProd = WaveActiveProduct(i_diver);\r\n"
-    "  pts.i_pfSum = WavePrefixSum(i_diver);\r\n"
-    "  pts.i_pfProd = WavePrefixProduct(i_diver);\r\n"
-    "\r\n"
-    "  g_sb[GI] = pts;\r\n"
-    "}";
-  static const int NumtheadsX = 8;
-  static const int NumtheadsY = 8;
-  static const int NumtheadsZ = 1;
-  static const int ThreadsPerGroup = NumtheadsX * NumtheadsY * NumtheadsZ;
-  static const int DispatchGroupCount = 1;
-
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-    return;
-
-  if (!DoesDeviceSupportWaveOps(pDevice)) {
-    // Optional feature, so it's correct to not support it if declared as such.
-    WEX::Logging::Log::Comment(L"Device does not support wave operations.");
-    return;
-  }
-
-  std::vector<PerThreadData> values;
-  values.resize(ThreadsPerGroup * DispatchGroupCount);
-  for (size_t i = 0; i < values.size(); ++i) {
-    memset(&values[i], 0, sizeof(PerThreadData));
-    values[i].id = i;
-    values[i].i_diver = (int)i;
-    values[i].i_diver *= (i % 2) ? 1 : -1;
-  }
-
-  static const int DispatchGroupX = 1;
-  static const int DispatchGroupY = 1;
-  static const int DispatchGroupZ = 1;
-
-  CComPtr<ID3D12GraphicsCommandList> pCommandList;
-  CComPtr<ID3D12CommandQueue> pCommandQueue;
-  CComPtr<ID3D12DescriptorHeap> pUavHeap;
-  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
-  UINT uavDescriptorSize;
-  FenceObj FO;
-  bool dxbc = UseDxbc();
-
-  const size_t valueSizeInBytes = values.size() * sizeof(PerThreadData);
-  CreateComputeCommandQueue(pDevice, L"WaveIntrinsicsTest Command Queue", &pCommandQueue);
-  InitFenceObj(pDevice, &FO);
-
-  // Describe and create a UAV descriptor heap.
-  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
-  heapDesc.NumDescriptors = 1;
-  heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
-  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
-  VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
-  uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);
-
-  // Create root signature.
-  CComPtr<ID3D12RootSignature> pRootSignature;
-  {
-    CD3DX12_DESCRIPTOR_RANGE ranges[1];
-    ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);
-
-    CD3DX12_ROOT_PARAMETER rootParameters[1];
-    rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
-
-    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
-    rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
-
-    CComPtr<ID3DBlob> signature;
-    CComPtr<ID3DBlob> error;
-    VERIFY_SUCCEEDED(D3D12SerializeRootSignature(&rootSignatureDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error));
-    VERIFY_SUCCEEDED(pDevice->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(), IID_PPV_ARGS(&pRootSignature)));
-  }
-
-  // Create pipeline state object.
-  CComPtr<ID3D12PipelineState> pComputeState;
-  CreateComputePSO(pDevice, pRootSignature, pShader, &pComputeState);
-
-  // Create a command allocator and list for compute.
-  VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
-  VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
-
-  // Set up UAV resource.
-  CComPtr<ID3D12Resource> pUavResource;
-  CComPtr<ID3D12Resource> pReadBuffer;
-  CComPtr<ID3D12Resource> pUploadResource;
-  CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pReadBuffer, &pUploadResource);
-
-  // Close the command list and execute it to perform the GPU setup.
-  pCommandList->Close();
-  ExecuteCommandList(pCommandQueue, pCommandList);
-  WaitForSignal(pCommandQueue, FO);
-  VERIFY_SUCCEEDED(pCommandAllocator->Reset());
-  VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
-
-  // Run the compute shader and copy the results back to readable memory.
-  {
-    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
-    uavDesc.Format = DXGI_FORMAT_UNKNOWN;
-    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
-    uavDesc.Buffer.FirstElement = 0;
-    uavDesc.Buffer.NumElements = values.size();
-    uavDesc.Buffer.StructureByteStride = sizeof(PerThreadData);
-    uavDesc.Buffer.CounterOffsetInBytes = 0;
-    uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
-    CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
-    CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
-    pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
-    SetDescriptorHeap(pCommandList, pUavHeap);
-    pCommandList->SetComputeRootSignature(pRootSignature);
-    pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
-  }
-  pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
-  RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
-  pCommandList->CopyResource(pReadBuffer, pUavResource);
-  pCommandList->Close();
-  ExecuteCommandList(pCommandQueue, pCommandList);
-  WaitForSignal(pCommandQueue, FO);
-  {
-    MappedData mappedData(pReadBuffer, valueSizeInBytes);
-    PerThreadData *pData = (PerThreadData *)mappedData.data();
-    memcpy(values.data(), pData, valueSizeInBytes);
-
-    // Gather some general data.
-    // The 'firstLaneId' captures a unique number per first-lane per wave.
-    // Counting the number distinct firstLaneIds gives us the number of waves.
-    std::vector<uint32_t> firstLaneIds;
-    for (size_t i = 0; i < values.size(); ++i) {
-      PerThreadData &pts = values[i];
-      uint32_t firstLaneId = pts.firstLaneId;
-      if (!contains(firstLaneIds, firstLaneId)) {
-        firstLaneIds.push_back(firstLaneId);
-      }
-    }
-
-    // Waves should cover 4 threads or more.
-    LogCommentFmt(L"Found %u distinct lane ids: %u", firstLaneIds.size());
-    if (!dxbc) {
-      VERIFY_IS_GREATER_THAN_OR_EQUAL(values.size() / 4, firstLaneIds.size());
-    }
-
-    // Now, group threads into waves.
-    std::map<uint32_t, std::unique_ptr<std::vector<PerThreadData *> > > waves;
-    for (size_t i = 0; i < firstLaneIds.size(); ++i) {
-      waves[firstLaneIds[i]] = std::make_unique<std::vector<PerThreadData *> >();
-    }
-    for (size_t i = 0; i < values.size(); ++i) {
-      PerThreadData &pts = values[i];
-      std::unique_ptr<std::vector<PerThreadData *> > &wave = waves[pts.firstLaneId];
-      wave->push_back(&pts);
-    }
-
-    // Verify that all the wave values are coherent across the wave.
-    for (size_t i = 0; i < values.size(); ++i) {
-      PerThreadData &pts = values[i];
-      std::unique_ptr<std::vector<PerThreadData *> > &wave = waves[pts.firstLaneId];
-      // Sort the lanes by increasing lane ID.
-      struct LaneIdOrderPred {
-        bool operator()(PerThreadData *a, PerThreadData *b) {
-          return a->laneIndex < b->laneIndex;
-        }
-      };
-      std::sort(wave.get()->begin(), wave.get()->end(), LaneIdOrderPred());
-
-      // Verify some interesting properties of the first lane.
-      uint32_t pfBC, pfSum, pfProd;
-      int32_t i_pfSum, i_pfProd;
-      int32_t i_allMax, i_allMin;
-      {
-        PerThreadData *ptdFirst = wave->front();
-        VERIFY_IS_TRUE(0 != (ptdFirst->flags & 1)); // FirstLane sets this bit.
-        VERIFY_IS_TRUE(0 == ptdFirst->pfBC);
-        VERIFY_IS_TRUE(0 == ptdFirst->pfSum);
-        VERIFY_IS_TRUE(1 == ptdFirst->pfProd);
-        VERIFY_IS_TRUE(0 == ptdFirst->i_pfSum);
-        VERIFY_IS_TRUE(1 == ptdFirst->i_pfProd);
-        pfBC = (ptdFirst->diver > 3) ? 1 : 0;
-        pfSum = ptdFirst->diver;
-        pfProd = ptdFirst->diver;
-        i_pfSum = ptdFirst->i_diver;
-        i_pfProd = ptdFirst->i_diver;
-        i_allMax = i_allMin = ptdFirst->i_diver;
-      }
-
-      // Calculate values which take into consideration all lanes.
-      uint32_t preds = 0;
-      preds |= 1 << 1; // AllTrue starts true, switches to false if needed.
-      preds |= 1 << 2; // AllEqual starts true, switches to false if needed.
-      preds |= 1 << 3; // WaveActiveAllEqual(GTID.z) is always true
-      preds |= 1 << 4; // (WaveActiveAllEqual(WaveReadLaneFirst(diver)) is always true
-      uint32_t ballot[4] = { 0, 0, 0, 0 };
-      int32_t i_allSum = 0, i_allProd = 1;
-      for (size_t n = 0; n < wave->size(); ++n) {
-        std::vector<PerThreadData *> &lanes = *wave.get();
-        // pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);
-        if (lanes[n]->diver == 1) preds |= (1 << 0);
-        // pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);
-        if (lanes[n]->diver != 1) preds &= ~(1 << 1);
-        // pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);
-        if (lanes[0]->diver != lanes[n]->diver) preds &= ~(1 << 2);
-        // pts.ballot = WaveActiveBallot(diver > 3);\r\n"
-        if (lanes[n]->diver > 3) {
-          // This is the uint4 result layout:
-          // .x -> bits  0 .. 31
-          // .y -> bits 32 .. 63
-          // .z -> bits 64 .. 95
-          // .w -> bits 96 ..127
-          uint32_t component = lanes[n]->laneIndex / 32;
-          uint32_t bit = lanes[n]->laneIndex % 32;
-          ballot[component] |= 1 << bit;
-        }
-        i_allMax = std::max(lanes[n]->i_diver, i_allMax);
-        i_allMin = std::min(lanes[n]->i_diver, i_allMin);
-        i_allProd *= lanes[n]->i_diver;
-        i_allSum += lanes[n]->i_diver;
-      }
-
-      for (size_t n = 1; n < wave->size(); ++n) {
-        // 'All' operations are uniform across the wave.
-        std::vector<PerThreadData *> &lanes = *wave.get();
-        VERIFY_IS_TRUE(0 == (lanes[n]->flags & 1)); // non-firstlanes do not set this bit
-        VERIFY_ARE_EQUAL(lanes[0]->allBC, lanes[n]->allBC);
-        VERIFY_ARE_EQUAL(lanes[0]->allSum, lanes[n]->allSum);
-        VERIFY_ARE_EQUAL(lanes[0]->allProd, lanes[n]->allProd);
-        VERIFY_ARE_EQUAL(lanes[0]->allAND, lanes[n]->allAND);
-        VERIFY_ARE_EQUAL(lanes[0]->allOR, lanes[n]->allOR);
-        VERIFY_ARE_EQUAL(lanes[0]->allXOR, lanes[n]->allXOR);
-        VERIFY_ARE_EQUAL(lanes[0]->allMin, lanes[n]->allMin);
-        VERIFY_ARE_EQUAL(lanes[0]->allMax, lanes[n]->allMax);
-        VERIFY_ARE_EQUAL(i_allMax, lanes[n]->i_allMax);
-        VERIFY_ARE_EQUAL(i_allMin, lanes[n]->i_allMin);
-        VERIFY_ARE_EQUAL(i_allProd, lanes[n]->i_allProd);
-        VERIFY_ARE_EQUAL(i_allSum, lanes[n]->i_allSum);
-
-        // first-lane reads and uniform reads are uniform across the wave.
-        VERIFY_ARE_EQUAL(lanes[0]->firstlaneX, lanes[n]->firstlaneX);
-        VERIFY_ARE_EQUAL(lanes[0]->lane1X, lanes[n]->lane1X);
-
-        // the lane count is uniform across the wave.
-        VERIFY_ARE_EQUAL(lanes[0]->laneCount, lanes[n]->laneCount);
-
-        // The predicates are uniform across the wave.
-        VERIFY_ARE_EQUAL(lanes[n]->preds, preds);
-
-        // the lane index is distinct per thread.
-        for (size_t prior = 0; prior < n; ++prior) {
-          VERIFY_ARE_NOT_EQUAL(lanes[prior]->laneIndex, lanes[n]->laneIndex);
-        }
-        // Ballot results are uniform across the wave.
-        VERIFY_ARE_EQUAL(0, memcmp(ballot, lanes[n]->ballot, sizeof(ballot)));
-
-        // Keep running total of prefix calculation. Prefix values are exclusive to
-        // the executing lane.
-        VERIFY_ARE_EQUAL(pfBC, lanes[n]->pfBC);
-        VERIFY_ARE_EQUAL(pfSum, lanes[n]->pfSum);
-        VERIFY_ARE_EQUAL(pfProd, lanes[n]->pfProd);
-        VERIFY_ARE_EQUAL(i_pfSum, lanes[n]->i_pfSum);
-        VERIFY_ARE_EQUAL(i_pfProd, lanes[n]->i_pfProd);
-        pfBC += (lanes[n]->diver > 3) ? 1 : 0;
-        pfSum += lanes[n]->diver;
-        pfProd *= lanes[n]->diver;
-        i_pfSum += lanes[n]->i_diver;
-        i_pfProd *= lanes[n]->i_diver;
-      }
-      // TODO: add divergent branching and verify that the otherwise uniform values properly diverge
-    }
-
-    // Compare each value of each per-thread element.
-    for (size_t i = 0; i < values.size(); ++i) {
-      PerThreadData &pts = values[i];
-      VERIFY_ARE_EQUAL(i, pts.id); // ID is unchanged.
-    }
-  }
-}
-
-TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-
-  struct Vertex {
-    XMFLOAT3 position;
-  };
-
-  struct PerPixelData {
-    XMFLOAT4 position;
-    uint32_t id, flags, laneIndex, laneCount, firstLaneId, sum1;
-    uint32_t id0, id1, id2, id3;
-    uint32_t acrossX, acrossY, acrossDiag, quadActiveCount;
-  };
-
-  const UINT RTWidth = 128;
-  const UINT RTHeight = 128;
-
-  // Shaders.
-  static const char pShaders[] =
-    WAVE_INTRINSIC_DXBC_GUARD
-    "struct PSInput {\r\n"
-    "  float4 position : SV_POSITION;\r\n"
-    "};\r\n\r\n"
-    "PSInput VSMain(float4 position : POSITION) {\r\n"
-    "  PSInput result;\r\n"
-    "\r\n"
-    "  result.position = position;\r\n"
-    "  return result;\r\n"
-    "}\r\n\r\n"
-    "typedef uint uint32_t;\r\n"
-    "uint pos_to_id(float4 pos) { return pos.x * 128 + pos.y; }\r\n"
-    "struct PerPixelData {\r\n"
-    " float4 position;\r\n"
-    " uint32_t id, flags, laneIndex, laneCount, firstLaneId, sum1;\r\n"
-    " uint32_t id0, id1, id2, id3;\r\n"
-    " uint32_t acrossX, acrossY, acrossDiag, quadActiveCount;\r\n"
-    "};\r\n"
-    "AppendStructuredBuffer<PerPixelData> g_sb : register(u1);\r\n"
-    "float4 PSMain(PSInput input) : SV_TARGET {\r\n"
-    "  uint one = 1;\r\n"
-    "  PerPixelData d;\r\n"
-    "  d.position = input.position;\r\n"
-    "  d.id = pos_to_id(input.position);\r\n"
-    "  d.flags = 0;\r\n"
-    "  if (WaveIsFirstLane()) d.flags |= 1;\r\n"
-    "  d.laneIndex = WaveGetLaneIndex();\r\n"
-    "  d.laneCount = WaveGetLaneCount();\r\n"
-    "  d.firstLaneId = WaveReadLaneFirst(d.id);\r\n"
-    "  d.sum1 = WaveActiveSum(one);\r\n"
-    "  d.id0 = QuadReadLaneAt(d.id, 0);\r\n"
-    "  d.id1 = QuadReadLaneAt(d.id, 1);\r\n"
-    "  d.id2 = QuadReadLaneAt(d.id, 2);\r\n"
-    "  d.id3 = QuadReadLaneAt(d.id, 3);\r\n"
-    "  d.acrossX = QuadReadAcrossX(d.id);\r\n"
-    "  d.acrossY = QuadReadAcrossY(d.id);\r\n"
-    "  d.acrossDiag = QuadReadAcrossDiagonal(d.id);\r\n"
-    "  d.quadActiveCount = one + QuadReadAcrossX(one) + QuadReadAcrossY(one) + QuadReadAcrossDiagonal(one);\r\n"
-    "  g_sb.Append(d);\r\n"
-    "  return 1;\r\n"
-    "};\r\n";
-
-  CComPtr<ID3D12Device> pDevice;
-  CComPtr<ID3D12CommandQueue> pCommandQueue;
-  CComPtr<ID3D12DescriptorHeap> pUavHeap, pRtvHeap;
-  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
-  CComPtr<ID3D12GraphicsCommandList> pCommandList;
-  CComPtr<ID3D12PipelineState> pPSO;
-  CComPtr<ID3D12Resource> pRenderTarget, pReadBuffer;
-  UINT uavDescriptorSize, rtvDescriptorSize;
-  CComPtr<ID3D12Resource> pVertexBuffer;
-  D3D12_VERTEX_BUFFER_VIEW vertexBufferView;
-
-  if (!CreateDevice(&pDevice))
-    return;
-  if (!DoesDeviceSupportWaveOps(pDevice)) {
-    // Optional feature, so it's correct to not support it if declared as such.
-    WEX::Logging::Log::Comment(L"Device does not support wave operations.");
-    return;
-  }
-
-  FenceObj FO;
-  InitFenceObj(pDevice, &FO);
-
-  // Describe and create a UAV descriptor heap.
-  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
-  heapDesc.NumDescriptors = 1;
-  heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
-  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
-  VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
-  uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);
-
-  CreateRtvDescriptorHeap(pDevice, 1, &pRtvHeap, &rtvDescriptorSize);
-  CreateRenderTargetAndReadback(pDevice, pRtvHeap, RTHeight, RTWidth, &pRenderTarget, &pReadBuffer);
-
-  // Create root signature: one UAV.
-  CComPtr<ID3D12RootSignature> pRootSignature;
-  {
-    CD3DX12_DESCRIPTOR_RANGE ranges[1];
-    ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1, 0, 0);
-
-    CD3DX12_ROOT_PARAMETER rootParameters[1];
-    rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
-
-    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
-    rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);
-
-    CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
-  }
-
-  D3D12_INPUT_ELEMENT_DESC elementDesc[] = {
-      {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0,
-       D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}};
-  D3D12_INPUT_LAYOUT_DESC InputLayout = {elementDesc, _countof(elementDesc)};
-  CreateGraphicsPSO(pDevice, &InputLayout, pRootSignature, pShaders, &pPSO);
-
-  CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue, &pCommandAllocator,
-                                    &pCommandList, pPSO);
-
-  // Single triangle covering half the target.
-  Vertex vertices[] = {
-    { { -1.0f,  1.0f, 0.0f } },
-    { {  1.0f,  1.0f, 0.0f } },
-    { { -1.0f, -1.0f, 0.0f } } };
-  const UINT TriangleCount = _countof(vertices) / 3;
-
-  CreateVertexBuffer(pDevice, vertices, &pVertexBuffer, &vertexBufferView);
-
-  bool dxbc = UseDxbc();
-
-  // Set up UAV resource.
-  std::vector<PerPixelData> values;
-  values.resize(RTWidth * RTHeight * 2);
-  UINT valueSizeInBytes = values.size() * sizeof(PerPixelData);
-  memset(values.data(), 0, valueSizeInBytes);
-  CComPtr<ID3D12Resource> pUavResource;
-  CComPtr<ID3D12Resource> pUavReadBuffer;
-  CComPtr<ID3D12Resource> pUploadResource;
-  CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pUavReadBuffer, &pUploadResource);
-
-  // Set up the append counter resource.
-  CComPtr<ID3D12Resource> pUavCounterResource;
-  CComPtr<ID3D12Resource> pReadCounterBuffer;
-  CComPtr<ID3D12Resource> pUploadCounterResource;
-  BYTE zero[sizeof(UINT)] = { 0 };
-  CreateTestUavs(pDevice, pCommandList, zero, sizeof(zero), &pUavCounterResource, &pReadCounterBuffer, &pUploadCounterResource);
-
-  // Close the command list and execute it to perform the GPU setup.
-  pCommandList->Close();
-  ExecuteCommandList(pCommandQueue, pCommandList);
-  WaitForSignal(pCommandQueue, FO);
-  VERIFY_SUCCEEDED(pCommandAllocator->Reset());
-  VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pPSO));
-
-  pCommandList->SetGraphicsRootSignature(pRootSignature);
-  SetDescriptorHeap(pCommandList, pUavHeap);
-  {
-    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
-    uavDesc.Format = DXGI_FORMAT_UNKNOWN;
-    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
-    uavDesc.Buffer.FirstElement = 0;
-    uavDesc.Buffer.NumElements = values.size();
-    uavDesc.Buffer.StructureByteStride = sizeof(PerPixelData);
-    uavDesc.Buffer.CounterOffsetInBytes = 0;
-    uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
-    CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
-    CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
-    pDevice->CreateUnorderedAccessView(pUavResource, pUavCounterResource, &uavDesc, uavHandle);
-    pCommandList->SetGraphicsRootDescriptorTable(0, uavHandleGpu);
-  }
-  RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, TriangleCount, &vertexBufferView, nullptr, pRenderTarget, pReadBuffer);
-  RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
-  RecordTransitionBarrier(pCommandList, pUavCounterResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
-  pCommandList->CopyResource(pUavReadBuffer, pUavResource);
-  pCommandList->CopyResource(pReadCounterBuffer, pUavCounterResource);
-  VERIFY_SUCCEEDED(pCommandList->Close());
-  LogCommentFmt(L"Rendering to %u by %u", RTWidth, RTHeight);
-  ExecuteCommandList(pCommandQueue, pCommandList);
-  WaitForSignal(pCommandQueue, FO);
-  {
-    MappedData data(pReadBuffer, RTWidth * RTHeight * 4);
-    const uint32_t *pPixels = (uint32_t *)data.data();
-    if (SaveImages()) {
-      SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, RTWidth, RTHeight, L"psintrin.bmp");
-    }
-  }
-
-  uint32_t appendCount;
-  {
-    MappedData mappedData(pReadCounterBuffer, sizeof(uint32_t));
-    appendCount = *((uint32_t *)mappedData.data());
-    LogCommentFmt(L"%u elements in append buffer", appendCount);
-  }
-
-  {
-    MappedData mappedData(pUavReadBuffer, values.size());
-    PerPixelData *pData = (PerPixelData *)mappedData.data();
-    memcpy(values.data(), pData, valueSizeInBytes);
-
-    // DXBC is handy to test pipeline setup, but interesting functions are
-    // stubbed out, so there is no point in further validation.
-    if (dxbc)
-      return;
-
-    uint32_t maxActiveLaneCount = 0;
-    uint32_t maxLaneCount = 0;
-    for (uint32_t i = 0; i < appendCount; ++i) {
-      maxActiveLaneCount = std::max(maxActiveLaneCount, values[i].sum1);
-      maxLaneCount = std::max(maxLaneCount, values[i].laneCount);
-    }
-
-    uint32_t peerOfHelperLanes = 0;
-    for (uint32_t i = 0; i < appendCount; ++i) {
-      if (values[i].sum1 != maxActiveLaneCount) {
-        ++peerOfHelperLanes;
-      }
-    }
-
-    LogCommentFmt(
-        L"Found: %u threads. Waves reported up to %u total lanes, up "
-        L"to %u active lanes, and %u threads had helper/inactive lanes.",
-        appendCount, maxLaneCount, maxActiveLaneCount, peerOfHelperLanes);
-
-    // Group threads into quad invocations.
-    uint32_t singlePixelCount = 0;
-    uint32_t multiPixelCount = 0;
-    std::unordered_set<uint32_t> ids;
-    std::multimap<uint32_t, PerPixelData *> idGroups;
-    std::multimap<uint32_t, PerPixelData *> firstIdGroups;
-    for (uint32_t i = 0; i < appendCount; ++i) {
-      ids.insert(values[i].id);
-      idGroups.insert(std::make_pair(values[i].id, &values[i]));
-      firstIdGroups.insert(std::make_pair(values[i].firstLaneId, &values[i]));
-    }
-    for (uint32_t id : ids) {
-      if (idGroups.count(id) == 1)
-        ++singlePixelCount;
-      else
-        ++multiPixelCount;
-    }
-    LogCommentFmt(L"%u pixels were processed by a single thread. %u invocations were for shared pixels.",
-      singlePixelCount, multiPixelCount);
-
-    // Multiple threads may have tried to shade the same pixel.
-    // Where every pixel is distinct, it's very straightforward to validate.
-    {
-      auto cur = firstIdGroups.begin(), end = firstIdGroups.end();
-      while (cur != end) {
-        bool simpleWave = true;
-        uint32_t firstId = (*cur).first;
-        auto groupEnd = cur;
-        while (groupEnd != end && (*groupEnd).first == firstId) {
-          if (idGroups.count((*groupEnd).second->id) > 1)
-            simpleWave = false;
-          ++groupEnd;
-        }
-        if (simpleWave) {
-          // Break the wave into quads.
-          struct QuadData {
-            unsigned count;
-            PerPixelData *data[4];
-          };
-          std::map<uint32_t, QuadData> quads;
-          for (auto i = cur; i != groupEnd; ++i) {
-            uint32_t quadId = (*i).second->id0;
-            auto match = quads.find(quadId);
-            if (match == quads.end()) {
-              QuadData qdata;
-              qdata.count = 1;
-              qdata.data[0] = (*i).second;
-              quads.insert(std::make_pair(quadId, qdata));
-            }
-            else {
-              VERIFY_IS_TRUE((*match).second.count < 4);
-              (*match).second.data[(*match).second.count++] = (*i).second;
-            }
-          }
-          for (auto quadPair : quads) {
-            unsigned count = quadPair.second.count;
-            if (count < 2) continue;
-            PerPixelData **data = quadPair.second.data;
-            bool isTop[4];
-            bool isLeft[4];
-            PerPixelData helperData;
-            memset(&helperData, sizeof(helperData), 0);
-            PerPixelData *layout[4]; // tl,tr,bl,br
-            memset(layout, sizeof(layout), 0);
-            auto fnToLayout = [&](bool top, bool left) -> PerPixelData ** {
-              int idx = top ? 0 : 2;
-              idx += left ? 0 : 1;
-              return &layout[idx];
-            };
-            auto fnToLayoutData = [&](bool top, bool left) -> PerPixelData * {
-              PerPixelData **pResult = fnToLayout(top, left);
-              if (*pResult == nullptr) return &helperData;
-              return *pResult;
-            };
-            VERIFY_IS_TRUE(count <= 4);
-            if (count == 2) {
-              isTop[0] = data[0]->position.y < data[1]->position.y;
-              isTop[1] = (data[0]->position.y == data[1]->position.y) ? isTop[0] : !isTop[0];
-              isLeft[0] = data[0]->position.x < data[1]->position.x;
-              isLeft[1] = (data[0]->position.x == data[1]->position.x) ? isLeft[0] : !isLeft[0];
-            }
-            else {
-              // with at least three samples, we have distinct x and y coordinates.
-              float left = std::min(data[0]->position.x, data[1]->position.x);
-              left = std::min(data[2]->position.x, left);
-              float top = std::min(data[0]->position.y, data[1]->position.y);
-              top = std::min(data[2]->position.y, top);
-              for (unsigned i = 0; i < count; ++i) {
-                isTop[i] = data[i]->position.y == top;
-                isLeft[i] = data[i]->position.x == left;
-              }
-            }
-            for (unsigned i = 0; i < count; ++i) {
-              *(fnToLayout(isTop[i], isLeft[i])) = data[i];
-            }
-
-            // Finally, we have a proper quad reconstructed. Validate.
-            for (unsigned i = 0; i < count; ++i) {
-              PerPixelData *d = data[i];
-              VERIFY_ARE_EQUAL(d->id0, fnToLayoutData(true, true)->id);
-              VERIFY_ARE_EQUAL(d->id1, fnToLayoutData(true, false)->id);
-              VERIFY_ARE_EQUAL(d->id2, fnToLayoutData(false, true)->id);
-              VERIFY_ARE_EQUAL(d->id3, fnToLayoutData(false, false)->id);
-              VERIFY_ARE_EQUAL(d->acrossX, fnToLayoutData(isTop[i], !isLeft[i])->id);
-              VERIFY_ARE_EQUAL(d->acrossY, fnToLayoutData(!isTop[i], isLeft[i])->id);
-              VERIFY_ARE_EQUAL(d->acrossDiag, fnToLayoutData(!isTop[i], !isLeft[i])->id);
-              VERIFY_ARE_EQUAL(d->quadActiveCount, count);
-            }
-          }
-        }
-        cur = groupEnd;
-      }
-    }
-
-    // TODO: provide validation for quads where the same pixel was shaded multiple times
-    //
-    // Consider: for pixels that were shaded multiple times, check whether
-    // some grouping of threads into quads satisfies all value requirements.
-  }
-}
-
-struct ShaderOpTestResult {
-  st::ShaderOp *ShaderOp;
-  std::shared_ptr<st::ShaderOpSet> ShaderOpSet;
-  std::shared_ptr<st::ShaderOpTest> Test;
-};
-
-struct SPrimitives {
-  float f_float;
-  float f_float2;
-  float f_float_o;
-  float f_float2_o;
-};
-
-std::shared_ptr<ShaderOpTestResult>
-RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
-  IStream *pStream, LPCSTR pName,
-  st::ShaderOpTest::TInitCallbackFn pInitCallback, std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
-  DXASSERT_NOMSG(pStream != nullptr);
-  st::ShaderOp *pShaderOp;
-  if (pName == nullptr) {
-    if (ShaderOpSet->ShaderOps.size() != 1) {
-      VERIFY_FAIL(L"Expected a single shader operation.");
-    }
-    pShaderOp = ShaderOpSet->ShaderOps[0].get();
-  }
-  else {
-    pShaderOp = ShaderOpSet->GetShaderOp(pName);
-  }
-  if (pShaderOp == nullptr) {
-    std::string msg = "Unable to find shader op ";
-    msg += pName;
-    msg += "; available ops";
-    const char sep = ':';
-    for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
-      msg += sep;
-      msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
-    }
-    CA2W msgWide(msg.c_str());
-    VERIFY_FAIL(msgWide.m_psz);
-  }
-
-  // This won't actually be used since we're supplying the device,
-  // but let's make it consistent.
-  pShaderOp->UseWarpDevice = GetTestParamUseWARP(true);
-
-  std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
-  test->SetDxcSupport(&support);
-  test->SetInitCallback(pInitCallback);
-  test->SetDevice(pDevice);
-  test->RunShaderOp(pShaderOp);
-
-  std::shared_ptr<ShaderOpTestResult> result =
-      std::make_shared<ShaderOpTestResult>();
-  result->ShaderOpSet = ShaderOpSet;
-  result->Test = test;
-  result->ShaderOp = pShaderOp;
-  return result;
-}
-
-std::shared_ptr<ShaderOpTestResult>
-RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
-                IStream *pStream, LPCSTR pName,
-                st::ShaderOpTest::TInitCallbackFn pInitCallback) {
-  DXASSERT_NOMSG(pStream != nullptr);
-  std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
-        std::make_shared<st::ShaderOpSet>();
-  st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
-  return RunShaderOpTestAfterParse(pDevice, support, pStream, pName, pInitCallback, ShaderOpSet);
-}
-
-TEST_F(ExecutionTest, OutOfBoundsTest) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-  CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-  // Single operation test at the moment.
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-    return;
-
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr);
-  MappedData data;
-  // Read back to CPU and examine contents - should get pure red.
-  {
-    MappedData data;
-    test->Test->GetReadBackData("RTarget", &data);
-    const uint32_t *pPixels = (uint32_t *)data.data();
-    uint32_t first = *pPixels;
-    VERIFY_ARE_EQUAL(0xff0000ff, first); // pure red - only first component is read
-  }
-}
-
-TEST_F(ExecutionTest, SaturateTest) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-  CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-  // Single operation test at the moment.
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-    return;
-
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "Saturate", nullptr);
-  MappedData data;
-  test->Test->GetReadBackData("U0", &data);
-  const float *pValues = (float *)data.data();
-  // Everything is zero except for 1.5f and +Inf, which saturate to 1.0f
-  const float ExpectedCases[9] = {
-    0.0f, 0.0f, 0.0f, 0.0f, // -inf, -1.5, -denorm, -0
-    0.0f, 0.0f, 1.0f, 1.0f, // 0, denorm, 1.5f, inf
-    0.0f                    // nan
-  };
-  for (size_t i = 0; i < _countof(ExpectedCases); ++i) {
-    VERIFY_ARE_EQUAL(*pValues, ExpectedCases[i]);
-    ++pValues;
-  }
-}
-
-TEST_F(ExecutionTest, BasicTriangleOpTest) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-  CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-  // Single operation test at the moment.
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-    return;
-
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "Triangle", nullptr);
-  MappedData data;
-  D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
-  UINT width = (UINT64)D.Width;
-  UINT height = (UINT64)D.Height;
-  test->Test->GetReadBackData("RTarget", &data);
-  const uint32_t *pPixels = (uint32_t *)data.data();
-  if (SaveImages()) {
-    SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, 320, 200, L"basic.bmp");
-  }
-  uint32_t top = pPixels[width / 2]; // Top center.
-  uint32_t mid = pPixels[width / 2 + width * (height / 2)]; // Middle center.
-  VERIFY_ARE_EQUAL(0xff663300, top); // clear color
-  VERIFY_ARE_EQUAL(0xffffffff, mid); // white
-
-  // This is the basic validation test for shader operations, so it's good to
-  // check this here at least for this one test case.
-  data.reset();
-  test.reset();
-  ReportLiveObjects();
-}
-
-// Rendering two right triangles forming a square and assigning a texture value
-// for each pixel to calculate derivates.
-TEST_F(ExecutionTest, PartialDerivTest) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-  CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice))
-      return;
-
-  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "DerivFine", nullptr);
-  MappedData data;
-  D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
-  UINT width = (UINT64)D.Width;
-  UINT height = (UINT64)D.Height;
-  UINT pixelSize = GetByteSizeForFormat(D.Format) / 4;
-
-  test->Test->GetReadBackData("RTarget", &data);
-  const float *pPixels = (float *)data.data();
-
-  UINT centerIndex = (UINT64)width * height / 2 - width / 2;
-
-  // pixel at the center
-  UINT offsetCenter = centerIndex * pixelSize;
-  float CenterDDXFine = pPixels[offsetCenter];
-  float CenterDDYFine = pPixels[offsetCenter + 1];
-  float CenterDDXCoarse = pPixels[offsetCenter + 2];
-  float CenterDDYCoarse = pPixels[offsetCenter + 3];
-
-  LogCommentFmt(
-      L"center  ddx_fine: %8f, ddy_fine: %8f, ddx_coarse: %8f, ddy_coarse: %8f",
-      CenterDDXFine, CenterDDYFine, CenterDDXCoarse, CenterDDYCoarse);
-
-  // The texture for the 9 pixels in the center should look like the following
-
-  // 256   32  64
-  // 2048 256 512
-  // 1   .125 .25
-
-  // In D3D12 there is no guarantee of how the adapter is grouping 2x2 pixels
-  // So for fine derivatives there can be up to two possible results for the center pixel,
-  // while for coarse derivatives there can be up to six possible results.
-  int ulpTolerance = 1;
-  // 512 - 256 or 2048 - 256
-  bool left = CompareFloatULP(CenterDDXFine, -1792.0f, ulpTolerance);
-  VERIFY_IS_TRUE(left || CompareFloatULP(CenterDDXFine, 256.0f, ulpTolerance));
-  // 256 - 32 or 256 - .125
-  bool top = CompareFloatULP(CenterDDYFine, 224.0f, ulpTolerance);
-  VERIFY_IS_TRUE(top || CompareFloatULP(CenterDDYFine, -255.875, ulpTolerance));
-
-  if (top && left) {
-    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -224.0f, ulpTolerance) ||
-                   CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance)) &&
-                   (CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
-                   CompareFloatULP(CenterDDYCoarse, 1792.0f, ulpTolerance)));
-  }
-  else if (top) { // top right quad
-    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance)  ||
-                   CompareFloatULP(CenterDDXCoarse, 32.0f, ulpTolerance))   &&
-                   (CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
-                   CompareFloatULP(CenterDDYCoarse, 448.0f, ulpTolerance)));
-  }
-  else if (left) { // bottom left quad
-    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance) ||
-                   CompareFloatULP(CenterDDXCoarse, -.875f, ulpTolerance))   &&
-                   (CompareFloatULP(CenterDDYCoarse, -2047.0f, ulpTolerance) ||
-                   CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance)));
-  }
-  else { // bottom right
-    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance) ||
-                   CompareFloatULP(CenterDDXCoarse, .125f, ulpTolerance))  &&
-                   (CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance) ||
-                   CompareFloatULP(CenterDDYCoarse, -511.75f, ulpTolerance)));
-  }
-}
-
-// Resource structure for data-driven tests.
-
-struct SUnaryFPOp {
-    float input;
-    float output;
-};
-
-struct SBinaryFPOp {
-    float input1;
-    float input2;
-    float output1;
-    float output2;
-};
-
-struct STertiaryFPOp {
-    float input1;
-    float input2;
-    float input3;
-    float output;
-};
-
-struct SUnaryIntOp {
-    int input;
-    int output;
-};
-
-struct SUnaryUintOp {
-    unsigned int input;
-    unsigned int output;
-};
-
-struct SBinaryIntOp {
-    int input1;
-    int input2;
-    int output1;
-    int output2;
-};
-
-struct STertiaryIntOp {
-    int input1;
-    int input2;
-    int input3;
-    int output;
-};
-
-struct SBinaryUintOp {
-    unsigned int input1;
-    unsigned int input2;
-    unsigned int output1;
-    unsigned int output2;
-};
-
-struct STertiaryUintOp {
-    unsigned int input1;
-    unsigned int input2;
-    unsigned int input3;
-    unsigned int output;
-};
-
-// representation for HLSL float vectors
-struct SDotOp {
-    XMFLOAT4 input1;
-    XMFLOAT4 input2;
-    float o_dot2;
-    float o_dot3;
-    float o_dot4;
-};
-
-struct SMsad4 {
-    unsigned int ref;
-    XMUINT2 src;
-    XMUINT4 accum;
-    XMUINT4 result;
-};
-
-// Parameter representation for taef data-driven tests
-struct TableParameter {
-    LPCWSTR m_name;
-    enum TableParameterType {
-        INT,
-        UINT,
-        DOUBLE,
-        STRING,
-        BOOL,
-        INT_TABLE,
-        DOUBLE_TABLE,
-        STRING_TABLE,
-        UINT_TABLE,
-        BOOL_TABLE
-    };
-    TableParameterType m_type;
-    bool m_required; // required parameter
-    int m_int;
-    unsigned int m_uint;
-    double m_double;
-    bool m_bool;
-    WEX::Common::String m_str;
-    WEX::TestExecution::TestDataArray<int> m_intTable;
-    WEX::TestExecution::TestDataArray<unsigned int> m_uintTable;
-    WEX::TestExecution::TestDataArray<double> m_doubleTable;
-    WEX::TestExecution::TestDataArray<bool> m_boolTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> m_StringTable;
-};
-
-class TableParameterHandler {
-public:
-  TableParameter* m_table;
-  size_t m_tableSize;
-
-  TableParameterHandler(TableParameter *pTable, size_t size) : m_table(pTable), m_tableSize(size) {}
-
-  TableParameter* GetTableParamByName(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &m_table[i];
-      }
-    }
-    DXASSERT(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  void clearTableParameter() {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      m_table[i].m_int = 0;
-      m_table[i].m_uint = 0;
-      m_table[i].m_double = 0;
-      m_table[i].m_bool = false;
-      m_table[i].m_str = WEX::Common::String();
-    }
-  }
-
-  template <class T1>
-  WEX::TestExecution::TestDataArray<T1> *GetDataArray(LPCWSTR name) {
-    return nullptr;
-  }
-
-  template <>
-  WEX::TestExecution::TestDataArray<int> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_intTable);
-      }
-    }
-    DXASSERT(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <>
-  WEX::TestExecution::TestDataArray<unsigned int> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_uintTable);
-      }
-    }
-    DXASSERT(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <>
-  WEX::TestExecution::TestDataArray<double> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_doubleTable);
-      }
-    }
-    DXASSERT(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-
-  template <>
-  WEX::TestExecution::TestDataArray<bool> *GetDataArray(LPCWSTR name) {
-    for (size_t i = 0; i < m_tableSize; ++i) {
-      if (_wcsicmp(name, m_table[i].m_name) == 0) {
-        return &(m_table[i].m_boolTable);
-      }
-    }
-    DXASSERT(false, "Invalid Table Parameter Name %s", name);
-    return nullptr;
-  }
-};
-
-static TableParameter UnaryFPOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected", TableParameter::STRING_TABLE, true },
-    { L"Validation.Type", TableParameter::STRING, true },
-    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
-    { L"Validation.NumInput", TableParameter::UINT, true },
-    { L"Warp.Version", TableParameter::UINT, false }
-};
-
-static TableParameter BinaryFPOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::STRING_TABLE, true },
-    { L"Validation.Input2", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected1", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected2", TableParameter::STRING_TABLE, true },
-    { L"Validation.Type", TableParameter::STRING, true },
-    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
-    { L"Validation.NumInput", TableParameter::UINT, true }
-};
-
-static TableParameter TertiaryFPOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::STRING_TABLE, true },
-    { L"Validation.Input2", TableParameter::STRING_TABLE, true },
-    { L"Validation.Input3", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected", TableParameter::STRING_TABLE, true },
-    { L"Validation.Type", TableParameter::STRING, true },
-    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
-    { L"Validation.NumInput", TableParameter::UINT, true }
-};
-
-static TableParameter UnaryIntOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input", TableParameter::INT_TABLE, true },
-    { L"Validation.Expected", TableParameter::INT_TABLE, true },
-    { L"Validation.Tolerance", TableParameter::INT, true },
-    { L"Validation.NumInput", TableParameter::UINT, true }
-};
-
-static TableParameter UnaryUintOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input", TableParameter::UINT_TABLE, true },
-    { L"Validation.Expected", TableParameter::UINT_TABLE, true },
-    { L"Validation.Tolerance", TableParameter::INT, true },
-    { L"Validation.NumInput", TableParameter::UINT, true }
-};
-
-static TableParameter BinaryIntOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::INT_TABLE, true },
-    { L"Validation.Input2", TableParameter::INT_TABLE, true },
-    { L"Validation.Expected1", TableParameter::INT_TABLE, true },
-    { L"Validation.Expected2", TableParameter::INT_TABLE, false },
-    { L"Validation.Tolerance", TableParameter::INT, true },
-    { L"Validation.NumInput", TableParameter::UINT, true },
-    { L"Validation.NumExpected", TableParameter::INT, true }
-};
-
-static TableParameter TertiaryIntOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::INT_TABLE, true },
-    { L"Validation.Input2", TableParameter::INT_TABLE, true },
-    { L"Validation.Input3", TableParameter::INT_TABLE, true },
-    { L"Validation.Expected", TableParameter::INT_TABLE, true },
-    { L"Validation.Tolerance", TableParameter::INT, true },
-    { L"Validation.NumInput", TableParameter::UINT, true }
-};
-
-static TableParameter BinaryUintOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::UINT_TABLE, true },
-    { L"Validation.Input2", TableParameter::UINT_TABLE, true },
-    { L"Validation.Expected1", TableParameter::UINT_TABLE, true },
-    { L"Validation.Expected2", TableParameter::UINT_TABLE, false },
-    { L"Validation.Tolerance", TableParameter::INT, true },
-
-    { L"Validation.NumInput", TableParameter::UINT, true },
-    { L"Validation.NumExpected", TableParameter::INT, true },
-};
-
-static TableParameter TertiaryUintOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::UINT_TABLE, true },
-    { L"Validation.Input2", TableParameter::UINT_TABLE, true },
-    { L"Validation.Input3", TableParameter::UINT_TABLE, true },
-    { L"Validation.Expected", TableParameter::UINT_TABLE, true },
-    { L"Validation.Tolerance", TableParameter::INT, true },
-    { L"Validation.NumInput", TableParameter::UINT, true }
-};
-
-static TableParameter DotOpParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Target", TableParameter::STRING, true },
-    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Input1", TableParameter::STRING_TABLE, true },
-    { L"Validation.Input2", TableParameter::STRING_TABLE, true },
-    { L"Validation.dot2", TableParameter::STRING_TABLE, true },
-    { L"Validation.dot3", TableParameter::STRING_TABLE, true },
-    { L"Validation.dot4", TableParameter::STRING_TABLE, true },
-    { L"Validation.Type", TableParameter::STRING, true },
-    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
-    { L"Validation.NumInput", TableParameter::UINT, true }
-};
-
-static TableParameter Msad4OpParameters[] = {
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
-    { L"Validation.NumInput", TableParameter::UINT, true },
-    { L"Validation.Reference", TableParameter::UINT_TABLE, true},
-    { L"Validation.Source", TableParameter::STRING_TABLE, true },
-    { L"Validation.Accum", TableParameter::STRING_TABLE, true },
-    { L"Validation.Expected", TableParameter::STRING_TABLE, true }
-};
-
-static TableParameter WaveIntrinsicsActiveIntParameters[] = {
-    { L"ShaderOp.Name", TableParameter::STRING, true },
-    { L"ShaderOp.Text", TableParameter::STRING, true },
-    { L"Validation.NumInputSet", TableParameter::UINT, true },
-    { L"Validation.InputSet1", TableParameter::INT_TABLE, true },
-    { L"Validation.InputSet2", TableParameter::INT_TABLE, false },
-    { L"Validation.InputSet3", TableParameter::INT_TABLE, false },
-    { L"Validation.InputSet4", TableParameter::INT_TABLE, false }
-};
-
-static TableParameter WaveIntrinsicsPrefixIntParameters[] = {
-  { L"ShaderOp.Name", TableParameter::STRING, true },
-  { L"ShaderOp.Text", TableParameter::STRING, true },
-  { L"Validation.NumInputSet", TableParameter::UINT, true },
-  { L"Validation.InputSet1", TableParameter::INT_TABLE, true },
-  { L"Validation.InputSet2", TableParameter::INT_TABLE, false },
-  { L"Validation.InputSet3", TableParameter::INT_TABLE, false },
-  { L"Validation.InputSet4", TableParameter::INT_TABLE, false }
-};
-
-static TableParameter WaveIntrinsicsActiveUintParameters[] = {
-  { L"ShaderOp.Name", TableParameter::STRING, true },
-  { L"ShaderOp.Text", TableParameter::STRING, true },
-  { L"Validation.NumInputSet", TableParameter::UINT, true },
-  { L"Validation.InputSet1", TableParameter::UINT_TABLE, true },
-  { L"Validation.InputSet2", TableParameter::UINT_TABLE, false },
-  { L"Validation.InputSet3", TableParameter::UINT_TABLE, false },
-  { L"Validation.InputSet4", TableParameter::UINT_TABLE, false }
-};
-
-static TableParameter WaveIntrinsicsPrefixUintParameters[] = {
-  { L"ShaderOp.Name", TableParameter::STRING, true },
-  { L"ShaderOp.Text", TableParameter::STRING, true },
-  { L"Validation.NumInputSet", TableParameter::UINT, true },
-  { L"Validation.InputSet1", TableParameter::UINT_TABLE, true },
-  { L"Validation.InputSet2", TableParameter::UINT_TABLE, false },
-  { L"Validation.InputSet3", TableParameter::UINT_TABLE, false },
-  { L"Validation.InputSet4", TableParameter::UINT_TABLE, false }
-};
-
-static TableParameter WaveIntrinsicsActiveBoolParameters[] = {
-  { L"ShaderOp.Name", TableParameter::STRING, true },
-  { L"ShaderOp.Text", TableParameter::STRING, true },
-  { L"Validation.NumInputSet", TableParameter::UINT, true },
-  { L"Validation.InputSet1", TableParameter::BOOL_TABLE, true },
-  { L"Validation.InputSet2", TableParameter::BOOL_TABLE, false },
-  { L"Validation.InputSet3", TableParameter::BOOL_TABLE, false },
-};
-
-static HRESULT ParseDataToFloat(PCWSTR str, float &value) {
-  std::wstring wString(str);
-  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
-  PCWSTR wstr = wString.data();
-  if (_wcsicmp(wstr, L"NaN") == 0) {
-    value = NAN;
-  } else if (_wcsicmp(wstr, L"-inf") == 0) {
-    value = -(INFINITY);
-  } else if (_wcsicmp(wstr, L"inf") == 0) {
-    value = INFINITY;
-  } else if (_wcsicmp(wstr, L"-denorm") == 0) {
-    value = -(FLT_MIN / 2);
-  } else if (_wcsicmp(wstr, L"denorm") == 0) {
-    value = FLT_MIN / 2;
-  } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 ||
-             _wcsicmp(wstr, L"-0") == 0) {
-    value = -0.0f;
-  } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 ||
-             _wcsicmp(wstr, L"0") == 0) {
-    value = 0.0f;
-  } else {
-    // evaluate the expression of wstring
-    double val = _wtof(wstr);
-    if (val == 0) {
-      LogErrorFmt(L"Failed to parse parameter %s to float", wstr);
-      return E_FAIL;
-    }
-    value = val;
-  }
-  return S_OK;
-}
-
-static HRESULT ParseDataToInt(PCWSTR str, int &value) {
-  std::wstring wString(str);
-  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
-  PCWSTR wstr = wString.data();
-  // evaluate the expression of string
-  if (_wcsicmp(wstr, L"0.0") == 0 || _wcsicmp(wstr, L"0") == 0) {
-      value = 0;
-      return S_OK;
-  }
-  int val = _wtoi(wstr);
-  if (val == 0) {
-      LogErrorFmt(L"Failed to parse parameter %s to int", wstr);
-      return E_FAIL;
-  }
-  value = val;
-  return S_OK;
-}
-
-static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) {
-    std::wstring wString(str);
-    wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
-    PCWSTR wstr = wString.data();
-    // evaluate the expression of string
-    if (_wcsicmp(wstr, L"0") == 0 || _wcsicmp(wstr, L"0x00000000") == 0) {
-        value = 0;
-        return S_OK;
-    }
-    wchar_t *end;
-    unsigned int val = std::wcstoul(wstr, &end, 0);
-    if (val == 0) {
-        LogErrorFmt(L"Failed to parse parameter %s to int", wstr);
-        return E_FAIL;
-    }
-    value = val;
-    return S_OK;
-}
-
-static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) {
-    std::wstring wstr(str);
-    size_t curPosition = 0;
-    // parse a string of dot product separated by commas
-    for (size_t i = 0; i < count; ++i) {
-        size_t nextPosition = wstr.find(L",", curPosition);
-        if (FAILED(ParseDataToFloat(
-            wstr.substr(curPosition, nextPosition - curPosition).data(),
-            *(ptr + i)))) {
-            return E_FAIL;
-        }
-        curPosition = nextPosition + 1;
-    }
-    return S_OK;
-}
-
-static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr, size_t count) {
-    std::wstring wstr(str);
-    size_t curPosition = 0;
-    // parse a string of dot product separated by commas
-    for (size_t i = 0; i < count; ++i) {
-        size_t nextPosition = wstr.find(L",", curPosition);
-        if (FAILED(ParseDataToUint(
-            wstr.substr(curPosition, nextPosition - curPosition).data(),
-            *(ptr + i)))) {
-            return E_FAIL;
-        }
-        curPosition = nextPosition + 1;
-    }
-    return S_OK;
-}
-
-static HRESULT ParseTableRow(TableParameter *table, unsigned int size) {
-  for (unsigned int i = 0; i < size; ++i) {
-    switch (table[i].m_type) {
-    case TableParameter::INT:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_int)) && table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::UINT:
-        if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-            table[i].m_uint)) && table[i].m_required) {
-            LogErrorFmt(L"Failed to get %s", table[i].m_name);
-            return E_FAIL;
-        }
-        break;
-    case TableParameter::DOUBLE:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(
-              table[i].m_name, table[i].m_double)) && table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::STRING:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-                                                           table[i].m_str)) && table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::BOOL:
-        if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
-            table[i].m_str)) && table[i].m_bool) {
-            LogErrorFmt(L"Failed to get %s", table[i].m_name);
-            return E_FAIL;
-        }
-        break;
-    case TableParameter::INT_TABLE:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(
-              table[i].m_name, table[i].m_intTable)) && table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::UINT_TABLE:
-        if (FAILED(WEX::TestExecution::TestData::TryGetValue(
-            table[i].m_name, table[i].m_uintTable)) && table[i].m_required) {
-            LogErrorFmt(L"Failed to get %s", table[i].m_name);
-            return E_FAIL;
-        }
-        break;
-    case TableParameter::DOUBLE_TABLE:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(
-              table[i].m_name, table[i].m_doubleTable)) && table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::BOOL_TABLE:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(
-        table[i].m_name, table[i].m_boolTable)) && table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    case TableParameter::STRING_TABLE:
-      if (FAILED(WEX::TestExecution::TestData::TryGetValue(
-              table[i].m_name, table[i].m_StringTable)) && table[i].m_required) {
-        LogErrorFmt(L"Failed to get %s", table[i].m_name);
-        return E_FAIL;
-      }
-      break;
-    default:
-      DXASSERT_NOMSG("Invalid Parameter Type");
-    }
-  }
-  return S_OK;
-}
-
-static void VerifyOutputWithExpectedValueInt(int output, int ref, int tolerance) {
-    VERIFY_IS_TRUE(output - ref <= tolerance && ref - output <= tolerance);
-}
-
-static void VerifyOutputWithExpectedValueFloat(float output, float ref, LPCWSTR type, double tolerance) {
-    if (_wcsicmp(type, L"Relative") == 0) {
-        VERIFY_IS_TRUE(CompareFloatRelativeEpsilon(output, ref, tolerance));
-    }
-    else if (_wcsicmp(type, L"Epsilon") == 0) {
-        VERIFY_IS_TRUE(CompareFloatEpsilon(output, ref, tolerance));
-    }
-    else if (_wcsicmp(type, L"ULP") == 0) {
-        VERIFY_IS_TRUE(CompareFloatULP(output, ref, (int)tolerance));
-    }
-    else {
-        LogErrorFmt(L"Failed to read comparison type %S", type);
-    }
-}
-
-TEST_F(ExecutionTest, UnaryFloatOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-      return;
-    }
-    // Read data from the table
-    int tableSize = sizeof(UnaryFPOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(UnaryFPOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(UnaryFPOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    unsigned int WarpVersion = handler.GetTableParamByName(L"Warp.Version")->m_uint;
-    if (GetTestParamUseWARP(true) && !IsValidWarpDllVersion(WarpVersion)) {
-        return;
-    }
-
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input =
-        &(handler.GetTableParamByName(L"Validation.Input")->m_StringTable);
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Expected =
-        &(handler.GetTableParamByName(L"Validation.Expected")->m_StringTable);
-
-    LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
-    double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
-
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "UnaryFPOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-          VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryFPOp"));
-          size_t size = sizeof(SUnaryFPOp) * count;
-          Data.resize(size);
-          SUnaryFPOp *pPrimitives = (SUnaryFPOp *)Data.data();
-          for (size_t i = 0; i < count; ++i) {
-            SUnaryFPOp *p = &pPrimitives[i];
-            PCWSTR str = (*Validation_Input)[i % Validation_Input->GetSize()];
-            float val;
-            VERIFY_SUCCEEDED(ParseDataToFloat(str, val));
-            p->input = val;
-          }
-          // use shader from data table
-          pShaderOp->Shaders.at(0).Target = shader.Target;
-          pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-          pShaderOp->Shaders.at(0).Text = shader.Text;
-        });
-
-    MappedData data;
-    test->Test->GetReadBackData("SUnaryFPOp", &data);
-
-    SUnaryFPOp *pPrimitives = (SUnaryFPOp*)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    for (unsigned i = 0; i < count; ++i) {
-        SUnaryFPOp *p = &pPrimitives[i];
-        LPCWSTR str = (*Validation_Expected)[i % Validation_Expected->GetSize()];
-        float val;
-        VERIFY_SUCCEEDED(ParseDataToFloat(str, val));
-        LogCommentFmt(
-            L"element #%u, input = %10f, output = %10f, expected = %10f", i,
-            p->input, p->output, val);
-        VerifyOutputWithExpectedValueFloat(p->output, val, Validation_Type, Validation_Tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, BinaryFloatOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    // Read data from the table
-    int tableSize = sizeof(BinaryFPOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(BinaryFPOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(BinaryFPOpParameters, tableSize));
-
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input1 =
-        &(handler.GetTableParamByName(L"Validation.Input1")->m_StringTable);
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input2 =
-        &(handler.GetTableParamByName(L"Validation.Input2")->m_StringTable);
-
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Expected1 =
-        &(handler.GetTableParamByName(L"Validation.Expected1")->m_StringTable);
-
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Expected2 =
-        &(handler.GetTableParamByName(L"Validation.Expected2")->m_StringTable);
-
-    LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
-    double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "BinaryFPOp", 
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryFPOp"));
-        size_t size = sizeof(SBinaryFPOp) * count;
-        Data.resize(size);
-        SBinaryFPOp *pPrimitives = (SBinaryFPOp *)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            SBinaryFPOp *p = &pPrimitives[i];
-            PCWSTR str1 = (*Validation_Input1)[i % Validation_Input1->GetSize()];
-            PCWSTR str2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
-            float val1, val2;
-            VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
-            VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
-            p->input1 = val1;
-            p->input2 = val2;
-        }
-
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Target = shader.Target;
-        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-        pShaderOp->Shaders.at(0).Text = shader.Text;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("SBinaryFPOp", &data);
-
-    SBinaryFPOp *pPrimitives = (SBinaryFPOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-
-    for (unsigned i = 0; i < count; ++i) {
-        SBinaryFPOp *p = &pPrimitives[i];
-        LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
-        LPCWSTR str2 = (*Validation_Expected2)[i % Validation_Expected2->GetSize()];
-        float val1, val2;
-        VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
-        VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
-        LogCommentFmt(L"element #%u, input1 = %10f, input2 = %10f, output1 = "
-            L"%10f, expected1 = %10f, output2 = %10f, expected2 = %10f",
-            i, p->input1, p->input2, p->output1, val1, p->output2,
-            val2);
-        VerifyOutputWithExpectedValueFloat(p->output1, val1, Validation_Type,
-            Validation_Tolerance);
-        VerifyOutputWithExpectedValueFloat(p->output2, val2, Validation_Type,
-            Validation_Tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, TertiaryFloatOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    // Read data from the table
-    
-    int tableSize = sizeof(TertiaryFPOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(TertiaryFPOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(TertiaryFPOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input1 =
-        &(handler.GetTableParamByName(L"Validation.Input1")->m_StringTable);
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input2 =
-        &(handler.GetTableParamByName(L"Validation.Input2")->m_StringTable);
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input3 =
-        &(handler.GetTableParamByName(L"Validation.Input3")->m_StringTable);
-
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Expected =
-        &(handler.GetTableParamByName(L"Validation.Expected")->m_StringTable);
-
-    LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
-    double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "TertiaryFPOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryFPOp"));
-        size_t size = sizeof(STertiaryFPOp) * count;
-        Data.resize(size);
-        STertiaryFPOp *pPrimitives = (STertiaryFPOp *)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            STertiaryFPOp *p = &pPrimitives[i];
-            PCWSTR str1 = (*Validation_Input1)[i % Validation_Input1->GetSize()];
-            PCWSTR str2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
-            PCWSTR str3 = (*Validation_Input3)[i % Validation_Input3->GetSize()];
-            float val1, val2, val3;
-            VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
-            VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
-            VERIFY_SUCCEEDED(ParseDataToFloat(str3, val3));
-            p->input1 = val1;
-            p->input2 = val2;
-            p->input3 = val3;
-        }
-
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Target = shader.Target;
-        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-        pShaderOp->Shaders.at(0).Text = shader.Text;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("STertiaryFPOp", &data);
-
-    STertiaryFPOp *pPrimitives = (STertiaryFPOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-
-    for (unsigned i = 0; i < count; ++i) {
-      STertiaryFPOp *p = &pPrimitives[i];
-      LPCWSTR str = (*Validation_Expected)[i % Validation_Expected->GetSize()];
-      float val;
-      VERIFY_SUCCEEDED(ParseDataToFloat(str, val));
-      LogCommentFmt(L"element #%u, input1 = %10f, input2 = %10f, input3 = %10f, output1 = "
-                    L"%10f, expected = %10f",
-                    i, p->input1, p->input2, p->input3, p->output, val);
-      VerifyOutputWithExpectedValueFloat(p->output, val, Validation_Type,
-                               Validation_Tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, UnaryIntOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    // Read data from the table
-
-    int tableSize = sizeof(UnaryIntOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(UnaryIntOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(UnaryIntOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    WEX::TestExecution::TestDataArray<int> *Validation_Input =
-        &handler.GetTableParamByName(L"Validation.Input")->m_intTable;
-    WEX::TestExecution::TestDataArray<int> *Validation_Expected =
-        &handler.GetTableParamByName(L"Validation.Expected")->m_intTable;
-    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "UnaryIntOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-          VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryIntOp"));
-          size_t size = sizeof(SUnaryIntOp) * count;
-          Data.resize(size);
-          SUnaryIntOp *pPrimitives = (SUnaryIntOp *)Data.data();
-          for (size_t i = 0; i < count; ++i) {
-            SUnaryIntOp *p = &pPrimitives[i];
-            int val = (*Validation_Input)[i % Validation_Input->GetSize()];
-            p->input = val;
-          }
-          // use shader data table
-          pShaderOp->Shaders.at(0).Target = shader.Target;
-          pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-          pShaderOp->Shaders.at(0).Text = shader.Text;
-        });
-
-    MappedData data;
-    test->Test->GetReadBackData("SUnaryIntOp", &data);
-
-    SUnaryIntOp *pPrimitives = (SUnaryIntOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    for (unsigned i = 0; i < count; ++i) {
-      SUnaryIntOp *p = &pPrimitives[i];
-      int val = (*Validation_Expected)[i % Validation_Expected->GetSize()];
-      LogCommentFmt(L"element #%u, input = %11i(0x%08x), output = %11i(0x%08x), "
-                    L"expected = %11i(0x%08x)",
-                    i, p->input, p->input, p->output, p->output, val, val);
-      VerifyOutputWithExpectedValueInt(p->output, val, Validation_Tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, UnaryUintOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    // Read data from the table
-
-    int tableSize = sizeof(UnaryUintOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(UnaryUintOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(UnaryUintOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input =
-        &handler.GetTableParamByName(L"Validation.Input")->m_uintTable;
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Expected =
-        &handler.GetTableParamByName(L"Validation.Expected")->m_uintTable;
-    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "UnaryUintOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryUintOp"));
-        size_t size = sizeof(SUnaryUintOp) * count;
-        Data.resize(size);
-        SUnaryUintOp *pPrimitives = (SUnaryUintOp *)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            SUnaryUintOp *p = &pPrimitives[i];
-            unsigned int val = (*Validation_Input)[i % Validation_Input->GetSize()];
-            p->input = val;
-        }
-        // use shader data table
-        pShaderOp->Shaders.at(0).Target = shader.Target;
-        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-        pShaderOp->Shaders.at(0).Text = shader.Text;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("SUnaryUintOp", &data);
-
-    SUnaryUintOp *pPrimitives = (SUnaryUintOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    for (unsigned i = 0; i < count; ++i) {
-        SUnaryUintOp *p = &pPrimitives[i];
-        unsigned int val = (*Validation_Expected)[i % Validation_Expected->GetSize()];
-        LogCommentFmt(L"element #%u, input = %11u(0x%08x), output = %11u(0x%08x), "
-            L"expected = %11u(0x%08x)",
-            i, p->input, p->input, p->output, p->output, val, val);
-        VerifyOutputWithExpectedValueInt(p->output, val, Validation_Tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, BinaryIntOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-      return;
-    }
-    // Read data from the table
-    size_t tableSize = sizeof(BinaryIntOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(BinaryIntOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(BinaryIntOpParameters,tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    int numExpected = handler.GetTableParamByName(L"Validation.NumExpected")->m_int;
-
-    WEX::TestExecution::TestDataArray<int> *Validation_Input1 =
-        &handler.GetTableParamByName(L"Validation.Input1")->m_intTable;
-    WEX::TestExecution::TestDataArray<int> *Validation_Input2 =
-        &handler.GetTableParamByName(L"Validation.Input2")->m_intTable;
-    WEX::TestExecution::TestDataArray<int> *Validation_Expected1 =
-        &handler.GetTableParamByName(L"Validation.Expected1")->m_intTable;
-    WEX::TestExecution::TestDataArray<int> *Validation_Expected2 =
-        &handler.GetTableParamByName(L"Validation.Expected2")->m_intTable;
-    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "BinaryIntOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-          VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryIntOp"));
-          size_t size = sizeof(SBinaryIntOp) * count;
-          Data.resize(size);
-          SBinaryIntOp *pPrimitives = (SBinaryIntOp *)Data.data();
-          for (size_t i = 0; i < count; ++i) {
-            SBinaryIntOp *p = &pPrimitives[i];
-            int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()];
-            int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
-            p->input1 = val1;
-            p->input2 = val2;
-          }
-
-          // use shader from data table
-          pShaderOp->Shaders.at(0).Target = shader.Target;
-          pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-          pShaderOp->Shaders.at(0).Text = shader.Text;
-        });
-
-    MappedData data;
-    test->Test->GetReadBackData("SBinaryIntOp", &data);
-
-    SBinaryIntOp *pPrimitives = (SBinaryIntOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-
-    if (numExpected == 2) {
-        for (unsigned i = 0; i < count; ++i) {
-            SBinaryIntOp *p = &pPrimitives[i];
-            int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
-            int val2 = (*Validation_Expected2)[i % Validation_Expected2->GetSize()];
-            LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
-                L"%11i(0x%08x), output1 = "
-                L"%11i(0x%08x), expected1 = %11i(0x%08x), output2 = "
-                L"%11i(0x%08x), expected2 = %11i(0x%08x)",
-                i, p->input1, p->input1, p->input2, p->input2, p->output1,
-                p->output1, val1, val1, p->output2, p->output2, val2,
-                val2);
-            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
-            VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
-        }
-    }
-    else if (numExpected == 1) {
-        for (unsigned i = 0; i < count; ++i) {
-            SBinaryIntOp *p = &pPrimitives[i];
-            int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
-            LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
-                          L"%11i(0x%08x), output = "
-                          L"%11i(0x%08x), expected = %11i(0x%08x)", i,
-                          p->input1, p->input1, p->input2, p->input2,
-                          p->output1, p->output1, val1, val1);
-            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
-        }
-    }
-    else {
-        LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
-    }
-}
-
-TEST_F(ExecutionTest, TertiaryIntOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    // Read data from the table
-    size_t tableSize = sizeof(TertiaryIntOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(TertiaryIntOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(TertiaryIntOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    WEX::TestExecution::TestDataArray<int> *Validation_Input1 =
-        &handler.GetTableParamByName(L"Validation.Input1")->m_intTable;
-    WEX::TestExecution::TestDataArray<int> *Validation_Input2 =
-        &handler.GetTableParamByName(L"Validation.Input2")->m_intTable;
-    WEX::TestExecution::TestDataArray<int> *Validation_Input3 =
-        &handler.GetTableParamByName(L"Validation.Input3")->m_intTable;
-    WEX::TestExecution::TestDataArray<int> *Validation_Expected =
-        &handler.GetTableParamByName(L"Validation.Expected")->m_intTable;
-    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "TertiaryIntOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryIntOp"));
-        size_t size = sizeof(STertiaryIntOp) * count;
-        Data.resize(size);
-        STertiaryIntOp *pPrimitives = (STertiaryIntOp *)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            STertiaryIntOp *p = &pPrimitives[i];
-            int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()];
-            int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
-            int val3 = (*Validation_Input3)[i % Validation_Input3->GetSize()];
-            p->input1 = val1;
-            p->input2 = val2;
-            p->input3 = val3;
-        }
-
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Target = shader.Target;
-        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-        pShaderOp->Shaders.at(0).Text = shader.Text;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("STertiaryIntOp", &data);
-
-    STertiaryIntOp *pPrimitives = (STertiaryIntOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    for (unsigned i = 0; i < count; ++i) {
-        STertiaryIntOp *p = &pPrimitives[i];
-        int val1 = (*Validation_Expected)[i % Validation_Expected->GetSize()];
-        LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
-            L"%11i(0x%08x), input3= %11i(0x%08x), output = "
-            L"%11i(0x%08x), expected = %11i(0x%08x)",
-            i, p->input1, p->input1, p->input2, p->input2,
-            p->input3, p->input3, p->output, p->output, val1,
-            val1);
-        VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, BinaryUintOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    // Read data from the table
-    size_t tableSize = sizeof(BinaryUintOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(BinaryUintOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(BinaryUintOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    int numExpected = handler.GetTableParamByName(L"Validation.NumExpected")->m_int;
-
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input1 =
-        &handler.GetTableParamByName(L"Validation.Input1")->m_uintTable;
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input2 =
-        &handler.GetTableParamByName(L"Validation.Input2")->m_uintTable;
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Expected1 =
-        &handler.GetTableParamByName(L"Validation.Expected1")->m_uintTable;
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Expected2 =
-        &handler.GetTableParamByName(L"Validation.Expected2")->m_uintTable;
-    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "BinaryUintOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryUintOp"));
-        size_t size = sizeof(SBinaryUintOp) * count;
-        Data.resize(size);
-        SBinaryUintOp *pPrimitives = (SBinaryUintOp *)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            SBinaryUintOp *p = &pPrimitives[i];
-            unsigned int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()];
-            unsigned int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
-            p->input1 = val1;
-            p->input2 = val2;
-        }
-
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Target = shader.Target;
-        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-        pShaderOp->Shaders.at(0).Text = shader.Text;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("SBinaryUintOp", &data);
-
-    SBinaryUintOp *pPrimitives = (SBinaryUintOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    if (numExpected == 2) {
-        for (unsigned i = 0; i < count; ++i) {
-            SBinaryUintOp *p = &pPrimitives[i];
-            unsigned int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
-            unsigned int val2 = (*Validation_Expected2)[i % Validation_Expected2->GetSize()];
-            LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
-                L"%11u(0x%08x), output1 = "
-                L"%11u(0x%08x), expected1 = %11u(0x%08x), output2 = "
-                L"%11u(0x%08x), expected2 = %11u(0x%08x)",
-                i, p->input1, p->input1, p->input2, p->input2, p->output1,
-                p->output1, val1, val1, p->output2, p->output2, val2,
-                val2);
-            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
-            VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
-        }
-    }
-    else if (numExpected == 1) {
-        for (unsigned i = 0; i < count; ++i) {
-            SBinaryUintOp *p = &pPrimitives[i];
-            unsigned int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
-            LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
-                L"%11u(0x%08x), output = "
-                L"%11u(0x%08x), expected = %11u(0x%08x)", i,
-                p->input1, p->input1, p->input2, p->input2,
-                p->output1, p->output1, val1, val1);
-            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
-        }
-    }
-    else {
-        LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
-    }
-}
-
-TEST_F(ExecutionTest, TertiaryUintOpTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    // Read data from the table
-    size_t tableSize = sizeof(TertiaryUintOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(TertiaryUintOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(TertiaryUintOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input1 =
-        &handler.GetTableParamByName(L"Validation.Input1")->m_uintTable;
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input2 =
-        &handler.GetTableParamByName(L"Validation.Input2")->m_uintTable;
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input3 =
-        &handler.GetTableParamByName(L"Validation.Input3")->m_uintTable;
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Expected =
-        &handler.GetTableParamByName(L"Validation.Expected")->m_uintTable;
-    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
-    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "TertiaryUintOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryUintOp"));
-        size_t size = sizeof(STertiaryUintOp) * count;
-        Data.resize(size);
-        STertiaryUintOp *pPrimitives = (STertiaryUintOp *)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            STertiaryUintOp *p = &pPrimitives[i];
-            unsigned int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()];
-            unsigned int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
-            unsigned int val3 = (*Validation_Input3)[i % Validation_Input3->GetSize()];
-            p->input1 = val1;
-            p->input2 = val2;
-            p->input3 = val3;
-        }
-
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Target = shader.Target;
-        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-        pShaderOp->Shaders.at(0).Text = shader.Text;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("STertiaryUintOp", &data);
-
-    STertiaryUintOp *pPrimitives = (STertiaryUintOp *)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    for (unsigned i = 0; i < count; ++i) {
-        STertiaryUintOp *p = &pPrimitives[i];
-        unsigned int val1 = (*Validation_Expected)[i % Validation_Expected->GetSize()];
-        LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
-            L"%11u(0x%08x), input3 = %11u(0x%08x), output = "
-            L"%11u(0x%08x), expected = %11u(0x%08x)", i,
-            p->input1, p->input1, p->input2, p->input2, p->input3, p->input3,
-            p->output, p->output, val1, val1);
-        VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, DotTest) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-
-    int tableSize = sizeof(DotOpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(DotOpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(DotOpParameters, tableSize));
-
-    st::ShaderOpShader shader;
-
-    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
-    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    shader.Name = Name.m_psz;
-    shader.Target = Target.m_psz;
-    shader.EntryPoint = EntryPoint.m_psz;
-    shader.Text = Text.m_psz;
-
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input1 =
-        &handler.GetTableParamByName(L"Validation.Input1")->m_StringTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input2 =
-        &handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_dot2 =
-        &handler.GetTableParamByName(L"Validation.dot2")->m_StringTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_dot3 =
-        &handler.GetTableParamByName(L"Validation.dot3")->m_StringTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_dot4 =
-        &handler.GetTableParamByName(L"Validation.dot4")->m_StringTable;
-
-    PCWSTR Validation_type = handler.GetTableParamByName(L"Validation.Type")->m_str;
-    double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
-    unsigned int count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "DotOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "SDotOp"));
-        size_t size = sizeof(SDotOp) * count;
-        Data.resize(size);
-        SDotOp *pPrimitives = (SDotOp*)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            SDotOp *p = &pPrimitives[i];
-            XMFLOAT4 val1,val2;
-            VERIFY_SUCCEEDED(ParseDataToVectorFloat((*Validation_Input1)[i],
-                                                    (float *)&val1, 4));
-            VERIFY_SUCCEEDED(ParseDataToVectorFloat((*Validation_Input2)[i],
-                                                    (float *)&val2, 4));
-            p->input1 = val1;
-            p->input2 = val2;
-        }
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Target = shader.Target;
-        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
-        pShaderOp->Shaders.at(0).Text = shader.Text;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("SDotOp", &data);
-
-    SDotOp *pPrimitives = (SDotOp*)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    for (size_t i = 0; i < count; ++i) {
-        SDotOp *p = &pPrimitives[i];
-        float dot2, dot3, dot4;
-        VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot2)[i], dot2));
-        VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot3)[i], dot3));
-        VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot4)[i], dot4));
-        LogCommentFmt(
-            L"element #%u, input1 = (%f, %f, %f, %f), input2 = (%f, %f, "
-            L"%f, %f), \n dot2 = %f, dot2_expected = %f, dot3 = %f, "
-            L"dot3_expected = %f, dot4 = %f, dot4_expected = %f",
-            i, p->input1.x, p->input1.y, p->input1.z, p->input1.w, p->input2.x,
-            p->input2.y, p->input2.z, p->input2.w, p->o_dot2, dot2, p->o_dot3, dot3,
-            p->o_dot4, dot4);
-        VerifyOutputWithExpectedValueFloat(p->o_dot2, dot2, Validation_type,
-                                           tolerance);
-        VerifyOutputWithExpectedValueFloat(p->o_dot3, dot3, Validation_type,
-                                           tolerance);
-        VerifyOutputWithExpectedValueFloat(p->o_dot4, dot4, Validation_type,
-                                           tolerance);
-    }
-}
-
-TEST_F(ExecutionTest, Msad4Test) {
-    WEX::TestExecution::SetVerifyOutput verifySettings(
-        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-    CComPtr<IStream> pStream;
-    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-    CComPtr<ID3D12Device> pDevice;
-    if (!CreateDevice(&pDevice)) {
-        return;
-    }
-    size_t tableSize = sizeof(Msad4OpParameters) / sizeof(TableParameter);
-    TableParameterHandler handler(Msad4OpParameters, tableSize);
-    handler.clearTableParameter();
-    VERIFY_SUCCEEDED(ParseTableRow(Msad4OpParameters, tableSize));
-
-    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
-    double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
-    unsigned int count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
-
-    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Reference =
-        &handler.GetTableParamByName(L"Validation.Reference")->m_uintTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Source =
-        &handler.GetTableParamByName(L"Validation.Source")->m_StringTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Accum =
-        &handler.GetTableParamByName(L"Validation.Accum")->m_StringTable;
-    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Expected =
-        &handler.GetTableParamByName(L"Validation.Expected")->m_StringTable;
-
-    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
-        pDevice, m_support, pStream, "Msad4",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "SMsad4"));
-        size_t size = sizeof(SMsad4) * count;
-        Data.resize(size);
-        SMsad4 *pPrimitives = (SMsad4*)Data.data();
-        for (size_t i = 0; i < count; ++i) {
-            SMsad4 *p = &pPrimitives[i];
-            XMUINT2 src;
-            XMUINT4 accum;
-            VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Source)[i], (unsigned int*)&src, 2));
-            VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Accum)[i], (unsigned int*)&accum, 4));
-            p->ref = (*Validation_Reference)[i];
-            p->src = src;
-            p->accum = accum;
-        }
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Text = Text.m_psz;
-    });
-
-    MappedData data;
-    test->Test->GetReadBackData("SMsad4", &data);
-
-    SMsad4 *pPrimitives = (SMsad4*)data.data();
-    WEX::TestExecution::DisableVerifyExceptions dve;
-    for (size_t i = 0; i < count; ++i) {
-        SMsad4 *p = &pPrimitives[i];
-        XMUINT4 result;
-        VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Expected)[i],
-                                               (unsigned int *)&result, 4));
-        LogCommentFmt(
-            L"element #%u, ref = %u(0x%08x), src = %u(0x%08x), %u(0x%08x), "
-            L"accum = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
-            L"result = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
-            L"expected = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x)", i,
-            p->ref, p->ref, p->src.x, p->src.x, p->src.y, p->src.y, p->accum.x,
-            p->accum.x, p->accum.y, p->accum.y, p->accum.z, p->accum.z,
-            p->accum.w, p->accum.w, p->result.x, p->result.x, p->result.y,
-            p->result.y, p->result.z, p->result.z, p->result.w, p->result.w,
-            result.x, result.x, result.y, result.y, result.z, result.z,
-            result.w, result.w);
-
-        VerifyOutputWithExpectedValueInt(p->result.x, result.x, tolerance);
-        VerifyOutputWithExpectedValueInt(p->result.y, result.y, tolerance);
-        VerifyOutputWithExpectedValueInt(p->result.z, result.z, tolerance);
-        VerifyOutputWithExpectedValueInt(p->result.w, result.w, tolerance);
-    }
-}
-
-template <class T1, class T2>
-void ExecutionTest::WaveIntrinsicsActivePrefixTest(
-    TableParameter *pParameterList, size_t numParameter, bool isPrefix) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-
-  // Resource representation for compute shader
-  // firstLaneId is used to group different waves
-  struct PerThreadData {
-      int firstLaneId;
-      int mask;
-      T1 input;
-      T2 output;
-  };
-
-  unsigned int NumThreadsX = 8;
-  unsigned int NumThreadsY = 12;
-  unsigned int NumThreadsZ = 1;
-
-  static const unsigned int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
-  static const unsigned int DispatchGroupCount = 1;
-  static const unsigned int ThreadCount = ThreadsPerGroup * DispatchGroupCount;
-  CComPtr<IStream> pStream;
-  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
-
-  CComPtr<ID3D12Device> pDevice;
-  if (!CreateDevice(&pDevice)) {
-    return;
-  }
-  if (!DoesDeviceSupportWaveOps(pDevice)) {
-    // Optional feature, so it's correct to not support it if declared as such.
-    WEX::Logging::Log::Comment(L"Device does not support wave operations.");
-    return;
-  }
-
-  TableParameterHandler handler(pParameterList, numParameter);
-  handler.clearTableParameter();
-  VERIFY_SUCCEEDED(ParseTableRow(pParameterList, numParameter));
-
-  unsigned int numInputSet = handler.GetTableParamByName(L"Validation.NumInputSet")->m_uint;
-
-  // Obtain the list of input lists
-  typedef WEX::TestExecution::TestDataArray<T1> DataArray;
-  std::vector<DataArray*> InputList;
-  for (unsigned int i = 0;
-    i < numInputSet; ++i) {
-    std::wstring inputName = L"Validation.InputSet";
-    inputName.append(std::to_wstring(i + 1));
-    InputList.push_back(handler.GetDataArray<T1>(inputName.data()));
-  }
-  CW2A Text(handler.GetTableParamByName(L"ShaderOp.text")->m_str);
-
-  std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
-  st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
-
-  // Running compute shader for each input set with different masks
-  for (size_t setIndex = 0; setIndex < numInputSet; ++setIndex) {
-    for (size_t maskIndex = 0; maskIndex < sizeof(MaskFunctionTable) / sizeof(MaskFunction); ++maskIndex) {
-      std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
-        pDevice, m_support, pStream, "WaveIntrinsicsOp",
-        // this callbacked is called when the test
-        // is creating the resource to run the test
-        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
-        VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp"));
-        size_t size = sizeof(PerThreadData) * ThreadCount;
-        Data.resize(size);
-        PerThreadData *pPrimitives = (PerThreadData*)Data.data();
-        // 4 different inputs for each operation test
-        size_t index = 0;
-        while (index < ThreadCount) {
-          PerThreadData *p = &pPrimitives[index];
-          DataArray *IntList = InputList[setIndex];
-          p->mask = MaskFunctionTable[maskIndex](index);
-          p->input = (*IntList)[index % IntList->GetSize()];
-          p->output = 0xFFFFBFFF;
-          index++;
-        }
-        // use shader from data table
-        pShaderOp->Shaders.at(0).Text = Text.m_psz;
-      }, ShaderOpSet);
-
-      // Check the value
-      MappedData data;
-      test->Test->GetReadBackData("SWaveIntrinsicsOp", &data);
-
-      PerThreadData *pPrimitives = (PerThreadData*)data.data();
-      WEX::TestExecution::DisableVerifyExceptions dve;
-
-      // Grouping data by waves
-      std::vector<int> firstLaneIds;
-      for (size_t i = 0; i < ThreadCount; ++i) {
-        PerThreadData *p = &pPrimitives[i];
-        int firstLaneId = p->firstLaneId;
-        if (!contains(firstLaneIds, firstLaneId)) {
-          firstLaneIds.push_back(firstLaneId);
-        }
-      }
-
-      std::map<int, std::unique_ptr<std::vector<PerThreadData *>>> waves;
-      for (size_t i = 0; i < firstLaneIds.size(); ++i) {
-        waves[firstLaneIds.at(i)] = std::make_unique<std::vector<PerThreadData*>>(std::vector<PerThreadData*>());
-      }
-
-      for (size_t i = 0; i < ThreadCount; ++i) {
-        PerThreadData *p = &pPrimitives[i];
-        waves[p->firstLaneId].get()->push_back(p);
-      }
-
-      // validate for each wave
-      for (size_t i = 0; i < firstLaneIds.size(); ++i) {
-        // collect inputs and masks for a given wave
-        std::vector<PerThreadData *> *waveData = waves[firstLaneIds.at(i)].get();
-        std::vector<T1> inputList(waveData->size());
-        std::vector<int> maskList(waveData->size());
-        std::wstring inputStr = L"Wave Inputs: ";
-        std::wstring maskStr =  L"Wave Mask:   ";
-        for (size_t j = 0; j < waveData->size(); ++j) {
-          inputList.at(j) = (waveData->at(j)->input);
-          maskList.at(j) = (waveData->at(j)->mask);
-          inputStr.append(std::to_wstring(waveData->at(j)->input));
-          inputStr.append(L" ");
-          maskStr.append(std::to_wstring(waveData->at(j)->mask));
-          maskStr.append(L" ");
-        }
-        LogCommentFmt(inputStr.data());
-        LogCommentFmt(maskStr.data());
-        // Compute expected output for a given inputs, masks, and index
-        for (size_t laneIndex = 0; laneIndex < waveData->size(); ++laneIndex) {
-          T2 expected;
-          // WaveActive is equivalent to WavePrefix lane # lane count
-          unsigned int index = isPrefix ? laneIndex : waveData->size();
-          if (waveData->at(laneIndex)->mask == 1) {
-            expected = computeExpectedWithShaderOp<T1, T2>(
-              inputList, maskList, 1, index,
-              handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-          }
-          else {
-            expected = computeExpectedWithShaderOp<T1, T2>(
-              inputList, maskList, 0, index,
-              handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
-          }
-          // TODO: use different comparison for floating point inputs
-          bool equal = waveData->at(laneIndex)->output == expected;
-          if (!equal) {
-            LogCommentFmt(L"lane%d: %4d, Expected : %4d", laneIndex, waveData->at(laneIndex)->output, expected);
-          }
-          VERIFY_IS_TRUE(equal);
-        }
-      }
-    }
-  }
-}
-
-static const unsigned int MinWarpVersionForWaveIntrinsics = 16202;
-
-TEST_F(ExecutionTest, WaveIntrinsicsActiveIntTest) {
-  if (GetTestParamUseWARP(true) &&
-      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
-    return;
-  }
-  WaveIntrinsicsActivePrefixTest<int, int>(
-      WaveIntrinsicsActiveIntParameters,
-      sizeof(WaveIntrinsicsActiveIntParameters) / sizeof(TableParameter),
-      /*isPrefix*/ false);
-}
-
-TEST_F(ExecutionTest, WaveIntrinsicsActiveUintTest) {
-  if (GetTestParamUseWARP(true) &&
-      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
-    return;
-  }
-  WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
-      WaveIntrinsicsActiveUintParameters,
-      sizeof(WaveIntrinsicsActiveUintParameters) / sizeof(TableParameter),
-      /*isPrefix*/ false);
-}
-
-TEST_F(ExecutionTest, WaveIntrinsicsPrefixIntTest) {
-  if (GetTestParamUseWARP(true) &&
-      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
-    return;
-  }
-  WaveIntrinsicsActivePrefixTest<int, int>(
-      WaveIntrinsicsPrefixIntParameters,
-      sizeof(WaveIntrinsicsPrefixIntParameters) / sizeof(TableParameter),
-      /*isPrefix*/ true);
-}
-
-TEST_F(ExecutionTest, WaveIntrinsicsPrefixUintTest) {
-  if (GetTestParamUseWARP(true) &&
-      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
-    return;
-  }
-  WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
-      WaveIntrinsicsPrefixUintParameters,
-      sizeof(WaveIntrinsicsPrefixUintParameters) / sizeof(TableParameter),
-      /*isPrefix*/ true);
-}
-
-static void WriteReadBackDump(st::ShaderOp *pShaderOp, st::ShaderOpTest *pTest,
-                              char **pReadBackDump) {
-  std::stringstream str;
-
-  unsigned count = 0;
-  for (auto &R : pShaderOp->Resources) {
-    if (!R.ReadBack)
-      continue;
-    ++count;
-    str << "Resource: " << R.Name << "\r\n";
-    // Find a descriptor that can tell us how to dump this resource.
-    bool found = false;
-    for (auto &Heaps : pShaderOp->DescriptorHeaps) {
-      for (auto &D : Heaps.Descriptors) {
-        if (_stricmp(D.ResName, R.Name) != 0) {
-          continue;
-        }
-        found = true;
-        if (_stricmp(D.Kind, "UAV") != 0) {
-          str << "Resource dump for kind " << D.Kind << " not implemented yet.\r\n";
-          break;
-        }
-        if (D.UavDesc.ViewDimension != D3D12_UAV_DIMENSION_BUFFER) {
-          str << "Resource dump for this kind of view dimension not implemented yet.\r\n";
-          break;
-        }
-        // We can map back to the structure if a structured buffer via the shader, but
-        // we'll keep this simple and simply dump out 32-bit uint/float representations.
-        MappedData data;
-        pTest->GetReadBackData(R.Name, &data);
-        uint32_t *pData = (uint32_t *)data.data();
-        size_t u32_count = R.Desc.Width / sizeof(uint32_t);
-        for (size_t i = 0; i < u32_count; ++i) {
-          float f = *(float *)pData;
-          str << i << ": 0n" << *pData << "   0x" << std::hex << *pData
-              << std::dec << "   " << f << "\r\n";
-          ++pData;
-        }
-        break;
-      }
-      if (found) break;
-    }
-    if (!found) {
-      str << "Unable to find a view for the resource.\r\n";
-    }
-  }
-
-  str << "Resources read back: " << count << "\r\n";
-
-  std::string s(str.str());
-  CComHeapPtr<char> pDump;
-  if (!pDump.Allocate(s.size() + 1))
-    throw std::bad_alloc();
-  memcpy(pDump.m_pData, s.data(), s.size());
-  pDump.m_pData[s.size()] = '\0';
-  *pReadBackDump = pDump.Detach();
-}
-
-// This is the exported interface by use from HLSLHost.exe.
-// It's exclusive with the use of the DLL as a TAEF target.
-extern "C" {
-  __declspec(dllexport) HRESULT WINAPI InitializeOpTests(void *pStrCtx, st::OutputStringFn pOutputStrFn) {
-    HRESULT hr = EnableExperimentalShaderModels();
-    if (FAILED(hr)) {
-      pOutputStrFn(pStrCtx, L"Unable to enable experimental shader models.\r\n.");
-    }
-    return S_OK;
-  }
-
-  __declspec(dllexport) HRESULT WINAPI
-      RunOpTest(void *pStrCtx, st::OutputStringFn pOutputStrFn, LPCSTR pText,
-                ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue,
-                ID3D12Resource *pRenderTarget, char **pReadBackDump) {
-
-    HRESULT hr;
-    if (pReadBackDump) *pReadBackDump = nullptr;
-    st::SetOutputFn(pStrCtx, pOutputStrFn);
-    CComPtr<ID3D12InfoQueue> pInfoQueue;
-    CComHeapPtr<char> pDump;
-    bool FilterCreation = false;
-    if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
-      // Creation is largely driven by inputs, so don't log create/destroy messages.
-      pInfoQueue->PushEmptyStorageFilter();
-      pInfoQueue->PushEmptyRetrievalFilter();
-      if (FilterCreation) {
-        D3D12_INFO_QUEUE_FILTER filter;
-        D3D12_MESSAGE_CATEGORY denyCategories[] = { D3D12_MESSAGE_CATEGORY_STATE_CREATION };
-        ZeroMemory(&filter, sizeof(filter));
-        filter.DenyList.NumCategories = _countof(denyCategories);
-        filter.DenyList.pCategoryList = denyCategories;
-        pInfoQueue->PushStorageFilter(&filter);
-      }
-    }
-    else {
-      pOutputStrFn(pStrCtx, L"Unable to enable info queue for D3D.\r\n.");
-    }
-    try {
-      dxc::DxcDllSupport m_support;
-      m_support.Initialize();
-
-      const char *pName = nullptr;
-      CComPtr<IStream> pStream = SHCreateMemStream((BYTE *)pText, strlen(pText));
-      std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
-        std::make_shared<st::ShaderOpSet>();
-      st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
-      st::ShaderOp *pShaderOp;
-      if (pName == nullptr) {
-        if (ShaderOpSet->ShaderOps.size() != 1) {
-          pOutputStrFn(pStrCtx, L"Expected a single shader operation.\r\n");
-          return E_FAIL;
-        }
-        pShaderOp = ShaderOpSet->ShaderOps[0].get();
-      }
-      else {
-        pShaderOp = ShaderOpSet->GetShaderOp(pName);
-      }
-      if (pShaderOp == nullptr) {
-        std::string msg = "Unable to find shader op ";
-        msg += pName;
-        msg += "; available ops";
-        const char sep = ':';
-        for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
-          msg += sep;
-          msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
-        }
-        CA2W msgWide(msg.c_str());
-        pOutputStrFn(pStrCtx, msgWide);
-        return E_FAIL;
-      }
-
-      std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
-      test->SetupRenderTarget(pShaderOp, pDevice, pCommandQueue, pRenderTarget);
-      test->SetDxcSupport(&m_support);
-      test->RunShaderOp(pShaderOp);
-      test->PresentRenderTarget(pShaderOp, pCommandQueue, pRenderTarget);
-
-      pOutputStrFn(pStrCtx, L"Rendering complete.\r\n");
-
-      if (!pShaderOp->IsCompute()) {
-        D3D12_QUERY_DATA_PIPELINE_STATISTICS stats;
-        test->GetPipelineStats(&stats);
-        wchar_t statsText[400];
-        StringCchPrintfW(statsText, _countof(statsText),
-          L"Vertices/primitives read by input assembler: %I64u/%I64u\r\n"
-          L"Vertex shader invocations: %I64u\r\n"
-          L"Geometry shader invocations/output primitive: %I64u/%I64u\r\n"
-          L"Primitives sent to rasterizer/rendered: %I64u/%I64u\r\n"
-          L"PS/HS/DS/CS invocations: %I64u/%I64u/%I64u/%I64u\r\n",
-          stats.IAVertices, stats.IAPrimitives, stats.VSInvocations,
-          stats.GSInvocations, stats.GSPrimitives, stats.CInvocations,
-          stats.CPrimitives, stats.PSInvocations, stats.HSInvocations,
-          stats.DSInvocations, stats.CSInvocations);
-        pOutputStrFn(pStrCtx, statsText);
-      }
-
-      if (pReadBackDump) {
-        WriteReadBackDump(pShaderOp, test.get(), &pDump);
-      }
-
-      hr = S_OK;
-    }
-    catch (const CAtlException &E)
-    {
-      hr = E.m_hr;
-    }
-    catch (const std::bad_alloc &)
-    {
-      hr = E_OUTOFMEMORY;
-    }
-    catch (const std::exception &)
-    {
-      hr = E_FAIL;
-    }
-
-    // Drain the device message queue if available.
-    if (pInfoQueue != nullptr) {
-      wchar_t buf[200];
-      StringCchPrintfW(buf, _countof(buf),
-        L"NumStoredMessages=%u limit/discarded by limit=%u/%u "
-        L"allowed/denied by storage filter=%u/%u "
-        L"NumStoredMessagesAllowedByRetrievalFilter=%u\r\n",
-        (unsigned)pInfoQueue->GetNumStoredMessages(),
-        (unsigned)pInfoQueue->GetMessageCountLimit(),
-        (unsigned)pInfoQueue->GetNumMessagesDiscardedByMessageCountLimit(),
-        (unsigned)pInfoQueue->GetNumMessagesAllowedByStorageFilter(),
-        (unsigned)pInfoQueue->GetNumMessagesDeniedByStorageFilter(),
-        (unsigned)pInfoQueue->GetNumStoredMessagesAllowedByRetrievalFilter());
-      pOutputStrFn(pStrCtx, buf);
-
-      WriteInfoQueueMessages(pStrCtx, pOutputStrFn, pInfoQueue);
-
-      pInfoQueue->ClearStoredMessages();
-      pInfoQueue->PopRetrievalFilter();
-      pInfoQueue->PopStorageFilter();
-      if (FilterCreation) {
-        pInfoQueue->PopStorageFilter();
-      }
-    }
-
-    if (pReadBackDump) *pReadBackDump = pDump.Detach();
-
-    return hr;
-  }
-}
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// ExecutionTest.cpp                                                         //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// These tests run by executing compiled programs, and thus involve more     //
+// moving parts, like the runtime and drivers.                               //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+#include <string>
+#include <map>
+#include <unordered_set>
+#include <strstream>
+#include <iomanip>
+#include "CompilationResult.h"
+#include "HLSLTestData.h"
+#include <Shlwapi.h>
+#include <atlcoll.h>
+#include <locale>
+#include <algorithm>
+
+#undef _read
+#include "WexTestClass.h"
+#include "HlslTestUtils.h"
+#include "DxcTestUtils.h"
+#include "dxc/Support/Global.h"
+#include "dxc/Support/WinIncludes.h"
+#include "dxc/Support/FileIOHelper.h"
+#include "dxc/Support/Unicode.h"
+
+//
+// d3d12.h and dxgi1_4.h are included in the Windows 10 SDK
+// https://msdn.microsoft.com/en-us/library/windows/desktop/dn899120(v=vs.85).aspx
+// https://developer.microsoft.com/en-US/windows/downloads/windows-10-sdk
+//
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <DXGIDebug.h>
+#include <D3dx12.h>
+#include <DirectXMath.h>
+#include <strsafe.h>
+#include <d3dcompiler.h>
+#include <wincodec.h>
+#include "ShaderOpTest.h"
+
+#pragma comment(lib, "d3dcompiler.lib")
+#pragma comment(lib, "windowscodecs.lib")
+#pragma comment(lib, "dxguid.lib")
+#pragma comment(lib, "version.lib")
+
+// A more recent Windows SDK than currently required is needed for these.
+typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)(
+  UINT                                    NumFeatures,
+  __in_ecount(NumFeatures) const IID*     pIIDs,
+  __in_ecount_opt(NumFeatures) void*      pConfigurationStructs,
+  __in_ecount_opt(NumFeatures) UINT*      pConfigurationStructSizes);
+
+static const GUID D3D12ExperimentalShaderModelsID = { /* 76f5573e-f13a-40f5-b297-81ce9e18933f */
+  0x76f5573e,
+  0xf13a,
+  0x40f5,
+  { 0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f }
+};
+
+using namespace DirectX;
+using namespace hlsl_test;
+
+template <typename TSequence, typename T>
+static bool contains(TSequence s, const T &val) {
+  return std::cend(s) != std::find(std::cbegin(s), std::cend(s), val);
+}
+
+template <typename InputIterator, typename T>
+static bool contains(InputIterator b, InputIterator e, const T &val) {
+  return e != std::find(b, e, val);
+}
+
+static HRESULT EnableExperimentalShaderModels() {
+  HMODULE hRuntime = LoadLibraryW(L"d3d12.dll");
+  if (hRuntime == NULL) {
+    return HRESULT_FROM_WIN32(GetLastError());
+  }
+
+  D3D12EnableExperimentalFeaturesFn pD3D12EnableExperimentalFeatures =
+    (D3D12EnableExperimentalFeaturesFn)GetProcAddress(hRuntime, "D3D12EnableExperimentalFeatures");
+  if (pD3D12EnableExperimentalFeatures == nullptr) {
+    FreeLibrary(hRuntime);
+    return HRESULT_FROM_WIN32(GetLastError());
+  }
+
+  HRESULT hr = pD3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModelsID, nullptr, nullptr);
+  FreeLibrary(hRuntime);
+  return hr;
+}
+
+static HRESULT ReportLiveObjects() {
+  CComPtr<IDXGIDebug1> pDebug;
+  IFR(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&pDebug)));
+  IFR(pDebug->ReportLiveObjects(DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_ALL));
+  return S_OK;
+}
+
+static void WriteInfoQueueMessages(void *pStrCtx, st::OutputStringFn pOutputStrFn, ID3D12InfoQueue *pInfoQueue) {
+  bool allMessagesOK = true;
+  UINT64 count = pInfoQueue->GetNumStoredMessages();
+  CAtlArray<BYTE> message;
+  for (UINT64 i = 0; i < count; ++i) {
+    // 'GetMessageA' rather than 'GetMessage' is an artifact of user32 headers.
+    SIZE_T msgLen = 0;
+    if (FAILED(pInfoQueue->GetMessageA(i, nullptr, &msgLen))) {
+      allMessagesOK = false;
+      continue;
+    }
+    if (message.GetCount() < msgLen) {
+      if (!message.SetCount(msgLen)) {
+        allMessagesOK = false;
+        continue;
+      }
+    }
+    D3D12_MESSAGE *pMessage = (D3D12_MESSAGE *)message.GetData();
+    if (FAILED(pInfoQueue->GetMessageA(i, pMessage, &msgLen))) {
+      allMessagesOK = false;
+      continue;
+    }
+    CA2W msgW(pMessage->pDescription, CP_ACP);
+    pOutputStrFn(pStrCtx, msgW.m_psz);
+    pOutputStrFn(pStrCtx, L"\r\n");
+  }
+  if (!allMessagesOK) {
+    pOutputStrFn(pStrCtx, L"Failed to retrieve some messages.\r\n");
+  }
+}
+
+class CComContext {
+private:
+  bool m_init;
+public:
+  CComContext() : m_init(false) {}
+  ~CComContext() { Dispose(); }
+  void Dispose() { if (!m_init) return; m_init = false; CoUninitialize(); }
+  HRESULT Init() { HRESULT hr = CoInitializeEx(0, COINIT_MULTITHREADED); if (SUCCEEDED(hr)) { m_init = true; } return hr; }
+};
+
+static void SavePixelsToFile(LPCVOID pPixels, DXGI_FORMAT format, UINT32 m_width, UINT32 m_height, LPCWSTR pFileName) {
+  CComContext ctx;
+  CComPtr<IWICImagingFactory> pFactory;
+  CComPtr<IWICBitmap> pBitmap;
+  CComPtr<IWICBitmapEncoder> pEncoder;
+  CComPtr<IWICBitmapFrameEncode> pFrameEncode;
+  CComPtr<hlsl::AbstractMemoryStream> pStream;
+  CComPtr<IMalloc> pMalloc;
+
+  struct PF {
+    DXGI_FORMAT Format;
+    GUID PixelFormat;
+    UINT32 PixelSize;
+    bool operator==(DXGI_FORMAT F) const {
+      return F == Format;
+    }
+  } Vals[] = {
+    // Add more pixel format mappings as needed.
+    { DXGI_FORMAT_R8G8B8A8_UNORM, GUID_WICPixelFormat32bppRGBA, 4 }
+  };
+  PF *pFormat = std::find(Vals, Vals + _countof(Vals), format);
+
+  VERIFY_SUCCEEDED(ctx.Init());
+  VERIFY_SUCCEEDED(CoCreateInstance(CLSID_WICImagingFactory, NULL, CLSCTX_INPROC_SERVER, IID_IWICImagingFactory, (LPVOID*)&pFactory));
+  VERIFY_SUCCEEDED(CoGetMalloc(1, &pMalloc));
+  VERIFY_SUCCEEDED(hlsl::CreateMemoryStream(pMalloc, &pStream));
+  VERIFY_ARE_NOT_EQUAL(pFormat, Vals + _countof(Vals));
+  VERIFY_SUCCEEDED(pFactory->CreateBitmapFromMemory(m_width, m_height, pFormat->PixelFormat, m_width * pFormat->PixelSize, m_width * m_height * pFormat->PixelSize, (BYTE *)pPixels, &pBitmap));
+  VERIFY_SUCCEEDED(pFactory->CreateEncoder(GUID_ContainerFormatBmp, nullptr, &pEncoder));
+  VERIFY_SUCCEEDED(pEncoder->Initialize(pStream, WICBitmapEncoderNoCache));
+  VERIFY_SUCCEEDED(pEncoder->CreateNewFrame(&pFrameEncode, nullptr));
+  VERIFY_SUCCEEDED(pFrameEncode->Initialize(nullptr));
+  VERIFY_SUCCEEDED(pFrameEncode->WriteSource(pBitmap, nullptr));
+  VERIFY_SUCCEEDED(pFrameEncode->Commit());
+  VERIFY_SUCCEEDED(pEncoder->Commit());
+  hlsl::WriteBinaryFile(pFileName, pStream->GetPtr(), pStream->GetPtrSize());
+}
+
+// Setup for wave intrinsics tests
+enum class ShaderOpKind {
+  WaveSum,
+  WaveProduct,
+  WaveActiveMax,
+  WaveActiveMin,
+  WaveCountBits,
+  WaveActiveAllEqual,
+  WaveActiveAnyTrue,
+  WaveActiveAllTrue,
+  WaveActiveBitOr,
+  WaveActiveBitAnd,
+  WaveActiveBitXor,
+  ShaderOpInvalid
+};
+
+struct ShaderOpKindPair {
+  LPCWSTR name;
+  ShaderOpKind kind;
+};
+
+static ShaderOpKindPair ShaderOpKindTable[] = {
+  { L"WaveActiveSum", ShaderOpKind::WaveSum },
+  { L"WaveActiveUSum", ShaderOpKind::WaveSum },
+  { L"WaveActiveProduct", ShaderOpKind::WaveProduct },
+  { L"WaveActiveUProduct", ShaderOpKind::WaveProduct },
+  { L"WaveActiveMax", ShaderOpKind::WaveActiveMax },
+  { L"WaveActiveUMax", ShaderOpKind::WaveActiveMax },
+  { L"WaveActiveMin", ShaderOpKind::WaveActiveMin },
+  { L"WaveActiveUMin", ShaderOpKind::WaveActiveMin },
+  { L"WaveActiveCountBits", ShaderOpKind::WaveCountBits },
+  { L"WaveActiveAllEqual", ShaderOpKind::WaveActiveAllEqual },
+  { L"WaveActiveAnyTrue", ShaderOpKind::WaveActiveAnyTrue },
+  { L"WaveActiveAllTrue", ShaderOpKind::WaveActiveAllTrue },
+  { L"WaveActiveBitOr", ShaderOpKind::WaveActiveBitOr },
+  { L"WaveActiveBitAnd", ShaderOpKind::WaveActiveBitAnd },
+  { L"WaveActiveBitXor", ShaderOpKind::WaveActiveBitXor },
+  { L"WavePrefixSum", ShaderOpKind::WaveSum },
+  { L"WavePrefixUSum", ShaderOpKind::WaveSum },
+  { L"WavePrefixProduct", ShaderOpKind::WaveProduct },
+  { L"WavePrefixUProduct", ShaderOpKind::WaveProduct },
+  { L"WavePrefixMax", ShaderOpKind::WaveActiveMax },
+  { L"WavePrefixUMax", ShaderOpKind::WaveActiveMax },
+  { L"WavePrefixMin", ShaderOpKind::WaveActiveMin },
+  { L"WavePrefixUMin", ShaderOpKind::WaveActiveMin },
+  { L"WavePrefixCountBits", ShaderOpKind::WaveCountBits }
+};
+
+ShaderOpKind GetShaderOpKind(LPCWSTR str) {
+  for (size_t i = 0; i < sizeof(ShaderOpKindTable)/sizeof(ShaderOpKindPair); ++i) {
+    if (_wcsicmp(ShaderOpKindTable[i].name, str) == 0) {
+      return ShaderOpKindTable[i].kind;
+    }
+  }
+  DXASSERT(false, "Invalid ShaderOp name: %s", str);
+  return ShaderOpKind::ShaderOpInvalid;
+}
+
+// Virtual class to compute the expected result given a set of inputs
+struct TableParameter;
+
+template <typename InType, typename OutType, ShaderOpKind kind>
+struct computeExpected {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    return 0;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveSum> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    OutType sum = 0;
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue) {
+        sum += inputs.at(i);
+      }
+    }
+    return sum;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveProduct> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    OutType prod = 1;
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue) {
+        prod *= inputs.at(i);
+      }
+    }
+    return prod;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    OutType maximum = std::numeric_limits<OutType>::min();
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue && inputs.at(i) > maximum)
+        maximum = inputs.at(i);
+    }
+    return maximum;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    OutType minimum = std::numeric_limits<OutType>::max();
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue && inputs.at(i) < minimum)
+        minimum = inputs.at(i);
+    }
+    return minimum;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveCountBits> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    OutType count = 0;
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue && inputs.at(i) > 3) {
+        count++;
+      }
+    }
+    return count;
+  }
+};
+
+// In HLSL, boolean is represented in a 4 byte (uint32) format,
+// So we cannot use c++ bool type to represent bool in HLSL
+// HLSL returns 0 for false and 1 for true
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue && inputs.at(i) != 0) {
+        return 1;
+      }
+    }
+    return 0;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue && inputs.at(i) == 0) {
+        return 0;
+      }
+    }
+    return 1;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual> {
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    const InType *val = nullptr;
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue) {
+        if (val && *val != inputs.at(i)) {
+          return 0;
+        }
+        val = &inputs.at(i);
+      }
+    }
+    return 1;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr> {
+  // Bitwise-ORs the inputs of the first `index` lanes whose mask equals maskValue; 0 when none is selected.
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    OutType combined = 0;
+    for (size_t lane = 0; lane < index; ++lane) {
+      if (masks.at(lane) != maskValue) continue;
+      combined |= inputs.at(lane);
+    }
+    return combined;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd> {
+  // Bitwise-ANDs the inputs of the first `index` lanes whose mask equals maskValue; all bits set when none is selected.
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    // Start from all ones at OutType's own width; a 0xffffffff literal only covers 32 bits.
+    OutType bits = static_cast<OutType>(~static_cast<OutType>(0));
+    for (size_t i = 0; i < index; ++i) {
+      if (masks.at(i) == maskValue) bits &= inputs.at(i);
+    }
+    return bits;
+  }
+};
+
+template <typename InType, typename OutType>
+struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor> {
+  // Bitwise-XORs the inputs of the first `index` lanes whose mask equals maskValue; 0 when none is selected.
+  OutType operator()(const std::vector<InType> &inputs,
+                     const std::vector<int> &masks, int maskValue,
+                     unsigned int index) {
+    OutType combined = 0;
+    for (size_t lane = 0; lane < index; ++lane) {
+      if (masks.at(lane) != maskValue) continue;
+      combined ^= inputs.at(lane);
+    }
+    return combined;
+  }
+};
+
+// Mask functions used to control active lanes
+// Mask value 1 for every argument.
+static int MaskAll(int /*i*/) { return 1; }
+
+// Mask value 1 when i is a multiple of two, otherwise 0.
+static int MaskEveryOther(int i) {
+  return (i % 2 != 0) ? 0 : 1;
+}
+
+// Mask value 1 when i is a multiple of three, otherwise 0.
+static int MaskEveryThird(int i) {
+  return (i % 3 != 0) ? 0 : 1;
+}
+// TODO: It seems there is an issue with WARP with controlling active lanes
+// Add more masks when this is resolved
+using MaskFunction = int (*)(int);
+static MaskFunction MaskFunctionTable[] = {MaskAll, MaskEveryOther,
+                                           MaskEveryThird};
+
+// Computes the expected value for the wave intrinsic named by `str` by forwarding to the matching computeExpected specialization.
+template <typename InType, typename OutType>
+static OutType computeExpectedWithShaderOp(const std::vector<InType> &inputs,
+                                           const std::vector<int> &masks,
+                                           int maskValue, unsigned int index, LPCWSTR str) {
+  switch (GetShaderOpKind(str)) {
+  case ShaderOpKind::WaveSum:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveSum>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveProduct:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveProduct>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveMax:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveMin:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveCountBits:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveCountBits>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveBitOr:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveBitAnd:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveBitXor:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveAnyTrue:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveAllTrue:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue>()(inputs, masks, maskValue, index);
+  case ShaderOpKind::WaveActiveAllEqual:
+    return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual>()(inputs, masks, maskValue, index);
+  default:
+    // str is a wide string, so the narrow format string must use %ls rather than %s.
+    DXASSERT(false, "Invalid ShaderOp Name: %ls", str);
+    return (OutType) 0;
+  }
+}
+
+
+// Returns true when the D3D10Warp.dll resolved by LoadLibrary reports a file-version build number of at least minBuildNumber.
+bool IsValidWarpDllVersion(unsigned int minBuildNumber) {
+    bool isValid = false;
+    HMODULE pLibrary = LoadLibrary("D3D10Warp.dll");
+    if (pLibrary) {
+        char path[MAX_PATH];
+        DWORD length = GetModuleFileName(pLibrary, path, MAX_PATH);
+        if (length) {
+            DWORD dwVerHnd = 0;
+            DWORD dwVersionInfoSize = GetFileVersionInfoSize(path, &dwVerHnd);
+            std::unique_ptr<BYTE[]> VffInfo(new BYTE[dwVersionInfoSize]); // the reported size is in bytes
+            if (GetFileVersionInfo(path, NULL, dwVersionInfoSize, VffInfo.get())) {
+                LPVOID versionInfo;
+                UINT size;
+                // Only read the root block when it is large enough to hold a VS_FIXEDFILEINFO.
+                if (VerQueryValue(VffInfo.get(), "\\", &versionInfo, &size) && size >= sizeof(VS_FIXEDFILEINFO)) {
+                    VS_FIXEDFILEINFO *verInfo = (VS_FIXEDFILEINFO *)versionInfo;
+                    // The build number is the high word of dwFileVersionLS.
+                    unsigned int warpBuildNumber = verInfo->dwFileVersionLS >> 16 & 0xffff;
+                    isValid = verInfo->dwSignature == 0xFEEF04BD && warpBuildNumber >= minBuildNumber;
+                }
+            }
+        }
+        // Drop the reference taken by LoadLibrary on every path, including the success path.
+        FreeLibrary(pLibrary);
+    }
+    return isValid;
+}
+
+
+class ExecutionTest {
+public:
+  // By default, ignore these tests, which require a recent build to run properly.
+  BEGIN_TEST_CLASS(ExecutionTest)
+    TEST_CLASS_PROPERTY(L"Parallel", L"true")
+    TEST_CLASS_PROPERTY(L"Ignore", L"true")
+    TEST_METHOD_PROPERTY(L"Priority", L"0")
+  END_TEST_CLASS()
+  TEST_CLASS_SETUP(ExecutionTestClassSetup)
+
+  TEST_METHOD(BasicComputeTest);
+  TEST_METHOD(BasicTriangleTest);
+  TEST_METHOD(BasicTriangleOpTest);
+  TEST_METHOD(OutOfBoundsTest);
+  TEST_METHOD(SaturateTest);
+  TEST_METHOD(SignTest);
+  TEST_METHOD(Int64Test);
+  TEST_METHOD(WaveIntrinsicsTest);
+  TEST_METHOD(WaveIntrinsicsInPSTest);
+  TEST_METHOD(PartialDerivTest);
+
+  // TODO: Change the priority to 0 once there is a driver that fixes the issue with WaveActive operations
+  BEGIN_TEST_METHOD(WaveIntrinsicsActiveIntTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveIntTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(WaveIntrinsicsActiveUintTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveUintTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(WaveIntrinsicsPrefixIntTest)
+  TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixIntTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(WaveIntrinsicsPrefixUintTest)
+  TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixUintTable")
+  END_TEST_METHOD()
+  // TAEF data-driven tests.
+  BEGIN_TEST_METHOD(UnaryFloatOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryFloatOpTable")
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(BinaryFloatOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryFloatOpTable")
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(TertiaryFloatOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryFloatOpTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(UnaryIntOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryIntOpTable")
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(BinaryIntOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryIntOpTable")
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(TertiaryIntOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryIntOpTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(UnaryUintOpTest)
+     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryUintOpTable")
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(BinaryUintOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryUintOpTable")
+  END_TEST_METHOD()
+  BEGIN_TEST_METHOD(TertiaryUintOpTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUintOpTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(DotTest)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DotOpTable")
+  END_TEST_METHOD()
+
+  BEGIN_TEST_METHOD(Msad4Test)
+    TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Msad4Table")
+  END_TEST_METHOD()
+
+  dxc::DxcDllSupport m_support;
+  bool m_ExperimentalModeEnabled = false;
+  static const float ClearColor[4];
+
+  template <class T1, class T2>
+  void WaveIntrinsicsActivePrefixTest(
+    TableParameter *pParameterList, size_t numParameter, bool isPrefix);
+
+  bool UseDxbc() { // Value of the "DXBC" test parameter; the PSO helpers use it to choose DXBCFromText over CompileFromText.
+    return GetTestParamBool(L"DXBC");
+  }
+
+  bool UseDebugIfaces() { // Always true, so EnableDebugLayer and CreateDevice always take their debug-interface branches.
+    return true;
+  }
+
+  bool SaveImages() { // Value of the "SaveImages" test parameter as read by GetTestParamBool.
+    return GetTestParamBool(L"SaveImages");
+  }
+
+  void CompileFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob) { // Compiles the UTF-8 HLSL text with the DXC compiler and stores the result blob in *ppBlob.
+    VERIFY_SUCCEEDED(m_support.Initialize());
+    CComPtr<IDxcCompiler> pCompiler;
+    CComPtr<IDxcLibrary> pLibrary;
+    CComPtr<IDxcBlobEncoding> pTextBlob;
+    CComPtr<IDxcOperationResult> pResult;
+    HRESULT resultCode;
+    VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcCompiler, &pCompiler));
+    VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));
+    VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned((LPBYTE)pText, strlen(pText), CP_UTF8, &pTextBlob));
+    VERIFY_SUCCEEDED(pCompiler->Compile(pTextBlob, L"hlsl.hlsl", pEntryPoint, pTargetProfile, nullptr, 0, nullptr, 0, nullptr, &pResult)); // no arguments, defines or include handler are passed
+    VERIFY_SUCCEEDED(pResult->GetStatus(&resultCode));
+    if (FAILED(resultCode)) { // log the compiler's error text before the verification below fails
+      CComPtr<IDxcBlobEncoding> errors;
+      VERIFY_SUCCEEDED(pResult->GetErrorBuffer(&errors));
+      LogCommentFmt(L"Failed to compile shader: %s", BlobToUtf16(errors).data());
+    }
+    VERIFY_SUCCEEDED(resultCode);
+    VERIFY_SUCCEEDED(pResult->GetResult((IDxcBlob **)ppBlob)); // NOTE(review): assumes ID3DBlob and IDxcBlob are interchangeable - confirm
+  }
+
+  void CreateComputeCommandQueue(ID3D12Device *pDevice, LPCWSTR pName, ID3D12CommandQueue **ppCommandQueue) { // Creates a compute-type command queue with no flags and assigns it the name pName.
+    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
+    queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
+    VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
+    VERIFY_SUCCEEDED((*ppCommandQueue)->SetName(pName));
+  }
+
+  void CreateComputePSO(ID3D12Device *pDevice, ID3D12RootSignature *pRootSignature, LPCSTR pShader, ID3D12PipelineState **ppComputeState) { // Compiles entry point "main" of pShader as cs_6_0 and creates a compute pipeline state bound to pRootSignature.
+    CComPtr<ID3DBlob> pComputeShader;
+
+    // Compile the shader through the D3DCompile (DXBC) or DXC path, depending on UseDxbc().
+    if (UseDxbc()) {
+      DXBCFromText(pShader, L"main", L"cs_6_0", &pComputeShader);
+    }
+    else {
+      CompileFromText(pShader, L"main", L"cs_6_0", &pComputeShader);
+    }
+
+    // Describe and create the compute pipeline state object (PSO).
+    D3D12_COMPUTE_PIPELINE_STATE_DESC computePsoDesc = {};
+    computePsoDesc.pRootSignature = pRootSignature;
+    computePsoDesc.CS = CD3DX12_SHADER_BYTECODE(pComputeShader);
+
+    VERIFY_SUCCEEDED(pDevice->CreateComputePipelineState(&computePsoDesc, IID_PPV_ARGS(ppComputeState)));
+  }
+
+  bool CreateDevice(_COM_Outptr_ ID3D12Device **ppDevice) { // Creates a feature-level 11.0 D3D12 device on WARP or a hardware adapter; returns false (with *ppDevice null) when none is usable.
+    const D3D_FEATURE_LEVEL FeatureLevelRequired = D3D_FEATURE_LEVEL_11_0;
+    CComPtr<IDXGIFactory4> factory;
+    CComPtr<ID3D12Device> pDevice;
+
+    *ppDevice = nullptr;
+
+    VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)));
+    if (GetTestParamUseWARP(true)) {
+      CComPtr<IDXGIAdapter> warpAdapter;
+      VERIFY_SUCCEEDED(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter)));
+      HRESULT createHR = D3D12CreateDevice(warpAdapter, FeatureLevelRequired,
+                                           IID_PPV_ARGS(&pDevice));
+      if (FAILED(createHR)) { // reported as Blocked rather than as a test failure
+        LogCommentFmt(L"The available version of WARP does not support d3d12.");
+        WEX::Logging::Log::Result(WEX::Logging::TestResults::Blocked);
+        return false;
+      }
+    } else {
+      CComPtr<IDXGIAdapter1> hardwareAdapter;
+      WEX::Common::String AdapterValue;
+      IFT(WEX::TestExecution::RuntimeParameters::TryGetValue(L"Adapter",
+                                                             AdapterValue)); // NOTE(review): if IFT raises on a failed HRESULT, a missing "Adapter" parameter aborts here - confirm
+      GetHardwareAdapter(factory, AdapterValue, &hardwareAdapter);
+      if (hardwareAdapter == nullptr) {
+        WEX::Logging::Log::Error(
+            L"Unable to find hardware adapter with D3D12 support.");
+        return false;
+      }
+      VERIFY_SUCCEEDED(D3D12CreateDevice(hardwareAdapter, FeatureLevelRequired,
+                                         IID_PPV_ARGS(&pDevice)));
+      DXGI_ADAPTER_DESC1 AdapterDesc;
+      VERIFY_SUCCEEDED(hardwareAdapter->GetDesc1(&AdapterDesc));
+      LogCommentFmt(L"Using Adapter: %s", AdapterDesc.Description);
+    }
+    if (pDevice == nullptr)
+      return false;
+
+    if (!UseDxbc()) {
+      // Check for DXIL support.
+      // This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we only
+      // require the Windows 10 SDK.
+      typedef enum D3D_SHADER_MODEL {
+        D3D_SHADER_MODEL_5_1 = 0x51,
+        D3D_SHADER_MODEL_6_0 = 0x60
+      } D3D_SHADER_MODEL;
+      typedef struct D3D12_FEATURE_DATA_SHADER_MODEL {
+        _Inout_ D3D_SHADER_MODEL HighestShaderModel;
+      } D3D12_FEATURE_DATA_SHADER_MODEL;
+      const UINT D3D12_FEATURE_SHADER_MODEL = 7;
+      D3D12_FEATURE_DATA_SHADER_MODEL SMData;
+      SMData.HighestShaderModel = D3D_SHADER_MODEL_6_0; // in/out field: set to the model being asked about before the query
+      VERIFY_SUCCEEDED(pDevice->CheckFeatureSupport(
+        (D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL, &SMData, sizeof(SMData))); // NOTE(review): a failing query fails the test instead of reporting Blocked - confirm intended
+      if (SMData.HighestShaderModel != D3D_SHADER_MODEL_6_0) {
+        LogCommentFmt(L"The selected device does not support "
+                      L"shader model 6 (required for DXIL).");
+        WEX::Logging::Log::Result(WEX::Logging::TestResults::Blocked);
+        return false;
+      }
+    }
+
+    if (UseDebugIfaces()) {
+      CComPtr<ID3D12InfoQueue> pInfoQueue;
+      if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) { // the info queue is optional; its absence is not an error
+        pInfoQueue->SetMuteDebugOutput(FALSE);
+      }
+    }
+
+    *ppDevice = pDevice.Detach(); // hand the reference held by pDevice to the caller
+    return true;
+  }
+
+  void CreateGraphicsCommandQueue(ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue) { // Creates a direct-type command queue with no flags.
+    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
+    queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
+    VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
+  }
+
+  void CreateGraphicsCommandQueueAndList(
+      ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue,
+      ID3D12CommandAllocator **ppAllocator,
+      ID3D12GraphicsCommandList **ppCommandList, ID3D12PipelineState *pPSO) { // Creates a direct queue, a direct allocator and a command list on that allocator with pPSO as its initial pipeline state.
+    CreateGraphicsCommandQueue(pDevice, ppCommandQueue);
+    VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(
+        D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(ppAllocator)));
+    VERIFY_SUCCEEDED(pDevice->CreateCommandList(
+        0, D3D12_COMMAND_LIST_TYPE_DIRECT, *ppAllocator, pPSO,
+        IID_PPV_ARGS(ppCommandList)));
+  }
+
+  void CreateGraphicsPSO(ID3D12Device *pDevice,
+                         D3D12_INPUT_LAYOUT_DESC *pInputLayout,
+                         ID3D12RootSignature *pRootSignature, LPCSTR pShaders,
+                         ID3D12PipelineState **ppPSO) { // Compiles "VSMain"/"PSMain" from pShaders (vs_6_0/ps_6_0) and creates a graphics pipeline state for one R8G8B8A8_UNORM render target.
+    CComPtr<ID3DBlob> vertexShader;
+    CComPtr<ID3DBlob> pixelShader;
+
+    if (UseDxbc()) { // same sources, compiled via D3DCompile or via DXC
+      DXBCFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader);
+      DXBCFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader);
+    } else {
+      CompileFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader);
+      CompileFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader);
+    }
+
+    // Describe and create the graphics pipeline state object (PSO).
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};
+    psoDesc.InputLayout = *pInputLayout;
+    psoDesc.pRootSignature = pRootSignature;
+    psoDesc.VS = CD3DX12_SHADER_BYTECODE(vertexShader);
+    psoDesc.PS = CD3DX12_SHADER_BYTECODE(pixelShader);
+    psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
+    psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
+    psoDesc.DepthStencilState.DepthEnable = FALSE; // depth and stencil tests are both disabled
+    psoDesc.DepthStencilState.StencilEnable = FALSE;
+    psoDesc.SampleMask = UINT_MAX;
+    psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+    psoDesc.NumRenderTargets = 1;
+    psoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
+    psoDesc.SampleDesc.Count = 1;
+    VERIFY_SUCCEEDED(
+        pDevice->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(ppPSO)));
+  }
+
+  void CreateRenderTargetAndReadback(ID3D12Device *pDevice,
+                                     ID3D12DescriptorHeap *pHeap, UINT width,
+                                     UINT height,
+                                     ID3D12Resource **ppRenderTarget,
+                                     ID3D12Resource **ppBuffer) { // Creates a width x height R8G8B8A8_UNORM render target (viewed through the first descriptor of pHeap) and a readback buffer for it.
+    const DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM;
+    const size_t formatElementSize = 4;
+    CComPtr<ID3D12Resource> pRenderTarget;
+    CComPtr<ID3D12Resource> pBuffer;
+
+    CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle(pHeap->GetCPUDescriptorHandleForHeapStart());
+    CD3DX12_HEAP_PROPERTIES rtHeap(D3D12_HEAP_TYPE_DEFAULT);
+    CD3DX12_RESOURCE_DESC rtDesc(
+        CD3DX12_RESOURCE_DESC::Tex2D(format, width, height));
+    CD3DX12_CLEAR_VALUE rtClearVal(format, ClearColor);
+    rtDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
+    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
+        &rtHeap, D3D12_HEAP_FLAG_NONE, &rtDesc, D3D12_RESOURCE_STATE_COPY_DEST,
+        &rtClearVal, IID_PPV_ARGS(&pRenderTarget)));
+    pDevice->CreateRenderTargetView(pRenderTarget, nullptr, rtvHandle);
+    // rtvHandle.Offset(1, rtvDescriptorSize);  // Not needed for a single
+    // resource.
+
+    CD3DX12_HEAP_PROPERTIES readHeap(D3D12_HEAP_TYPE_READBACK);
+    // RecordRenderAndReadback copies rows at a pitch rounded up to D3D12_TEXTURE_DATA_PITCH_ALIGNMENT, so size the buffer by that padded pitch.
+    const UINT64 readbackRowPitch = (UINT64(width) * formatElementSize + D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1) / D3D12_TEXTURE_DATA_PITCH_ALIGNMENT * D3D12_TEXTURE_DATA_PITCH_ALIGNMENT;
+    CD3DX12_RESOURCE_DESC readDesc(CD3DX12_RESOURCE_DESC::Buffer(readbackRowPitch * height));
+    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
+        &readHeap, D3D12_HEAP_FLAG_NONE, &readDesc,
+        D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&pBuffer)));
+
+    *ppRenderTarget = pRenderTarget.Detach();
+    *ppBuffer = pBuffer.Detach();
+  }
+
+  void CreateRootSignatureFromDesc(ID3D12Device *pDevice,
+                                   const D3D12_ROOT_SIGNATURE_DESC *pDesc,
+                                   ID3D12RootSignature **pRootSig) { // Serializes pDesc as a version 1 root signature and creates the root signature object from the serialized blob.
+    CComPtr<ID3DBlob> signature;
+    CComPtr<ID3DBlob> error; // NOTE(review): filled by the serializer but never read or logged here
+    VERIFY_SUCCEEDED(D3D12SerializeRootSignature(pDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error));
+    VERIFY_SUCCEEDED(pDevice->CreateRootSignature(
+        0, signature->GetBufferPointer(), signature->GetBufferSize(),
+        IID_PPV_ARGS(pRootSig)));
+  }
+
+  void CreateRtvDescriptorHeap(ID3D12Device *pDevice, UINT numDescriptors,
+                               ID3D12DescriptorHeap **pRtvHeap, UINT *rtvDescriptorSize) { // Creates an RTV descriptor heap with numDescriptors entries; optionally reports the RTV handle increment size.
+    D3D12_DESCRIPTOR_HEAP_DESC rtvHeapDesc = {};
+    rtvHeapDesc.NumDescriptors = numDescriptors;
+    rtvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV;
+    rtvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
+    VERIFY_SUCCEEDED(
+        pDevice->CreateDescriptorHeap(&rtvHeapDesc, IID_PPV_ARGS(pRtvHeap)));
+
+    if (rtvDescriptorSize != nullptr) { // the size output is optional
+      *rtvDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(
+          D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
+    }
+  }
+
+  void CreateTestUavs(ID3D12Device *pDevice,
+                      ID3D12GraphicsCommandList *pCommandList, LPCVOID values,
+                      UINT32 valueSizeInBytes, ID3D12Resource **ppUavResource,
+                      ID3D12Resource **ppReadBuffer,
+                      ID3D12Resource **ppUploadResource) { // Creates a UAV buffer, an upload buffer and a readback buffer of valueSizeInBytes each, and records the upload of `values` into the UAV buffer on pCommandList.
+    CComPtr<ID3D12Resource> pUavResource;
+    CComPtr<ID3D12Resource> pReadBuffer;
+    CComPtr<ID3D12Resource> pUploadResource;
+    D3D12_SUBRESOURCE_DATA transferData;
+    D3D12_HEAP_PROPERTIES defaultHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT);
+    D3D12_HEAP_PROPERTIES uploadHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD);
+    D3D12_RESOURCE_DESC bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
+    D3D12_RESOURCE_DESC uploadBufferDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes);
+    CD3DX12_HEAP_PROPERTIES readHeap(D3D12_HEAP_TYPE_READBACK);
+    CD3DX12_RESOURCE_DESC readDesc(CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes));
+
+    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
+      &defaultHeapProperties,
+      D3D12_HEAP_FLAG_NONE,
+      &bufferDesc,
+      D3D12_RESOURCE_STATE_COPY_DEST,
+      nullptr,
+      IID_PPV_ARGS(&pUavResource))); // starts in COPY_DEST so the upload below can write to it
+
+    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
+      &uploadHeapProperties,
+      D3D12_HEAP_FLAG_NONE,
+      &uploadBufferDesc,
+      D3D12_RESOURCE_STATE_GENERIC_READ,
+      nullptr,
+      IID_PPV_ARGS(&pUploadResource)));
+
+    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
+      &readHeap, D3D12_HEAP_FLAG_NONE, &readDesc,
+      D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&pReadBuffer)));
+
+    transferData.pData = values;
+    transferData.RowPitch = valueSizeInBytes; // the data is a single row covering the whole buffer
+    transferData.SlicePitch = transferData.RowPitch;
+
+    UpdateSubresources<1>(pCommandList, pUavResource.p, pUploadResource.p, 0, 0, 1, &transferData);
+    RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+
+    *ppUavResource = pUavResource.Detach(); // the caller receives the references to all three resources
+    *ppReadBuffer = pReadBuffer.Detach();
+    *ppUploadResource = pUploadResource.Detach();
+  }
+
+  template <typename TVertex, int len>
+  void CreateVertexBuffer(ID3D12Device *pDevice, TVertex(&vertices)[len],
+                          ID3D12Resource **ppVertexBuffer,
+                          D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView) { // Copies the vertex array into a new upload-heap buffer and fills pVertexBufferView to describe it.
+    size_t vertexBufferSize = sizeof(vertices); // size of the whole array, since vertices is an array reference
+    CComPtr<ID3D12Resource> pVertexBuffer;
+    CD3DX12_HEAP_PROPERTIES heapProps(D3D12_HEAP_TYPE_UPLOAD);
+    CD3DX12_RESOURCE_DESC bufferDesc(
+        CD3DX12_RESOURCE_DESC::Buffer(vertexBufferSize));
+    VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
+        &heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc,
+        D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+        IID_PPV_ARGS(&pVertexBuffer)));
+
+    UINT8 *pVertexDataBegin;
+    CD3DX12_RANGE readRange(0, 0); // begin == end: an empty read range is passed to Map
+    VERIFY_SUCCEEDED(pVertexBuffer->Map(
+        0, &readRange, reinterpret_cast<void **>(&pVertexDataBegin)));
+    memcpy(pVertexDataBegin, vertices, vertexBufferSize);
+    pVertexBuffer->Unmap(0, nullptr);
+
+    // Initialize the vertex buffer view.
+    pVertexBufferView->BufferLocation = pVertexBuffer->GetGPUVirtualAddress();
+    pVertexBufferView->StrideInBytes = sizeof(TVertex);
+    pVertexBufferView->SizeInBytes = vertexBufferSize;
+
+    *ppVertexBuffer = pVertexBuffer.Detach();
+  }
+
+  // Local stand-ins for the OPTIONS1 feature id and data struct, which otherwise require Anniversary Edition headers.
+  const UINT D3D12_FEATURE_D3D12_OPTIONS1 = 8;
+  struct D3D12_FEATURE_DATA_D3D12_OPTIONS1 {
+    BOOL WaveOps;
+    UINT WaveLaneCountMin;
+    UINT WaveLaneCountMax;
+    UINT TotalLaneCount;
+    BOOL ExpandedComputeResourceStates;
+    BOOL Int64ShaderOps;
+  };
+
+  bool DoesDeviceSupportInt64(ID3D12Device *pDevice) {
+    // Reports the Int64ShaderOps flag of the OPTIONS1 feature data; a failed query counts as unsupported.
+    D3D12_FEATURE_DATA_D3D12_OPTIONS1 options1;
+    const HRESULT queryResult = pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &options1, sizeof(options1));
+    return SUCCEEDED(queryResult) && options1.Int64ShaderOps != FALSE;
+  }
+
+  bool DoesDeviceSupportWaveOps(ID3D12Device *pDevice) {
+    // Reports the WaveOps flag of the OPTIONS1 feature data; a failed query counts as unsupported.
+    D3D12_FEATURE_DATA_D3D12_OPTIONS1 options1;
+    const HRESULT queryResult = pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &options1, sizeof(options1));
+    return SUCCEEDED(queryResult) && options1.WaveOps != FALSE;
+  }
+
+  void DXBCFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob) {
+    // Compiles pText with D3DCompile, predefining USING_DXBC as "1", and stores the
+    // bytecode in *ppBlob. Any compiler output is logged; the HRESULT is verified last.
+    CW2A pEntryPointA(pEntryPoint, CP_UTF8);
+    CW2A pTargetProfileA(pTargetProfile, CP_UTF8);
+    CComPtr<ID3DBlob> pErrors;
+    // { Name, Definition } pairs; the all-null entry terminates the list.
+    D3D_SHADER_MACRO d3dMacro[2] = { { "USING_DXBC", "1" }, { nullptr, nullptr } };
+    HRESULT hr = D3DCompile(pText, strlen(pText), "hlsl.hlsl", d3dMacro, nullptr, pEntryPointA, pTargetProfileA, 0, 0, ppBlob, &pErrors);
+    if (pErrors) {
+      CA2W errors((char *)pErrors->GetBufferPointer(), CP_ACP);
+      LogCommentFmt(L"Compilation failure: %s", errors.m_szBuffer);
+    }
+    VERIFY_SUCCEEDED(hr);
+  }
+
+  HRESULT EnableDebugLayer() {
+    // The debug layer does not yet validate DXIL programs that require rewriting,
+    // but basic logging should work properly.
+    if (!UseDebugIfaces()) {
+      return S_FALSE; // debug interfaces are not in use
+    }
+    CComPtr<ID3D12Debug> pDebug;
+    const HRESULT getResult = D3D12GetDebugInterface(IID_PPV_ARGS(&pDebug));
+    if (FAILED(getResult)) {
+      return getResult;
+    }
+    pDebug->EnableDebugLayer();
+    return S_OK;
+  }
+
+  HRESULT EnableExperimentalMode() {
+    // Turns on experimental shader models at most once per fixture instance.
+    // Returns S_OK when already enabled, S_FALSE when the
+    // "ExperimentalShaders" test parameter is not set, and otherwise the
+    // result of EnableExperimentalShaderModels().
+    if (m_ExperimentalModeEnabled)
+      return S_OK;
+    if (!GetTestParamBool(L"ExperimentalShaders"))
+      return S_FALSE;
+    const HRESULT enableResult = EnableExperimentalShaderModels();
+    m_ExperimentalModeEnabled = SUCCEEDED(enableResult); // remembered so later calls return early
+    return enableResult;
+  }
+
+  struct FenceObj { // Bundles a fence, its wait event and the value handed to the next WaitForSignal; the event handle is closed on destruction.
+    HANDLE m_fenceEvent = NULL;
+    CComPtr<ID3D12Fence> m_fence;
+    UINT64 m_fenceValue = 0; // initialized so it is never read indeterminate before InitFenceObj assigns it
+    ~FenceObj() {
+      if (m_fenceEvent) CloseHandle(m_fenceEvent);
+    }
+  };
+
+  void InitFenceObj(ID3D12Device *pDevice, FenceObj *pObj) { // Prepares pObj: fence value 1, a new fence created with initial value 0, and a new unnamed event.
+    pObj->m_fenceValue = 1;
+    VERIFY_SUCCEEDED(pDevice->CreateFence(0, D3D12_FENCE_FLAG_NONE,
+                                          IID_PPV_ARGS(&pObj->m_fence)));
+    // Create an event handle to use for frame synchronization.
+    pObj->m_fenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
+    if (pObj->m_fenceEvent == nullptr) { // surface the Win32 error as a failed verification
+      VERIFY_SUCCEEDED(HRESULT_FROM_WIN32(GetLastError()));
+    }
+  }
+
+  void ReadHlslDataIntoNewStream(LPCWSTR relativePath, IStream **ppStream) { // Loads the file that GetPathToHlslDataFile resolves for relativePath and returns a read-only stream over its contents.
+    VERIFY_SUCCEEDED(m_support.Initialize());
+    CComPtr<IDxcLibrary> pLibrary;
+    CComPtr<IDxcBlobEncoding> pBlob;
+    CComPtr<IStream> pStream;
+    std::wstring path = GetPathToHlslDataFile(relativePath);
+    VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));
+    VERIFY_SUCCEEDED(pLibrary->CreateBlobFromFile(path.c_str(), nullptr, &pBlob));
+    VERIFY_SUCCEEDED(pLibrary->CreateStreamFromBlobReadOnly(pBlob, &pStream));
+    *ppStream = pStream.Detach(); // the caller receives the stream reference
+  }
+
+  void RecordRenderAndReadback(ID3D12GraphicsCommandList *pList,
+                               ID3D12DescriptorHeap *pRtvHeap,
+                               UINT rtvDescriptorSize,
+                               UINT instanceCount,
+                               D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView,
+                               ID3D12RootSignature *pRootSig,
+                               ID3D12Resource *pRenderTarget,
+                               ID3D12Resource *pReadBuffer) { // Records: clear and draw one triangle (instanceCount instances) into pRenderTarget, then copy the target into pReadBuffer.
+    D3D12_RESOURCE_DESC rtDesc = pRenderTarget->GetDesc();
+    D3D12_VIEWPORT viewport;
+    D3D12_RECT scissorRect;
+
+    memset(&viewport, 0, sizeof(viewport)); // viewport and scissor both cover the full render target
+    viewport.Height = rtDesc.Height;
+    viewport.Width = rtDesc.Width;
+    viewport.MaxDepth = 1.0f;
+    memset(&scissorRect, 0, sizeof(scissorRect));
+    scissorRect.right = rtDesc.Width;
+    scissorRect.bottom = rtDesc.Height;
+    if (pRootSig != nullptr) { // the root signature is optional
+      pList->SetGraphicsRootSignature(pRootSig);
+    }
+    pList->RSSetViewports(1, &viewport);
+    pList->RSSetScissorRects(1, &scissorRect);
+
+    // Indicate that the buffer will be used as a render target.
+    RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_RENDER_TARGET);
+
+    CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle(pRtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, rtvDescriptorSize);
+    pList->OMSetRenderTargets(1, &rtvHandle, FALSE, nullptr);
+
+    pList->ClearRenderTargetView(rtvHandle, ClearColor, 0, nullptr);
+    pList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
+    pList->IASetVertexBuffers(0, 1, pVertexBufferView);
+    pList->DrawInstanced(3, instanceCount, 0, 0);
+
+    // Transition to copy source and copy into read-back buffer.
+    RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_COPY_SOURCE);
+
+    // Copy into read-back buffer.
+    UINT rowPitch = rtDesc.Width * 4; // 4 bytes per R8G8B8A8 pixel, rounded up below to D3D12_TEXTURE_DATA_PITCH_ALIGNMENT
+    if (rowPitch % D3D12_TEXTURE_DATA_PITCH_ALIGNMENT)
+      rowPitch += D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - (rowPitch % D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT Footprint;
+    Footprint.Offset = 0;
+    Footprint.Footprint = CD3DX12_SUBRESOURCE_FOOTPRINT(DXGI_FORMAT_R8G8B8A8_UNORM, rtDesc.Width, rtDesc.Height, 1, rowPitch);
+    CD3DX12_TEXTURE_COPY_LOCATION DstLoc(pReadBuffer, Footprint);
+    CD3DX12_TEXTURE_COPY_LOCATION SrcLoc(pRenderTarget, 0);
+    pList->CopyTextureRegion(&DstLoc, 0, 0, 0, &SrcLoc, nullptr);
+  }
+
+  void RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR shader, std::vector<uint32_t> &values);
+
+  void SetDescriptorHeap(ID3D12GraphicsCommandList *pCommandList, ID3D12DescriptorHeap *pHeap) {
+    // Makes pHeap the single descriptor heap set on pCommandList.
+    pCommandList->SetDescriptorHeaps(1, &pHeap);
+  }
+
+  void WaitForSignal(ID3D12CommandQueue *pCQ, FenceObj &FO) { // Forwards to the free ::WaitForSignal with FO's fence, event and current fence value, then increments the stored value.
+    ::WaitForSignal(pCQ, FO.m_fence, FO.m_fenceEvent, FO.m_fenceValue++);
+  }
+};
+
+const float ExecutionTest::ClearColor[4] = { 0.0f, 0.2f, 0.4f, 1.0f };
+// HLSL stubs for the wave/quad intrinsics, active only when USING_DXBC is defined (DXBCFromText defines it). NOTE(review): the stubs return fixed values, e.g. WaveActiveAllTrue always returns false - confirm intended.
+#define WAVE_INTRINSIC_DXBC_GUARD \
+  "#ifdef USING_DXBC\r\n" \
+  "uint WaveGetLaneIndex() { return 1; }\r\n" \
+  "uint WaveReadLaneFirst(uint u) { return u; }\r\n" \
+  "bool WaveIsFirstLane() { return true; }\r\n" \
+  "uint WaveGetLaneCount() { return 1; }\r\n" \
+  "uint WaveReadLaneAt(uint n, uint u) { return u; }\r\n" \
+  "bool WaveActiveAnyTrue(bool b) { return b; }\r\n" \
+  "bool WaveActiveAllTrue(bool b) { return false; }\r\n" \
+  "uint WaveActiveAllEqual(uint u) { return u; }\r\n" \
+  "uint4 WaveActiveBallot(bool b) { return 1; }\r\n" \
+  "uint WaveActiveCountBits(uint u) { return 1; }\r\n" \
+  "uint WaveActiveSum(uint u) { return 1; }\r\n" \
+  "uint WaveActiveProduct(uint u) { return 1; }\r\n" \
+  "uint WaveActiveBitAnd(uint u) { return 1; }\r\n" \
+  "uint WaveActiveBitOr(uint u) { return 1; }\r\n" \
+  "uint WaveActiveBitXor(uint u) { return 1; }\r\n" \
+  "uint WaveActiveMin(uint u) { return 1; }\r\n" \
+  "uint WaveActiveMax(uint u) { return 1; }\r\n" \
+  "uint WavePrefixCountBits(uint u) { return 1; }\r\n" \
+  "uint WavePrefixSum(uint u) { return 1; }\r\n" \
+  "uint WavePrefixProduct(uint u) { return 1; }\r\n" \
+  "uint QuadReadLaneAt(uint a, uint u) { return 1; }\r\n" \
+  "uint QuadReadAcrossX(uint u) { return 1; }\r\n" \
+  "uint QuadReadAcrossY(uint u) { return 1; }\r\n" \
+  "uint QuadReadAcrossDiagonal(uint u) { return 1; }\r\n" \
+  "#endif\r\n"
+
+
+static void SetupComputeValuePattern(std::vector<uint32_t> &values, size_t count) {
+  // Resizes values to count elements and stores each element's own index in
+  // it, i.e. 0, 1, ..., count - 1 (converted to uint32_t).
+  values.resize(count);
+  for (size_t slot = 0; slot != count; ++slot) values[slot] = static_cast<uint32_t>(slot);
+}
+
+bool ExecutionTest::ExecutionTestClassSetup() {
+  // Class-level setup: tries to enable experimental shader models and the
+  // D3D12 debug layer and logs the outcome of each. Failures are only
+  // logged; setup always reports success.
+  const HRESULT experimentalResult = EnableExperimentalMode();
+  if (FAILED(experimentalResult)) {
+    LogCommentFmt(L"Unable to enable shader experimental mode - 0x%08x.", experimentalResult);
+  } else if (experimentalResult == S_FALSE) {
+    LogCommentFmt(L"Experimental mode not enabled.");
+  } else {
+    LogCommentFmt(L"Experimental mode enabled.");
+  }
+  const HRESULT debugLayerResult = EnableDebugLayer();
+  if (SUCCEEDED(debugLayerResult)) {
+    LogCommentFmt(L"Debug layer enabled.");
+  } else {
+    LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", debugLayerResult);
+  }
+  return true;
+}
+
+// Runs compute shader `pShader` in a single 1x1x1 dispatch with `values` bound as a raw
+// (byte-address) UAV at register u0, then copies the buffer contents back into `values`.
+void ExecutionTest::RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR pShader, std::vector<uint32_t> &values) {
+  static const int DispatchGroupX = 1;
+  static const int DispatchGroupY = 1;
+  static const int DispatchGroupZ = 1;
+
+  CComPtr<ID3D12GraphicsCommandList> pCommandList;
+  CComPtr<ID3D12CommandQueue> pCommandQueue;
+  CComPtr<ID3D12DescriptorHeap> pUavHeap;
+  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
+  FenceObj FO;
+
+  const size_t valueSizeInBytes = values.size() * sizeof(uint32_t);
+  CreateComputeCommandQueue(pDevice, L"RunRWByteBufferComputeTest Command Queue", &pCommandQueue);
+  InitFenceObj(pDevice, &FO);
+
+  // Describe and create a UAV descriptor heap.
+  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
+  heapDesc.NumDescriptors = 1;
+  heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+  VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
+
+  // Create root signature.
+  CComPtr<ID3D12RootSignature> pRootSignature;
+  {
+    CD3DX12_DESCRIPTOR_RANGE ranges[1];
+    ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);
+
+    CD3DX12_ROOT_PARAMETER rootParameters[1];
+    rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
+
+    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
+    rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
+
+    CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
+  }
+
+  // Create pipeline state object.
+  CComPtr<ID3D12PipelineState> pComputeState;
+  CreateComputePSO(pDevice, pRootSignature, pShader, &pComputeState);
+
+  // Create a command allocator and list for compute.
+  VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
+  VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
+  VERIFY_SUCCEEDED(pCommandList->SetName(L"ExecutionTest::RunRWByteBufferComputeTest Command List"));
+
+  // Set up UAV resource.
+  CComPtr<ID3D12Resource> pUavResource;
+  CComPtr<ID3D12Resource> pReadBuffer;
+  CComPtr<ID3D12Resource> pUploadResource;
+  CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pReadBuffer, &pUploadResource);
+  VERIFY_SUCCEEDED(pUavResource->SetName(L"RunRWByteBufferComputeTest UAV"));
+  VERIFY_SUCCEEDED(pReadBuffer->SetName(L"RunRWByteBufferComputeTest UAV Read Buffer"));
+  VERIFY_SUCCEEDED(pUploadResource->SetName(L"RunRWByteBufferComputeTest UAV Upload Buffer"));
+
+  // Close the command list and execute it to perform the GPU setup.
+  VERIFY_SUCCEEDED(pCommandList->Close()); // recording errors surface here
+  ExecuteCommandList(pCommandQueue, pCommandList);
+  WaitForSignal(pCommandQueue, FO);
+  VERIFY_SUCCEEDED(pCommandAllocator->Reset());
+  VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
+
+  // Run the compute shader and copy the results back to readable memory.
+  {
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+    uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
+    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+    uavDesc.Buffer.FirstElement = 0;
+    uavDesc.Buffer.NumElements = static_cast<UINT>(values.size()); // one 32-bit element per value
+    uavDesc.Buffer.StructureByteStride = 0;
+    uavDesc.Buffer.CounterOffsetInBytes = 0;
+    uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
+    CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
+    CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
+    pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
+    SetDescriptorHeap(pCommandList, pUavHeap);
+    pCommandList->SetComputeRootSignature(pRootSignature);
+    pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
+  }
+  pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
+  RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
+  pCommandList->CopyResource(pReadBuffer, pUavResource);
+  VERIFY_SUCCEEDED(pCommandList->Close());
+  ExecuteCommandList(pCommandQueue, pCommandList);
+  WaitForSignal(pCommandQueue, FO);
+  {
+    // Copy the read-back buffer over the caller's vector (same byte size as the upload).
+    MappedData mappedData(pReadBuffer, valueSizeInBytes);
+    uint32_t *pData = (uint32_t *)mappedData.data();
+    memcpy(values.data(), pData, valueSizeInBytes);
+  }
+  WaitForSignal(pCommandQueue, FO);
+}
+
+TEST_F(ExecutionTest, BasicComputeTest) {
+  //
+  // BasicComputeTest is a simple compute shader that can be used as the basis
+  // for more interesting compute execution tests: each of the 64 threads loads
+  // one uint from the buffer, adds one to it and stores it back.
+  // The HLSL is compatible with shader models <=5.1 to allow using the DXBC
+  // rendering code paths for comparison.
+  static const char pShader[] =
+    "RWByteAddressBuffer g_bab : register(u0);\r\n"
+    "[numthreads(8,8,1)]\r\n"
+    "void main(uint GI : SV_GroupIndex) {"
+    "  uint addr = GI * 4;\r\n"
+    "  uint val = g_bab.Load(addr);\r\n"
+    "  DeviceMemoryBarrierWithGroupSync();\r\n"
+    "  g_bab.Store(addr, val + 1);\r\n"
+    "}";
+  static const int NumThreadsX = 8; // thread-group dimensions; keep in sync with [numthreads] in pShader
+  static const int NumThreadsY = 8;
+  static const int NumThreadsZ = 1;
+  static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
+  static const int DispatchGroupCount = 1;
+
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+    return;
+
+  std::vector<uint32_t> values;
+  SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);
+  VERIFY_ARE_EQUAL(values[0], 0); // the input pattern starts at 0
+  RunRWByteBufferComputeTest(pDevice, pShader, values);
+  VERIFY_ARE_EQUAL(values[0], 1); // thread 0 stored 0 + 1; only element 0 is checked
+}
+
+TEST_F(ExecutionTest, BasicTriangleTest) { // Draws one triangle and checks two pixels of the read-back target.
+  static const UINT FrameCount = 2; // descriptor count passed to CreateRtvDescriptorHeap below
+  static const UINT m_width = 320;
+  static const UINT m_height = 200;
+  static const float m_aspectRatio = static_cast<float>(m_width) / static_cast<float>(m_height);
+
+  struct Vertex {
+    XMFLOAT3 position;
+    XMFLOAT4 color;
+  };
+
+  // Pipeline objects.
+  CComPtr<ID3D12Device> pDevice;
+  CComPtr<ID3D12Resource> pRenderTarget;
+  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
+  CComPtr<ID3D12CommandQueue> pCommandQueue;
+  CComPtr<ID3D12RootSignature> pRootSig;
+  CComPtr<ID3D12DescriptorHeap> pRtvHeap;
+  CComPtr<ID3D12PipelineState> pPipelineState;
+  CComPtr<ID3D12GraphicsCommandList> pCommandList;
+  CComPtr<ID3D12Resource> pReadBuffer;
+  UINT rtvDescriptorSize;
+
+  CComPtr<ID3D12Resource> pVertexBuffer;
+  D3D12_VERTEX_BUFFER_VIEW vertexBufferView;
+
+  // Synchronization objects.
+  FenceObj FO;
+
+  // Shaders. The pixel shader ignores the interpolated color and returns 1 (white).
+  static const char pShaders[] =
+    "struct PSInput {\r\n"
+    "  float4 position : SV_POSITION;\r\n"
+    "  float4 color : COLOR;\r\n"
+    "};\r\n\r\n"
+    "PSInput VSMain(float4 position : POSITION, float4 color : COLOR) {\r\n"
+    "  PSInput result;\r\n"
+    "\r\n"
+    "  result.position = position;\r\n"
+    "  result.color = color;\r\n"
+    "  return result;\r\n"
+    "}\r\n\r\n"
+    "float4 PSMain(PSInput input) : SV_TARGET {\r\n"
+    "  return 1; //input.color;\r\n"
+    "};\r\n";
+
+  if (!CreateDevice(&pDevice))
+    return;
+
+  struct BasicTestChecker { // Scope guard: unless SetOK(true) is called, the destructor reports info-queue messages.
+    CComPtr<ID3D12Device> m_pDevice;
+    CComPtr<ID3D12InfoQueue> m_pInfoQueue;
+    bool m_OK = false;
+    void SetOK(bool value) { m_OK = value; }
+    BasicTestChecker(ID3D12Device *pDevice) : m_pDevice(pDevice) {
+      if (FAILED(m_pDevice.QueryInterface(&m_pInfoQueue)))
+        return; // no info queue available; the destructor then does nothing
+      m_pInfoQueue->PushEmptyStorageFilter();
+      m_pInfoQueue->PushEmptyRetrievalFilter();
+    }
+    ~BasicTestChecker() {
+      if (!m_OK && m_pInfoQueue != nullptr) {
+        UINT64 count = m_pInfoQueue->GetNumStoredMessages();
+        bool invalidBytecodeFound = false;
+        CAtlArray<BYTE> m_pBytes; // scratch buffer reused for every message
+        for (UINT64 i = 0; i < count; ++i) {
+          SIZE_T len = 0;
+          if (FAILED(m_pInfoQueue->GetMessageA(i, nullptr, &len))) // first call only queries the length
+            continue;
+          if (m_pBytes.GetCount() < len && !m_pBytes.SetCount(len)) // grow the buffer when too small
+            continue;
+          D3D12_MESSAGE *pMsg = (D3D12_MESSAGE *)m_pBytes.GetData();
+          if (FAILED(m_pInfoQueue->GetMessageA(i, pMsg, &len)))
+            continue;
+          if (pMsg->ID == D3D12_MESSAGE_ID_CREATEVERTEXSHADER_INVALIDSHADERBYTECODE ||
+              pMsg->ID == D3D12_MESSAGE_ID_CREATEPIXELSHADER_INVALIDSHADERBYTECODE) {
+            invalidBytecodeFound = true;
+            break;
+          }
+        }
+        if (invalidBytecodeFound) {
+          LogCommentFmt(L"%s", L"Found an invalid bytecode message. This "
+            L"typically indicates that experimental mode "
+            L"is not set up properly.");
+          if (!GetTestParamBool(L"ExperimentalShaders")) {
+            LogCommentFmt(L"Note that the ExperimentalShaders test parameter isn't set.");
+          }
+        }
+        else {
+          LogCommentFmt(L"Did not find corrupt pixel or vertex shaders in "
+                        L"queue - dumping complete queue.");
+          WriteInfoQueueMessages(nullptr, OutputFn, m_pInfoQueue);
+        }
+      }
+    }
+    static void __stdcall OutputFn(void *pCtx, const wchar_t *pMsg) { // callback handed to WriteInfoQueueMessages
+      LogCommentFmt(L"%s", pMsg);
+    }
+  };
+  BasicTestChecker BTC(pDevice);
+  {
+    InitFenceObj(pDevice, &FO);
+    CreateRtvDescriptorHeap(pDevice, FrameCount, &pRtvHeap, &rtvDescriptorSize);
+    CreateRenderTargetAndReadback(pDevice, pRtvHeap, m_width, m_height, &pRenderTarget, &pReadBuffer);
+
+    // Create an empty root signature.
+    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
+    rootSignatureDesc.Init(
+      0, nullptr, 0, nullptr,
+      D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);
+    CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSig);
+
+    // Create the pipeline state, which includes compiling and loading shaders.
+    // Define the vertex input layout.
+    D3D12_INPUT_ELEMENT_DESC inputElementDescs[] = {
+        {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0,
+         D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
+        {"COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12,
+         D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}};
+    D3D12_INPUT_LAYOUT_DESC InputLayout = { inputElementDescs, _countof(inputElementDescs) };
+    CreateGraphicsPSO(pDevice, &InputLayout, pRootSig, pShaders, &pPipelineState);
+
+    CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue,
+                                      &pCommandAllocator, &pCommandList,
+                                      pPipelineState);
+
+    // Define the geometry for a triangle.
+    Vertex triangleVertices[] = {
+      { { 0.0f, 0.25f * m_aspectRatio, 0.0f },{ 1.0f, 0.0f, 0.0f, 1.0f } },
+      { { 0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 1.0f, 0.0f, 1.0f } },
+      { { -0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 0.0f, 1.0f, 1.0f } } };
+
+    CreateVertexBuffer(pDevice, triangleVertices, &pVertexBuffer, &vertexBufferView);
+    WaitForSignal(pCommandQueue, FO);
+  }
+
+  // Render and execute the command list.
+  RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, 1,
+                          &vertexBufferView, pRootSig, pRenderTarget,
+                          pReadBuffer);
+  VERIFY_SUCCEEDED(pCommandList->Close());
+  ExecuteCommandList(pCommandQueue, pCommandList);
+
+  // Wait for previous frame.
+  WaitForSignal(pCommandQueue, FO);
+
+  // At this point, we've verified that execution succeeded with DXIL.
+  BTC.SetOK(true);
+
+  // Read back to CPU and examine contents.
+  {
+    MappedData data(pReadBuffer, m_width * m_height * 4); // 4 bytes per pixel (R8G8B8A8)
+    const uint32_t *pPixels = (uint32_t *)data.data();
+    if (SaveImages()) {
+      SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, m_width, m_height, L"basic.bmp");
+    }
+    uint32_t top = pPixels[m_width / 2]; // Top center.
+    uint32_t mid = pPixels[m_width / 2 + m_width * (m_height / 2)]; // Middle center.
+    VERIFY_ARE_EQUAL(0xff663300, top); // clear color
+    VERIFY_ARE_EQUAL(0xffffffff, mid); // white
+  }
+}
+
+TEST_F(ExecutionTest, Int64Test) { // Squares each buffer value as uint64_t and stores the upper 32 bits.
+  static const char pShader[] =
+    "RWByteAddressBuffer g_bab : register(u0);\r\n"
+    "[numthreads(8,8,1)]\r\n"
+    "void main(uint GI : SV_GroupIndex) {"
+    "  uint addr = GI * 4;\r\n"
+    "  uint val = g_bab.Load(addr);\r\n"
+    "  uint64_t u64 = val;\r\n"
+    "  u64 *= val;\r\n"
+    "  g_bab.Store(addr, (uint)(u64 >> 32));\r\n"
+    "}";
+  static const int NumThreadsX = 8; // thread-group dimensions; keep in sync with [numthreads] in pShader
+  static const int NumThreadsY = 8;
+  static const int NumThreadsZ = 1;
+  static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
+  static const int DispatchGroupCount = 1;
+
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+    return;
+
+  if (!DoesDeviceSupportInt64(pDevice)) {
+    // Optional feature, so it's correct to not support it if declared as such.
+    WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
+    return;
+  }
+  std::vector<uint32_t> values;
+  SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);
+  VERIFY_ARE_EQUAL(values[0], 0); // the input pattern starts at 0
+  RunRWByteBufferComputeTest(pDevice, pShader, values);
+  VERIFY_ARE_EQUAL(values[0], 0); // (0 * 0) >> 32 == 0; only element 0 is checked
+}
+
+TEST_F(ExecutionTest, SignTest) { // Replaces each int in the buffer with sign(value) and checks all 8 results.
+  static const char pShader[] =
+    "RWByteAddressBuffer g_bab : register(u0);\r\n"
+    "[numthreads(8,1,1)]\r\n"
+    "void main(uint GI : SV_GroupIndex) {"
+    "  uint addr = GI * 4;\r\n"
+    "  int val = g_bab.Load(addr);\r\n"
+    "  g_bab.Store(addr, (uint)(sign(val)));\r\n"
+    "}";
+  static const int NumThreadsX = 8; // thread-group dimensions; keep in sync with [numthreads] in pShader
+  static const int NumThreadsY = 1;
+  static const int NumThreadsZ = 1;
+  static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
+  static const int DispatchGroupCount = 1;
+
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+    return;
+
+  std::vector<uint32_t> values = { (uint32_t)-3, (uint32_t)-2, (uint32_t)-1, 0, 1, 2, 3, 4};
+  RunRWByteBufferComputeTest(pDevice, pShader, values);
+  VERIFY_ARE_EQUAL(values[0], -1); // sign(-3)
+  VERIFY_ARE_EQUAL(values[1], -1); // sign(-2)
+  VERIFY_ARE_EQUAL(values[2], -1); // sign(-1)
+  VERIFY_ARE_EQUAL(values[3], 0); // sign(0)
+  VERIFY_ARE_EQUAL(values[4], 1); // sign(1)
+  VERIFY_ARE_EQUAL(values[5], 1); // sign(2)
+  VERIFY_ARE_EQUAL(values[6], 1); // sign(3)
+  VERIFY_ARE_EQUAL(values[7], 1); // sign(4)
+}
+
+// Runs the wave intrinsics on a single 8x8x1 thread group and checks the results against
+// a CPU reference computed per wave from the per-thread data that is read back.
+TEST_F(ExecutionTest, WaveIntrinsicsTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  struct PerThreadData { // field order mirrors the HLSL PerThreadData declared in pShader
+    uint32_t id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;
+    uint32_t allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;
+    uint32_t pfBC, pfSum, pfProd;
+    uint32_t ballot[4];
+    uint32_t diver;   // divergent value, used in calculation
+    int32_t i_diver;  // divergent value, used in calculation
+    int32_t i_allMax, i_allMin, i_allSum, i_allProd;
+    int32_t i_pfSum, i_pfProd;
+  };
+  static const char pShader[] =
+    WAVE_INTRINSIC_DXBC_GUARD
+    "struct PerThreadData {\r\n"
+    " uint id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;\r\n"
+    " uint allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;\r\n"
+    " uint pfBC, pfSum, pfProd;\r\n"
+    " uint4 ballot;\r\n"
+    " uint diver;\r\n"
+    " int i_diver;\r\n"
+    " int i_allMax, i_allMin, i_allSum, i_allProd;\r\n"
+    " int i_pfSum, i_pfProd;\r\n"
+    "};\r\n"
+    "RWStructuredBuffer<PerThreadData> g_sb : register(u0);\r\n"
+    "[numthreads(8,8,1)]\r\n"
+    "void main(uint GI : SV_GroupIndex, uint3 GTID : SV_GroupThreadID) {"
+    "  PerThreadData pts = g_sb[GI];\r\n"
+    "  uint diver = GTID.x + 2;\r\n"
+    "  pts.diver = diver;\r\n"
+    "  pts.flags = 0;\r\n"
+    "  pts.preds = 0;\r\n"
+    "  if (WaveIsFirstLane()) pts.flags |= 1;\r\n"
+    "  pts.laneIndex = WaveGetLaneIndex();\r\n"
+    "  pts.laneCount = WaveGetLaneCount();\r\n"
+    "  pts.firstLaneId = WaveReadLaneFirst(pts.id);\r\n"
+    "  pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);\r\n"
+    "  pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);\r\n"
+    "  pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);\r\n"
+    "  pts.preds |= ((WaveActiveAllEqual(GTID.z) ? 1 : 0) << 3);\r\n"
+    "  pts.preds |= ((WaveActiveAllEqual(WaveReadLaneFirst(diver)) ? 1 : 0) << 4);\r\n"
+    "  pts.ballot = WaveActiveBallot(diver > 3);\r\n"
+    "  pts.firstlaneX = WaveReadLaneFirst(GTID.x);\r\n"
+    "  pts.lane1X = WaveReadLaneAt(GTID.x, 1);\r\n"
+    "\r\n"
+    "  pts.allBC = WaveActiveCountBits(diver > 3);\r\n"
+    "  pts.allSum = WaveActiveSum(diver);\r\n"
+    "  pts.allProd = WaveActiveProduct(diver);\r\n"
+    "  pts.allAND = WaveActiveBitAnd(diver);\r\n"
+    "  pts.allOR = WaveActiveBitOr(diver);\r\n"
+    "  pts.allXOR = WaveActiveBitXor(diver);\r\n"
+    "  pts.allMin = WaveActiveMin(diver);\r\n"
+    "  pts.allMax = WaveActiveMax(diver);\r\n"
+    "\r\n"
+    "  pts.pfBC = WavePrefixCountBits(diver > 3);\r\n"
+    "  pts.pfSum = WavePrefixSum(diver);\r\n"
+    "  pts.pfProd = WavePrefixProduct(diver);\r\n"
+    "\r\n"
+    "  int i_diver = pts.i_diver;\r\n"
+    "  pts.i_allMax = WaveActiveMax(i_diver);\r\n"
+    "  pts.i_allMin = WaveActiveMin(i_diver);\r\n"
+    "  pts.i_allSum = WaveActiveSum(i_diver);\r\n"
+    "  pts.i_allProd = WaveActiveProduct(i_diver);\r\n"
+    "  pts.i_pfSum = WavePrefixSum(i_diver);\r\n"
+    "  pts.i_pfProd = WavePrefixProduct(i_diver);\r\n"
+    "\r\n"
+    "  g_sb[GI] = pts;\r\n"
+    "}";
+  static const int NumThreadsX = 8; // thread-group dimensions; keep in sync with [numthreads] in pShader
+  static const int NumThreadsY = 8;
+  static const int NumThreadsZ = 1;
+  static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
+  static const int DispatchGroupCount = 1;
+
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+    return;
+
+  if (!DoesDeviceSupportWaveOps(pDevice)) {
+    // Optional feature, so it's correct to not support it if declared as such.
+    WEX::Logging::Log::Comment(L"Device does not support wave operations.");
+    return;
+  }
+
+  std::vector<PerThreadData> values;
+  values.resize(ThreadsPerGroup * DispatchGroupCount);
+  for (size_t i = 0; i < values.size(); ++i) {
+    memset(&values[i], 0, sizeof(PerThreadData));
+    values[i].id = static_cast<uint32_t>(i);
+    values[i].i_diver = (int)i;
+    values[i].i_diver *= (i % 2) ? 1 : -1; // odd indices stay positive, even ones are negated
+  }
+
+  static const int DispatchGroupX = 1;
+  static const int DispatchGroupY = 1;
+  static const int DispatchGroupZ = 1;
+
+  CComPtr<ID3D12GraphicsCommandList> pCommandList;
+  CComPtr<ID3D12CommandQueue> pCommandQueue;
+  CComPtr<ID3D12DescriptorHeap> pUavHeap;
+  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
+  FenceObj FO;
+  bool dxbc = UseDxbc();
+
+  const size_t valueSizeInBytes = values.size() * sizeof(PerThreadData);
+  CreateComputeCommandQueue(pDevice, L"WaveIntrinsicsTest Command Queue", &pCommandQueue);
+  InitFenceObj(pDevice, &FO);
+
+  // Describe and create a UAV descriptor heap.
+  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
+  heapDesc.NumDescriptors = 1;
+  heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+  VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
+
+  // Create root signature.
+  CComPtr<ID3D12RootSignature> pRootSignature;
+  {
+    CD3DX12_DESCRIPTOR_RANGE ranges[1];
+    ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);
+
+    CD3DX12_ROOT_PARAMETER rootParameters[1];
+    rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
+
+    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
+    rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
+
+    CComPtr<ID3DBlob> signature;
+    CComPtr<ID3DBlob> error;
+    VERIFY_SUCCEEDED(D3D12SerializeRootSignature(&rootSignatureDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error));
+    VERIFY_SUCCEEDED(pDevice->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(), IID_PPV_ARGS(&pRootSignature)));
+  }
+
+  // Create pipeline state object.
+  CComPtr<ID3D12PipelineState> pComputeState;
+  CreateComputePSO(pDevice, pRootSignature, pShader, &pComputeState);
+
+  // Create a command allocator and list for compute.
+  VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
+  VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
+
+  // Set up UAV resource.
+  CComPtr<ID3D12Resource> pUavResource;
+  CComPtr<ID3D12Resource> pReadBuffer;
+  CComPtr<ID3D12Resource> pUploadResource;
+  CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pReadBuffer, &pUploadResource);
+
+  // Close the command list and execute it to perform the GPU setup.
+  VERIFY_SUCCEEDED(pCommandList->Close()); // recording errors surface here
+  ExecuteCommandList(pCommandQueue, pCommandList);
+  WaitForSignal(pCommandQueue, FO);
+  VERIFY_SUCCEEDED(pCommandAllocator->Reset());
+  VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
+
+  // Run the compute shader and copy the results back to readable memory.
+  {
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+    uavDesc.Format = DXGI_FORMAT_UNKNOWN;
+    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+    uavDesc.Buffer.FirstElement = 0;
+    uavDesc.Buffer.NumElements = static_cast<UINT>(values.size()); // one structured element per thread
+    uavDesc.Buffer.StructureByteStride = sizeof(PerThreadData);
+    uavDesc.Buffer.CounterOffsetInBytes = 0;
+    uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
+    CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
+    CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
+    pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
+    SetDescriptorHeap(pCommandList, pUavHeap);
+    pCommandList->SetComputeRootSignature(pRootSignature);
+    pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
+  }
+  pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
+  RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
+  pCommandList->CopyResource(pReadBuffer, pUavResource);
+  VERIFY_SUCCEEDED(pCommandList->Close());
+  ExecuteCommandList(pCommandQueue, pCommandList);
+  WaitForSignal(pCommandQueue, FO);
+  {
+    MappedData mappedData(pReadBuffer, valueSizeInBytes);
+    PerThreadData *pData = (PerThreadData *)mappedData.data();
+    memcpy(values.data(), pData, valueSizeInBytes);
+
+    // Gather some general data.
+    // The 'firstLaneId' captures a unique number per first-lane per wave.
+    // Counting the number distinct firstLaneIds gives us the number of waves.
+    std::vector<uint32_t> firstLaneIds;
+    for (size_t i = 0; i < values.size(); ++i) {
+      PerThreadData &pts = values[i];
+      uint32_t firstLaneId = pts.firstLaneId;
+      if (!contains(firstLaneIds, firstLaneId)) {
+        firstLaneIds.push_back(firstLaneId);
+      }
+    }
+
+    // Waves should cover 4 threads or more.
+    LogCommentFmt(L"Found %u distinct lane ids", static_cast<unsigned>(firstLaneIds.size()));
+    if (!dxbc) {
+      VERIFY_IS_GREATER_THAN_OR_EQUAL(values.size() / 4, firstLaneIds.size());
+    }
+
+    // Now, group threads into waves.
+    std::map<uint32_t, std::unique_ptr<std::vector<PerThreadData *> > > waves;
+    for (size_t i = 0; i < firstLaneIds.size(); ++i) {
+      waves[firstLaneIds[i]] = std::make_unique<std::vector<PerThreadData *> >();
+    }
+    for (size_t i = 0; i < values.size(); ++i) {
+      PerThreadData &pts = values[i];
+      std::unique_ptr<std::vector<PerThreadData *> > &wave = waves[pts.firstLaneId];
+      wave->push_back(&pts);
+    }
+
+    // Verify that all the wave values are coherent across the wave.
+    // Each wave is visited once; it holds every thread that reported its firstLaneId.
+    for (auto &waveEntry : waves) {
+      std::unique_ptr<std::vector<PerThreadData *> > &wave = waveEntry.second;
+      // Sort the lanes by increasing lane ID.
+      struct LaneIdOrderPred {
+        bool operator()(PerThreadData *a, PerThreadData *b) {
+          return a->laneIndex < b->laneIndex;
+        }
+      };
+      std::sort(wave.get()->begin(), wave.get()->end(), LaneIdOrderPred());
+
+      // Verify some interesting properties of the first lane.
+      uint32_t pfBC, pfSum, pfProd;
+      int32_t i_pfSum, i_pfProd;
+      int32_t i_allMax, i_allMin;
+      {
+        PerThreadData *ptdFirst = wave->front();
+        VERIFY_IS_TRUE(0 != (ptdFirst->flags & 1)); // FirstLane sets this bit.
+        VERIFY_IS_TRUE(0 == ptdFirst->pfBC);
+        VERIFY_IS_TRUE(0 == ptdFirst->pfSum);
+        VERIFY_IS_TRUE(1 == ptdFirst->pfProd);
+        VERIFY_IS_TRUE(0 == ptdFirst->i_pfSum);
+        VERIFY_IS_TRUE(1 == ptdFirst->i_pfProd);
+        pfBC = (ptdFirst->diver > 3) ? 1 : 0;
+        pfSum = ptdFirst->diver;
+        pfProd = ptdFirst->diver;
+        i_pfSum = ptdFirst->i_diver;
+        i_pfProd = ptdFirst->i_diver;
+        i_allMax = i_allMin = ptdFirst->i_diver;
+      }
+
+      // Calculate values which take into consideration all lanes.
+      uint32_t preds = 0;
+      preds |= 1 << 1; // AllTrue starts true, switches to false if needed.
+      preds |= 1 << 2; // AllEqual starts true, switches to false if needed.
+      preds |= 1 << 3; // WaveActiveAllEqual(GTID.z) is always true
+      preds |= 1 << 4; // WaveActiveAllEqual(WaveReadLaneFirst(diver)) is always true
+      uint32_t ballot[4] = { 0, 0, 0, 0 };
+      int32_t i_allSum = 0, i_allProd = 1;
+      for (size_t n = 0; n < wave->size(); ++n) {
+        std::vector<PerThreadData *> &lanes = *wave.get();
+        // pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);
+        if (lanes[n]->diver == 1) preds |= (1 << 0);
+        // pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);
+        if (lanes[n]->diver != 1) preds &= ~(1 << 1);
+        // pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);
+        if (lanes[0]->diver != lanes[n]->diver) preds &= ~(1 << 2);
+        // pts.ballot = WaveActiveBallot(diver > 3);
+        if (lanes[n]->diver > 3) {
+          // This is the uint4 result layout:
+          // .x -> bits  0 .. 31
+          // .y -> bits 32 .. 63
+          // .z -> bits 64 .. 95
+          // .w -> bits 96 ..127
+          uint32_t component = lanes[n]->laneIndex / 32;
+          uint32_t bit = lanes[n]->laneIndex % 32;
+          ballot[component] |= 1u << bit; // unsigned literal: bit 31 must not shift into a sign bit
+        }
+        i_allMax = std::max(lanes[n]->i_diver, i_allMax);
+        i_allMin = std::min(lanes[n]->i_diver, i_allMin);
+        i_allProd *= lanes[n]->i_diver;
+        i_allSum += lanes[n]->i_diver;
+      }
+
+      for (size_t n = 1; n < wave->size(); ++n) {
+        // 'All' operations are uniform across the wave.
+        std::vector<PerThreadData *> &lanes = *wave.get();
+        VERIFY_IS_TRUE(0 == (lanes[n]->flags & 1)); // non-firstlanes do not set this bit
+        VERIFY_ARE_EQUAL(lanes[0]->allBC, lanes[n]->allBC);
+        VERIFY_ARE_EQUAL(lanes[0]->allSum, lanes[n]->allSum);
+        VERIFY_ARE_EQUAL(lanes[0]->allProd, lanes[n]->allProd);
+        VERIFY_ARE_EQUAL(lanes[0]->allAND, lanes[n]->allAND);
+        VERIFY_ARE_EQUAL(lanes[0]->allOR, lanes[n]->allOR);
+        VERIFY_ARE_EQUAL(lanes[0]->allXOR, lanes[n]->allXOR);
+        VERIFY_ARE_EQUAL(lanes[0]->allMin, lanes[n]->allMin);
+        VERIFY_ARE_EQUAL(lanes[0]->allMax, lanes[n]->allMax);
+        VERIFY_ARE_EQUAL(i_allMax, lanes[n]->i_allMax);
+        VERIFY_ARE_EQUAL(i_allMin, lanes[n]->i_allMin);
+        VERIFY_ARE_EQUAL(i_allProd, lanes[n]->i_allProd);
+        VERIFY_ARE_EQUAL(i_allSum, lanes[n]->i_allSum);
+
+        // first-lane reads and uniform reads are uniform across the wave.
+        VERIFY_ARE_EQUAL(lanes[0]->firstlaneX, lanes[n]->firstlaneX);
+        VERIFY_ARE_EQUAL(lanes[0]->lane1X, lanes[n]->lane1X);
+
+        // the lane count is uniform across the wave.
+        VERIFY_ARE_EQUAL(lanes[0]->laneCount, lanes[n]->laneCount);
+
+        // The predicates are uniform across the wave.
+        VERIFY_ARE_EQUAL(lanes[n]->preds, preds);
+
+        // the lane index is distinct per thread.
+        for (size_t prior = 0; prior < n; ++prior) {
+          VERIFY_ARE_NOT_EQUAL(lanes[prior]->laneIndex, lanes[n]->laneIndex);
+        }
+        // Ballot results are uniform across the wave.
+        VERIFY_ARE_EQUAL(0, memcmp(ballot, lanes[n]->ballot, sizeof(ballot)));
+
+        // Keep running total of prefix calculation. Prefix values are exclusive to
+        // the executing lane.
+        VERIFY_ARE_EQUAL(pfBC, lanes[n]->pfBC);
+        VERIFY_ARE_EQUAL(pfSum, lanes[n]->pfSum);
+        VERIFY_ARE_EQUAL(pfProd, lanes[n]->pfProd);
+        VERIFY_ARE_EQUAL(i_pfSum, lanes[n]->i_pfSum);
+        VERIFY_ARE_EQUAL(i_pfProd, lanes[n]->i_pfProd);
+        pfBC += (lanes[n]->diver > 3) ? 1 : 0;
+        pfSum += lanes[n]->diver;
+        pfProd *= lanes[n]->diver;
+        i_pfSum += lanes[n]->i_diver;
+        i_pfProd *= lanes[n]->i_diver;
+      }
+      // TODO: add divergent branching and verify that the otherwise uniform values properly diverge
+    }
+
+    // Compare each value of each per-thread element.
+    for (size_t i = 0; i < values.size(); ++i) {
+      PerThreadData &pts = values[i];
+      VERIFY_ARE_EQUAL(i, pts.id); // ID is unchanged.
+    }
+  }
+}
+
+TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  struct Vertex {
+    XMFLOAT3 position;
+  };
+
+  struct PerPixelData {
+    XMFLOAT4 position;
+    uint32_t id, flags, laneIndex, laneCount, firstLaneId, sum1;
+    uint32_t id0, id1, id2, id3;
+    uint32_t acrossX, acrossY, acrossDiag, quadActiveCount;
+  };
+
+  const UINT RTWidth = 128;
+  const UINT RTHeight = 128;
+
+  // Shaders.
+  static const char pShaders[] =
+    WAVE_INTRINSIC_DXBC_GUARD
+    "struct PSInput {\r\n"
+    "  float4 position : SV_POSITION;\r\n"
+    "};\r\n\r\n"
+    "PSInput VSMain(float4 position : POSITION) {\r\n"
+    "  PSInput result;\r\n"
+    "\r\n"
+    "  result.position = position;\r\n"
+    "  return result;\r\n"
+    "}\r\n\r\n"
+    "typedef uint uint32_t;\r\n"
+    "uint pos_to_id(float4 pos) { return pos.x * 128 + pos.y; }\r\n"
+    "struct PerPixelData {\r\n"
+    " float4 position;\r\n"
+    " uint32_t id, flags, laneIndex, laneCount, firstLaneId, sum1;\r\n"
+    " uint32_t id0, id1, id2, id3;\r\n"
+    " uint32_t acrossX, acrossY, acrossDiag, quadActiveCount;\r\n"
+    "};\r\n"
+    "AppendStructuredBuffer<PerPixelData> g_sb : register(u1);\r\n"
+    "float4 PSMain(PSInput input) : SV_TARGET {\r\n"
+    "  uint one = 1;\r\n"
+    "  PerPixelData d;\r\n"
+    "  d.position = input.position;\r\n"
+    "  d.id = pos_to_id(input.position);\r\n"
+    "  d.flags = 0;\r\n"
+    "  if (WaveIsFirstLane()) d.flags |= 1;\r\n"
+    "  d.laneIndex = WaveGetLaneIndex();\r\n"
+    "  d.laneCount = WaveGetLaneCount();\r\n"
+    "  d.firstLaneId = WaveReadLaneFirst(d.id);\r\n"
+    "  d.sum1 = WaveActiveSum(one);\r\n"
+    "  d.id0 = QuadReadLaneAt(d.id, 0);\r\n"
+    "  d.id1 = QuadReadLaneAt(d.id, 1);\r\n"
+    "  d.id2 = QuadReadLaneAt(d.id, 2);\r\n"
+    "  d.id3 = QuadReadLaneAt(d.id, 3);\r\n"
+    "  d.acrossX = QuadReadAcrossX(d.id);\r\n"
+    "  d.acrossY = QuadReadAcrossY(d.id);\r\n"
+    "  d.acrossDiag = QuadReadAcrossDiagonal(d.id);\r\n"
+    "  d.quadActiveCount = one + QuadReadAcrossX(one) + QuadReadAcrossY(one) + QuadReadAcrossDiagonal(one);\r\n"
+    "  g_sb.Append(d);\r\n"
+    "  return 1;\r\n"
+    "};\r\n";
+
+  CComPtr<ID3D12Device> pDevice;
+  CComPtr<ID3D12CommandQueue> pCommandQueue;
+  CComPtr<ID3D12DescriptorHeap> pUavHeap, pRtvHeap;
+  CComPtr<ID3D12CommandAllocator> pCommandAllocator;
+  CComPtr<ID3D12GraphicsCommandList> pCommandList;
+  CComPtr<ID3D12PipelineState> pPSO;
+  CComPtr<ID3D12Resource> pRenderTarget, pReadBuffer;
+  UINT uavDescriptorSize, rtvDescriptorSize;
+  CComPtr<ID3D12Resource> pVertexBuffer;
+  D3D12_VERTEX_BUFFER_VIEW vertexBufferView;
+
+  if (!CreateDevice(&pDevice))
+    return;
+  if (!DoesDeviceSupportWaveOps(pDevice)) {
+    // Optional feature, so it's correct to not support it if declared as such.
+    WEX::Logging::Log::Comment(L"Device does not support wave operations.");
+    return;
+  }
+
+  FenceObj FO;
+  InitFenceObj(pDevice, &FO);
+
+  // Describe and create a UAV descriptor heap.
+  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
+  heapDesc.NumDescriptors = 1;
+  heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+  VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
+  uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);
+
+  CreateRtvDescriptorHeap(pDevice, 1, &pRtvHeap, &rtvDescriptorSize);
+  CreateRenderTargetAndReadback(pDevice, pRtvHeap, RTHeight, RTWidth, &pRenderTarget, &pReadBuffer);
+
+  // Create root signature: one UAV.
+  CComPtr<ID3D12RootSignature> pRootSignature;
+  {
+    CD3DX12_DESCRIPTOR_RANGE ranges[1];
+    ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1, 0, 0);
+
+    CD3DX12_ROOT_PARAMETER rootParameters[1];
+    rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
+
+    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
+    rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);
+
+    CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
+  }
+
+  D3D12_INPUT_ELEMENT_DESC elementDesc[] = {
+      {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0,
+       D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}};
+  D3D12_INPUT_LAYOUT_DESC InputLayout = {elementDesc, _countof(elementDesc)};
+  CreateGraphicsPSO(pDevice, &InputLayout, pRootSignature, pShaders, &pPSO);
+
+  CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue, &pCommandAllocator,
+                                    &pCommandList, pPSO);
+
+  // Single triangle covering half the target.
+  Vertex vertices[] = {
+    { { -1.0f,  1.0f, 0.0f } },
+    { {  1.0f,  1.0f, 0.0f } },
+    { { -1.0f, -1.0f, 0.0f } } };
+  const UINT TriangleCount = _countof(vertices) / 3;
+
+  CreateVertexBuffer(pDevice, vertices, &pVertexBuffer, &vertexBufferView);
+
+  bool dxbc = UseDxbc();
+
+  // Set up UAV resource.
+  std::vector<PerPixelData> values;
+  values.resize(RTWidth * RTHeight * 2);
+  UINT valueSizeInBytes = values.size() * sizeof(PerPixelData);
+  memset(values.data(), 0, valueSizeInBytes);
+  CComPtr<ID3D12Resource> pUavResource;
+  CComPtr<ID3D12Resource> pUavReadBuffer;
+  CComPtr<ID3D12Resource> pUploadResource;
+  CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pUavReadBuffer, &pUploadResource);
+
+  // Set up the append counter resource.
+  CComPtr<ID3D12Resource> pUavCounterResource;
+  CComPtr<ID3D12Resource> pReadCounterBuffer;
+  CComPtr<ID3D12Resource> pUploadCounterResource;
+  BYTE zero[sizeof(UINT)] = { 0 };
+  CreateTestUavs(pDevice, pCommandList, zero, sizeof(zero), &pUavCounterResource, &pReadCounterBuffer, &pUploadCounterResource);
+
+  // Close the command list and execute it to perform the GPU setup.
+  pCommandList->Close();
+  ExecuteCommandList(pCommandQueue, pCommandList);
+  WaitForSignal(pCommandQueue, FO);
+  VERIFY_SUCCEEDED(pCommandAllocator->Reset());
+  VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pPSO));
+
+  pCommandList->SetGraphicsRootSignature(pRootSignature);
+  SetDescriptorHeap(pCommandList, pUavHeap);
+  {
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+    uavDesc.Format = DXGI_FORMAT_UNKNOWN;
+    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+    uavDesc.Buffer.FirstElement = 0;
+    uavDesc.Buffer.NumElements = values.size();
+    uavDesc.Buffer.StructureByteStride = sizeof(PerPixelData);
+    uavDesc.Buffer.CounterOffsetInBytes = 0;
+    uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
+    CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
+    CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
+    pDevice->CreateUnorderedAccessView(pUavResource, pUavCounterResource, &uavDesc, uavHandle);
+    pCommandList->SetGraphicsRootDescriptorTable(0, uavHandleGpu);
+  }
+  RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, TriangleCount, &vertexBufferView, nullptr, pRenderTarget, pReadBuffer);
+  RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
+  RecordTransitionBarrier(pCommandList, pUavCounterResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
+  pCommandList->CopyResource(pUavReadBuffer, pUavResource);
+  pCommandList->CopyResource(pReadCounterBuffer, pUavCounterResource);
+  VERIFY_SUCCEEDED(pCommandList->Close());
+  LogCommentFmt(L"Rendering to %u by %u", RTWidth, RTHeight);
+  ExecuteCommandList(pCommandQueue, pCommandList);
+  WaitForSignal(pCommandQueue, FO);
+  {
+    MappedData data(pReadBuffer, RTWidth * RTHeight * 4);
+    const uint32_t *pPixels = (uint32_t *)data.data();
+    if (SaveImages()) {
+      SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, RTWidth, RTHeight, L"psintrin.bmp");
+    }
+  }
+
+  uint32_t appendCount;
+  {
+    MappedData mappedData(pReadCounterBuffer, sizeof(uint32_t));
+    appendCount = *((uint32_t *)mappedData.data());
+    LogCommentFmt(L"%u elements in append buffer", appendCount);
+  }
+
+  {
+    MappedData mappedData(pUavReadBuffer, values.size());
+    PerPixelData *pData = (PerPixelData *)mappedData.data();
+    memcpy(values.data(), pData, valueSizeInBytes);
+
+    // DXBC is handy to test pipeline setup, but interesting functions are
+    // stubbed out, so there is no point in further validation.
+    if (dxbc)
+      return;
+
+    uint32_t maxActiveLaneCount = 0;
+    uint32_t maxLaneCount = 0;
+    for (uint32_t i = 0; i < appendCount; ++i) {
+      maxActiveLaneCount = std::max(maxActiveLaneCount, values[i].sum1);
+      maxLaneCount = std::max(maxLaneCount, values[i].laneCount);
+    }
+
+    uint32_t peerOfHelperLanes = 0;
+    for (uint32_t i = 0; i < appendCount; ++i) {
+      if (values[i].sum1 != maxActiveLaneCount) {
+        ++peerOfHelperLanes;
+      }
+    }
+
+    LogCommentFmt(
+        L"Found: %u threads. Waves reported up to %u total lanes, up "
+        L"to %u active lanes, and %u threads had helper/inactive lanes.",
+        appendCount, maxLaneCount, maxActiveLaneCount, peerOfHelperLanes);
+
+    // Group threads into quad invocations.
+    uint32_t singlePixelCount = 0;
+    uint32_t multiPixelCount = 0;
+    std::unordered_set<uint32_t> ids;
+    std::multimap<uint32_t, PerPixelData *> idGroups;
+    std::multimap<uint32_t, PerPixelData *> firstIdGroups;
+    for (uint32_t i = 0; i < appendCount; ++i) {
+      ids.insert(values[i].id);
+      idGroups.insert(std::make_pair(values[i].id, &values[i]));
+      firstIdGroups.insert(std::make_pair(values[i].firstLaneId, &values[i]));
+    }
+    for (uint32_t id : ids) {
+      if (idGroups.count(id) == 1)
+        ++singlePixelCount;
+      else
+        ++multiPixelCount;
+    }
+    LogCommentFmt(L"%u pixels were processed by a single thread. %u invocations were for shared pixels.",
+      singlePixelCount, multiPixelCount);
+
+    // Multiple threads may have tried to shade the same pixel.
+    // Where every pixel is distinct, it's very straightforward to validate.
+    {
+      auto cur = firstIdGroups.begin(), end = firstIdGroups.end();
+      while (cur != end) {
+        bool simpleWave = true;
+        uint32_t firstId = (*cur).first;
+        auto groupEnd = cur;
+        while (groupEnd != end && (*groupEnd).first == firstId) {
+          if (idGroups.count((*groupEnd).second->id) > 1)
+            simpleWave = false;
+          ++groupEnd;
+        }
+        if (simpleWave) {
+          // Break the wave into quads.
+          struct QuadData {
+            unsigned count;
+            PerPixelData *data[4];
+          };
+          std::map<uint32_t, QuadData> quads;
+          for (auto i = cur; i != groupEnd; ++i) {
+            uint32_t quadId = (*i).second->id0;
+            auto match = quads.find(quadId);
+            if (match == quads.end()) {
+              QuadData qdata;
+              qdata.count = 1;
+              qdata.data[0] = (*i).second;
+              quads.insert(std::make_pair(quadId, qdata));
+            }
+            else {
+              VERIFY_IS_TRUE((*match).second.count < 4);
+              (*match).second.data[(*match).second.count++] = (*i).second;
+            }
+          }
+          for (auto quadPair : quads) {
+            unsigned count = quadPair.second.count;
+            if (count < 2) continue;
+            PerPixelData **data = quadPair.second.data;
+            bool isTop[4];
+            bool isLeft[4];
+            PerPixelData helperData;
+            memset(&helperData, sizeof(helperData), 0);
+            PerPixelData *layout[4]; // tl,tr,bl,br
+            memset(layout, sizeof(layout), 0);
+            auto fnToLayout = [&](bool top, bool left) -> PerPixelData ** {
+              int idx = top ? 0 : 2;
+              idx += left ? 0 : 1;
+              return &layout[idx];
+            };
+            auto fnToLayoutData = [&](bool top, bool left) -> PerPixelData * {
+              PerPixelData **pResult = fnToLayout(top, left);
+              if (*pResult == nullptr) return &helperData;
+              return *pResult;
+            };
+            VERIFY_IS_TRUE(count <= 4);
+            if (count == 2) {
+              isTop[0] = data[0]->position.y < data[1]->position.y;
+              isTop[1] = (data[0]->position.y == data[1]->position.y) ? isTop[0] : !isTop[0];
+              isLeft[0] = data[0]->position.x < data[1]->position.x;
+              isLeft[1] = (data[0]->position.x == data[1]->position.x) ? isLeft[0] : !isLeft[0];
+            }
+            else {
+              // with at least three samples, we have distinct x and y coordinates.
+              float left = std::min(data[0]->position.x, data[1]->position.x);
+              left = std::min(data[2]->position.x, left);
+              float top = std::min(data[0]->position.y, data[1]->position.y);
+              top = std::min(data[2]->position.y, top);
+              for (unsigned i = 0; i < count; ++i) {
+                isTop[i] = data[i]->position.y == top;
+                isLeft[i] = data[i]->position.x == left;
+              }
+            }
+            for (unsigned i = 0; i < count; ++i) {
+              *(fnToLayout(isTop[i], isLeft[i])) = data[i];
+            }
+
+            // Finally, we have a proper quad reconstructed. Validate.
+            for (unsigned i = 0; i < count; ++i) {
+              PerPixelData *d = data[i];
+              VERIFY_ARE_EQUAL(d->id0, fnToLayoutData(true, true)->id);
+              VERIFY_ARE_EQUAL(d->id1, fnToLayoutData(true, false)->id);
+              VERIFY_ARE_EQUAL(d->id2, fnToLayoutData(false, true)->id);
+              VERIFY_ARE_EQUAL(d->id3, fnToLayoutData(false, false)->id);
+              VERIFY_ARE_EQUAL(d->acrossX, fnToLayoutData(isTop[i], !isLeft[i])->id);
+              VERIFY_ARE_EQUAL(d->acrossY, fnToLayoutData(!isTop[i], isLeft[i])->id);
+              VERIFY_ARE_EQUAL(d->acrossDiag, fnToLayoutData(!isTop[i], !isLeft[i])->id);
+              VERIFY_ARE_EQUAL(d->quadActiveCount, count);
+            }
+          }
+        }
+        cur = groupEnd;
+      }
+    }
+
+    // TODO: provide validation for quads where the same pixel was shaded multiple times
+    //
+    // Consider: for pixels that were shaded multiple times, check whether
+    // some grouping of threads into quads satisfies all value requirements.
+  }
+}
+
+struct ShaderOpTestResult { // Bundle returned by RunShaderOpTest / RunShaderOpTestAfterParse.
+  st::ShaderOp *ShaderOp; // Non-owning; the op that was run, taken from ShaderOpSet.
+  std::shared_ptr<st::ShaderOpSet> ShaderOpSet; // Parsed op set the op came from.
+  std::shared_ptr<st::ShaderOpTest> Test; // Test object that ran the op; tests read results back through it.
+};
+
+struct SPrimitives { // Four-float record; not referenced in the nearby code.
+  float f_float;
+  float f_float2;
+  float f_float_o;  // NOTE(review): the `_o` suffix presumably marks an output value - confirm.
+  float f_float2_o;
+};
+
+// Runs one op from an already parsed ShaderOpSet on pDevice and returns the op,
+// the set that owns it and the executed test. A null pName selects the set's
+// only op. pStream is only checked for null here; it is not read.
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+  IStream *pStream, LPCSTR pName,
+  st::ShaderOpTest::TInitCallbackFn pInitCallback, std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
+  DXASSERT_NOMSG(pStream != nullptr);
+  st::ShaderOp *pShaderOp = nullptr;
+  if (pName == nullptr) {
+    if (ShaderOpSet->ShaderOps.size() != 1) {
+      VERIFY_FAIL(L"Expected a single shader operation.");
+    }
+    pShaderOp = ShaderOpSet->ShaderOps.empty() ? nullptr : ShaderOpSet->ShaderOps[0].get();
+  } else {
+    pShaderOp = ShaderOpSet->GetShaderOp(pName);
+  }
+  if (pShaderOp == nullptr) {
+    std::string msg = "Unable to find shader op ";
+    msg += pName ? pName : "[unnamed]"; // appending a null char* is undefined
+    msg += "; available ops";
+    for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
+      msg += ':';
+      msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
+    }
+    CA2W msgWide(msg.c_str());
+    VERIFY_FAIL(msgWide.m_psz);
+  }
+
+  // This won't actually be used since we're supplying the device,
+  // but let's make it consistent.
+  pShaderOp->UseWarpDevice = GetTestParamUseWARP(true);
+
+  std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
+  test->SetDxcSupport(&support);
+  test->SetInitCallback(pInitCallback);
+  test->SetDevice(pDevice);
+  test->RunShaderOp(pShaderOp);
+
+  auto result = std::make_shared<ShaderOpTestResult>();
+  result->ShaderOpSet = ShaderOpSet;
+  result->Test = test;
+  result->ShaderOp = pShaderOp;
+  return result;
+}
+
+// Parses a ShaderOpSet from pStream, then runs the op named pName on pDevice.
+std::shared_ptr<ShaderOpTestResult>
+RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
+                IStream *pStream, LPCSTR pName,
+                st::ShaderOpTest::TInitCallbackFn pInitCallback) {
+  DXASSERT_NOMSG(pStream != nullptr);
+  auto parsedOps = std::make_shared<st::ShaderOpSet>();
+  st::ParseShaderOpSetFromStream(pStream, parsedOps.get());
+  return RunShaderOpTestAfterParse(pDevice, support, pStream, pName, pInitCallback, parsedOps);
+}
+
+// Runs the "OOB" shader op and checks that the first render target pixel is pure red.
+TEST_F(ExecutionTest, OutOfBoundsTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> pStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+  // Single operation test at the moment.
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+    return;
+
+  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr);
+  // Read back to CPU and examine contents - should get pure red.
+  {
+    MappedData data;
+    test->Test->GetReadBackData("RTarget", &data);
+    const uint32_t *pPixels = (uint32_t *)data.data();
+    uint32_t first = *pPixels;
+    VERIFY_ARE_EQUAL(0xff0000ff, first); // pure red - only first component is read
+  }
+}
+
+// Runs the "Saturate" shader op and compares the nine floats read back from "U0".
+TEST_F(ExecutionTest, SaturateTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> pStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+  // Single operation test at the moment.
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+    return;
+
+  auto opResult = RunShaderOpTest(pDevice, m_support, pStream, "Saturate", nullptr);
+  MappedData readback;
+  opResult->Test->GetReadBackData("U0", &readback);
+  const float *pValues = (float *)readback.data();
+  // Everything is zero except for 1.5f and +Inf, which saturate to 1.0f
+  const float ExpectedCases[9] = {
+    0.0f, 0.0f, 0.0f, 0.0f, // -inf, -1.5, -denorm, -0
+    0.0f, 0.0f, 1.0f, 1.0f, // 0, denorm, 1.5f, inf
+    0.0f                    // nan
+  };
+  for (size_t i = 0; i < _countof(ExpectedCases); ++i, ++pValues) {
+    VERIFY_ARE_EQUAL(*pValues, ExpectedCases[i]);
+  }
+}
+
+// Runs the "Triangle" shader op and checks the top-center and middle pixels of "RTarget".
+TEST_F(ExecutionTest, BasicTriangleOpTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> pStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+  // Single operation test at the moment.
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+    return;
+
+  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "Triangle", nullptr);
+  MappedData data;
+  D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
+  UINT width = static_cast<UINT>(D.Width); // Desc.Width is 64-bit; pixel indexing below uses UINT
+  UINT height = static_cast<UINT>(D.Height);
+  test->Test->GetReadBackData("RTarget", &data);
+  const uint32_t *pPixels = (uint32_t *)data.data();
+  if (SaveImages()) // dump with the queried size so the image matches the resource
+    SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, width, height, L"basic.bmp");
+  uint32_t top = pPixels[width / 2]; // Top center.
+  uint32_t mid = pPixels[width / 2 + width * (height / 2)]; // Middle center.
+  VERIFY_ARE_EQUAL(0xff663300, top); // clear color
+  VERIFY_ARE_EQUAL(0xffffffff, mid); // white
+
+  // This is the basic validation test for shader operations, so it's good to
+  // check this here at least for this one test case.
+  data.reset();
+  test.reset();
+  ReportLiveObjects();
+}
+
+// Runs the "DerivFine" op (two right triangles forming a square, one texture value
+// per pixel) and checks the fine and coarse derivatives stored for the center pixel.
+TEST_F(ExecutionTest, PartialDerivTest) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> pStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice))
+      return;
+
+  std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "DerivFine", nullptr);
+  MappedData data;
+  D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
+  UINT width = (UINT64)D.Width;
+  UINT height = (UINT64)D.Height;
+  UINT pixelSize = GetByteSizeForFormat(D.Format) / 4; // floats per pixel (bytes / 4)
+
+  test->Test->GetReadBackData("RTarget", &data);
+  const float *pPixels = (float *)data.data();
+
+  UINT centerIndex = (UINT64)width * height / 2 - width / 2;
+
+  // Center pixel: its four floats are ddx_fine, ddy_fine, ddx_coarse, ddy_coarse.
+  UINT offsetCenter = centerIndex * pixelSize;
+  float CenterDDXFine = pPixels[offsetCenter];
+  float CenterDDYFine = pPixels[offsetCenter + 1];
+  float CenterDDXCoarse = pPixels[offsetCenter + 2];
+  float CenterDDYCoarse = pPixels[offsetCenter + 3];
+
+  LogCommentFmt(
+      L"center  ddx_fine: %8f, ddy_fine: %8f, ddx_coarse: %8f, ddy_coarse: %8f",
+      CenterDDXFine, CenterDDYFine, CenterDDXCoarse, CenterDDYCoarse);
+
+  // The texture for the 9 pixels in the center should look like the following
+
+  // 256   32  64
+  // 2048 256 512
+  // 1   .125 .25
+
+  // In D3D12 there is no guarantee of how the adapter is grouping 2x2 pixels
+  // So for fine derivatives there can be up to two possible results for the center pixel,
+  // while for coarse derivatives there can be up to six possible results.
+  int ulpTolerance = 1;
+  // 256 - 2048 (quad extends left of the center) or 512 - 256 (quad extends right)
+  bool left = CompareFloatULP(CenterDDXFine, -1792.0f, ulpTolerance);
+  VERIFY_IS_TRUE(left || CompareFloatULP(CenterDDXFine, 256.0f, ulpTolerance));
+  // 256 - 32 (quad extends above the center) or .125 - 256 (quad extends below)
+  bool top = CompareFloatULP(CenterDDYFine, 224.0f, ulpTolerance);
+  VERIFY_IS_TRUE(top || CompareFloatULP(CenterDDYFine, -255.875, ulpTolerance));
+
+  if (top && left) { // top left quad
+    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -224.0f, ulpTolerance) ||
+                   CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance)) &&
+                   (CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
+                   CompareFloatULP(CenterDDYCoarse, 1792.0f, ulpTolerance)));
+  }
+  else if (top) { // top right quad
+    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance)  ||
+                   CompareFloatULP(CenterDDXCoarse, 32.0f, ulpTolerance))   &&
+                   (CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
+                   CompareFloatULP(CenterDDYCoarse, 448.0f, ulpTolerance)));
+  }
+  else if (left) { // bottom left quad
+    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance) ||
+                   CompareFloatULP(CenterDDXCoarse, -.875f, ulpTolerance))   &&
+                   (CompareFloatULP(CenterDDYCoarse, -2047.0f, ulpTolerance) ||
+                   CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance)));
+  }
+  else { // bottom right quad
+    VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance) ||
+                   CompareFloatULP(CenterDDXCoarse, .125f, ulpTolerance))  &&
+                   (CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance) ||
+                   CompareFloatULP(CenterDDYCoarse, -511.75f, ulpTolerance)));
+  }
+}
+
+// Resource structure for data-driven tests.
+
+struct SUnaryFPOp { // one float input paired with one float output
+    float input;
+    float output; // NOTE(review): presumably written by the shader under test - confirm
+};
+
+struct SBinaryFPOp { // two float inputs and two float outputs
+    float input1;
+    float input2;
+    float output1;
+    float output2;
+};
+
+struct STertiaryFPOp { // three float inputs and a single float output
+    float input1;
+    float input2;
+    float input3;
+    float output;
+};
+
+struct SUnaryIntOp { // signed-int counterpart of SUnaryFPOp
+    int input;
+    int output;
+};
+
+struct SUnaryUintOp { // unsigned-int counterpart of SUnaryFPOp
+    unsigned int input;
+    unsigned int output;
+};
+
+struct SBinaryIntOp { // two signed-int inputs and two signed-int outputs
+    int input1;
+    int input2;
+    int output1;
+    int output2;
+};
+
+struct STertiaryIntOp { // three signed-int inputs and a single signed-int output
+    int input1;
+    int input2;
+    int input3;
+    int output;
+};
+
+struct SBinaryUintOp { // two unsigned-int inputs and two unsigned-int outputs
+    unsigned int input1;
+    unsigned int input2;
+    unsigned int output1;
+    unsigned int output2;
+};
+
+struct STertiaryUintOp { // three unsigned-int inputs and a single unsigned-int output
+    unsigned int input1;
+    unsigned int input2;
+    unsigned int input3;
+    unsigned int output;
+};
+
+// Two four-component float vectors (the host-side form of HLSL float vectors) plus three scalar results.
+struct SDotOp {
+    XMFLOAT4 input1;
+    XMFLOAT4 input2;
+    float o_dot2; // NOTE(review): presumably the dot product over the first 2 components - confirm
+    float o_dot3; // NOTE(review): presumably over the first 3 components - confirm
+    float o_dot4; // NOTE(review): presumably over all 4 components - confirm
+};
+
+struct SMsad4 { // reference / source / accumulator / result record; cf. the Msad4OpParameters columns
+    unsigned int ref;
+    XMUINT2 src;
+    XMUINT4 accum;
+    XMUINT4 result;
+};
+
+// One named parameter of a TAEF data-driven test: its name, declared type, whether it is required, and value slots.
+struct TableParameter {
+    LPCWSTR m_name; // lookup key; TableParameterHandler compares it case-insensitively
+    enum TableParameterType { // scalar kinds first, then the matching *_TABLE (array) kinds
+        INT,
+        UINT,
+        DOUBLE,
+        STRING,
+        BOOL,
+        INT_TABLE,
+        DOUBLE_TABLE,
+        STRING_TABLE,
+        UINT_TABLE,
+        BOOL_TABLE
+    };
+    TableParameterType m_type; // NOTE(review): presumably selects which value member below is meaningful - confirm
+    bool m_required; // required parameter
+    int m_int; // scalar value slots, one per scalar type
+    unsigned int m_uint;
+    double m_double;
+    bool m_bool;
+    WEX::Common::String m_str;
+    WEX::TestExecution::TestDataArray<int> m_intTable; // array value slots, one per *_TABLE type
+    WEX::TestExecution::TestDataArray<unsigned int> m_uintTable;
+    WEX::TestExecution::TestDataArray<double> m_doubleTable;
+    WEX::TestExecution::TestDataArray<bool> m_boolTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> m_StringTable;
+};
+
+// Name-based access to an array of TableParameter entries. The array is
+// borrowed from the caller; nothing in this class allocates or frees it.
+// Names are compared case-insensitively.
+class TableParameterHandler {
+public:
+  TableParameter* m_table; // first entry of the borrowed array
+  size_t m_tableSize;      // number of entries in m_table
+
+  // Stores the pointer and entry count as given; no copy is made.
+  TableParameterHandler(TableParameter *pTable, size_t size) : m_table(pTable), m_tableSize(size) {}
+
+  // Returns the entry whose m_name matches `name` ignoring case. If there is
+  // none, asserts and returns nullptr.
+  TableParameter* GetTableParamByName(LPCWSTR name) {
+    for (size_t i = 0; i < m_tableSize; ++i) {
+      if (_wcsicmp(name, m_table[i].m_name) == 0) {
+        return &m_table[i];
+      }
+    }
+    // `name` is a wide string: %ls prints it, %s would read it as narrow text.
+    DXASSERT(false, "Invalid Table Parameter Name %ls", name);
+    return nullptr;
+  }
+
+  // Resets the scalar and string value of every entry. The table-typed
+  // members (m_intTable and friends) are not touched.
+  void clearTableParameter() {
+    for (size_t i = 0; i < m_tableSize; ++i) {
+      m_table[i].m_int = 0;
+      m_table[i].m_uint = 0;
+      m_table[i].m_double = 0;
+      m_table[i].m_bool = false;
+      m_table[i].m_str = WEX::Common::String();
+    }
+  }
+
+  // Typed access to an entry's data array. Only the int, unsigned int, double
+  // and bool specializations below look anything up; every other element
+  // type gets nullptr.
+  template <class T1>
+  WEX::TestExecution::TestDataArray<T1> *GetDataArray(LPCWSTR name) {
+    return nullptr;
+  }
+
+  // Returns m_intTable of the entry named `name`; an unknown name asserts
+  // (inside GetTableParamByName) and yields nullptr.
+  template <>
+  WEX::TestExecution::TestDataArray<int> *GetDataArray(LPCWSTR name) {
+    TableParameter *pParam = GetTableParamByName(name);
+    return pParam ? &pParam->m_intTable : nullptr;
+  }
+
+  // Returns m_uintTable of the entry named `name`; an unknown name asserts
+  // (inside GetTableParamByName) and yields nullptr.
+  template <>
+  WEX::TestExecution::TestDataArray<unsigned int> *GetDataArray(LPCWSTR name) {
+    TableParameter *pParam = GetTableParamByName(name);
+    return pParam ? &pParam->m_uintTable : nullptr;
+  }
+
+  // Returns m_doubleTable of the entry named `name`; an unknown name asserts
+  // (inside GetTableParamByName) and yields nullptr.
+  template <>
+  WEX::TestExecution::TestDataArray<double> *GetDataArray(LPCWSTR name) {
+    TableParameter *pParam = GetTableParamByName(name);
+    return pParam ? &pParam->m_doubleTable : nullptr;
+  }
+
+  // Returns m_boolTable of the entry named `name`; an unknown name asserts
+  // (inside GetTableParamByName) and yields nullptr.
+  template <>
+  WEX::TestExecution::TestDataArray<bool> *GetDataArray(LPCWSTR name) {
+    TableParameter *pParam = GetTableParamByName(name);
+    return pParam ? &pParam->m_boolTable : nullptr;
+  }
+};
+
+static TableParameter UnaryFPOpParameters[] = { // rows are {m_name, m_type, m_required}
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input", TableParameter::STRING_TABLE, true }, // float cases are declared as string tables
+    { L"Validation.Expected", TableParameter::STRING_TABLE, true },
+    { L"Validation.Type", TableParameter::STRING, true },
+    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
+    { L"Validation.NumInput", TableParameter::UINT, true },
+    { L"Warp.Version", TableParameter::UINT, false } // the only optional row
+};
+
+static TableParameter BinaryFPOpParameters[] = { // rows are {m_name, m_type, m_required}; all required
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input1", TableParameter::STRING_TABLE, true }, // float cases are declared as string tables
+    { L"Validation.Input2", TableParameter::STRING_TABLE, true },
+    { L"Validation.Expected1", TableParameter::STRING_TABLE, true },
+    { L"Validation.Expected2", TableParameter::STRING_TABLE, true },
+    { L"Validation.Type", TableParameter::STRING, true },
+    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
+    { L"Validation.NumInput", TableParameter::UINT, true }
+};
+
+static TableParameter TertiaryFPOpParameters[] = { // rows are {m_name, m_type, m_required}; all required
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input1", TableParameter::STRING_TABLE, true }, // float cases are declared as string tables
+    { L"Validation.Input2", TableParameter::STRING_TABLE, true },
+    { L"Validation.Input3", TableParameter::STRING_TABLE, true },
+    { L"Validation.Expected", TableParameter::STRING_TABLE, true },
+    { L"Validation.Type", TableParameter::STRING, true },
+    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
+    { L"Validation.NumInput", TableParameter::UINT, true }
+};
+
+static TableParameter UnaryIntOpParameters[] = { // rows are {m_name, m_type, m_required}; all required
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input", TableParameter::INT_TABLE, true },
+    { L"Validation.Expected", TableParameter::INT_TABLE, true },
+    { L"Validation.Tolerance", TableParameter::INT, true }, // integer tolerance, unlike the DOUBLE used by the FP tables
+    { L"Validation.NumInput", TableParameter::UINT, true }
+};
+
+static TableParameter UnaryUintOpParameters[] = { // rows are {m_name, m_type, m_required}; all required
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input", TableParameter::UINT_TABLE, true },
+    { L"Validation.Expected", TableParameter::UINT_TABLE, true },
+    { L"Validation.Tolerance", TableParameter::INT, true }, // declared INT even though the data is unsigned
+    { L"Validation.NumInput", TableParameter::UINT, true }
+};
+
+static TableParameter BinaryIntOpParameters[] = { // rows are {m_name, m_type, m_required}
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input1", TableParameter::INT_TABLE, true },
+    { L"Validation.Input2", TableParameter::INT_TABLE, true },
+    { L"Validation.Expected1", TableParameter::INT_TABLE, true },
+    { L"Validation.Expected2", TableParameter::INT_TABLE, false }, // optional second expected table
+    { L"Validation.Tolerance", TableParameter::INT, true },
+    { L"Validation.NumInput", TableParameter::UINT, true },
+    { L"Validation.NumExpected", TableParameter::INT, true } // note: INT here, while NumInput is UINT
+};
+
+static TableParameter TertiaryIntOpParameters[] = { // rows are {m_name, m_type, m_required}; all required
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input1", TableParameter::INT_TABLE, true },
+    { L"Validation.Input2", TableParameter::INT_TABLE, true },
+    { L"Validation.Input3", TableParameter::INT_TABLE, true },
+    { L"Validation.Expected", TableParameter::INT_TABLE, true },
+    { L"Validation.Tolerance", TableParameter::INT, true },
+    { L"Validation.NumInput", TableParameter::UINT, true }
+};
+
+static TableParameter BinaryUintOpParameters[] = { // rows are {m_name, m_type, m_required}
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input1", TableParameter::UINT_TABLE, true },
+    { L"Validation.Input2", TableParameter::UINT_TABLE, true },
+    { L"Validation.Expected1", TableParameter::UINT_TABLE, true },
+    { L"Validation.Expected2", TableParameter::UINT_TABLE, false }, // optional second expected table
+    { L"Validation.Tolerance", TableParameter::INT, true },
+
+    { L"Validation.NumInput", TableParameter::UINT, true },
+    { L"Validation.NumExpected", TableParameter::INT, true }, // note: INT here, while NumInput is UINT
+};
+
+static TableParameter TertiaryUintOpParameters[] = { // rows are {m_name, m_type, m_required}; all required
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input1", TableParameter::UINT_TABLE, true },
+    { L"Validation.Input2", TableParameter::UINT_TABLE, true },
+    { L"Validation.Input3", TableParameter::UINT_TABLE, true },
+    { L"Validation.Expected", TableParameter::UINT_TABLE, true },
+    { L"Validation.Tolerance", TableParameter::INT, true },
+    { L"Validation.NumInput", TableParameter::UINT, true }
+};
+
+static TableParameter DotOpParameters[] = { // rows are {m_name, m_type, m_required}; all required
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Target", TableParameter::STRING, true },
+    { L"ShaderOp.EntryPoint", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Input1", TableParameter::STRING_TABLE, true }, // vector inputs are declared as string tables
+    { L"Validation.Input2", TableParameter::STRING_TABLE, true },
+    { L"Validation.dot2", TableParameter::STRING_TABLE, true }, // names line up with SDotOp::o_dot2/3/4
+    { L"Validation.dot3", TableParameter::STRING_TABLE, true },
+    { L"Validation.dot4", TableParameter::STRING_TABLE, true },
+    { L"Validation.Type", TableParameter::STRING, true },
+    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
+    { L"Validation.NumInput", TableParameter::UINT, true }
+};
+
+// Data-table columns declared for the Msad4Op parameter set. Unlike the
+// neighbouring tables, only ShaderOp.Text is declared from the ShaderOp.*
+// group. Validation.Reference is an unsigned table while Source, Accum and
+// Expected are string tables.
+// Each entry is { column name, value type, flag }; the flag is presumably
+// TableParameter::m_required (see ParseTableRow) -- confirm against the
+// TableParameter definition.
+static TableParameter Msad4OpParameters[] = {
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.Tolerance", TableParameter::DOUBLE, true },
+    { L"Validation.NumInput", TableParameter::UINT, true },
+    { L"Validation.Reference", TableParameter::UINT_TABLE, true},
+    { L"Validation.Source", TableParameter::STRING_TABLE, true },
+    { L"Validation.Accum", TableParameter::STRING_TABLE, true },
+    { L"Validation.Expected", TableParameter::STRING_TABLE, true }
+};
+
+// Data-table columns declared for the WaveIntrinsicsActiveInt parameter set:
+// shader name/text, the number of input sets, and up to four signed-integer
+// input tables. Only InputSet1 is flagged true; InputSet2..4 are flagged false.
+// Each entry is { column name, value type, flag }; the flag is presumably
+// TableParameter::m_required (see ParseTableRow) -- confirm against the
+// TableParameter definition.
+static TableParameter WaveIntrinsicsActiveIntParameters[] = {
+    { L"ShaderOp.Name", TableParameter::STRING, true },
+    { L"ShaderOp.Text", TableParameter::STRING, true },
+    { L"Validation.NumInputSet", TableParameter::UINT, true },
+    { L"Validation.InputSet1", TableParameter::INT_TABLE, true },
+    { L"Validation.InputSet2", TableParameter::INT_TABLE, false },
+    { L"Validation.InputSet3", TableParameter::INT_TABLE, false },
+    { L"Validation.InputSet4", TableParameter::INT_TABLE, false }
+};
+
+// Data-table columns declared for the WaveIntrinsicsPrefixInt parameter set.
+// The column list is identical to WaveIntrinsicsActiveIntParameters: shader
+// name/text, the number of input sets, and up to four signed-integer input
+// tables of which only InputSet1 is flagged true.
+// Each entry is { column name, value type, flag }; the flag is presumably
+// TableParameter::m_required (see ParseTableRow) -- confirm against the
+// TableParameter definition.
+static TableParameter WaveIntrinsicsPrefixIntParameters[] = {
+  { L"ShaderOp.Name", TableParameter::STRING, true },
+  { L"ShaderOp.Text", TableParameter::STRING, true },
+  { L"Validation.NumInputSet", TableParameter::UINT, true },
+  { L"Validation.InputSet1", TableParameter::INT_TABLE, true },
+  { L"Validation.InputSet2", TableParameter::INT_TABLE, false },
+  { L"Validation.InputSet3", TableParameter::INT_TABLE, false },
+  { L"Validation.InputSet4", TableParameter::INT_TABLE, false }
+};
+
+// Data-table columns declared for the WaveIntrinsicsActiveUint parameter set:
+// shader name/text, the number of input sets, and up to four unsigned-integer
+// input tables. Only InputSet1 is flagged true; InputSet2..4 are flagged false.
+// Each entry is { column name, value type, flag }; the flag is presumably
+// TableParameter::m_required (see ParseTableRow) -- confirm against the
+// TableParameter definition.
+static TableParameter WaveIntrinsicsActiveUintParameters[] = {
+  { L"ShaderOp.Name", TableParameter::STRING, true },
+  { L"ShaderOp.Text", TableParameter::STRING, true },
+  { L"Validation.NumInputSet", TableParameter::UINT, true },
+  { L"Validation.InputSet1", TableParameter::UINT_TABLE, true },
+  { L"Validation.InputSet2", TableParameter::UINT_TABLE, false },
+  { L"Validation.InputSet3", TableParameter::UINT_TABLE, false },
+  { L"Validation.InputSet4", TableParameter::UINT_TABLE, false }
+};
+
+// Data-table columns declared for the WaveIntrinsicsPrefixUint parameter set.
+// The column list is identical to WaveIntrinsicsActiveUintParameters: shader
+// name/text, the number of input sets, and up to four unsigned-integer input
+// tables of which only InputSet1 is flagged true.
+// Each entry is { column name, value type, flag }; the flag is presumably
+// TableParameter::m_required (see ParseTableRow) -- confirm against the
+// TableParameter definition.
+static TableParameter WaveIntrinsicsPrefixUintParameters[] = {
+  { L"ShaderOp.Name", TableParameter::STRING, true },
+  { L"ShaderOp.Text", TableParameter::STRING, true },
+  { L"Validation.NumInputSet", TableParameter::UINT, true },
+  { L"Validation.InputSet1", TableParameter::UINT_TABLE, true },
+  { L"Validation.InputSet2", TableParameter::UINT_TABLE, false },
+  { L"Validation.InputSet3", TableParameter::UINT_TABLE, false },
+  { L"Validation.InputSet4", TableParameter::UINT_TABLE, false }
+};
+
+// Data-table columns declared for the WaveIntrinsicsActiveBool parameter set:
+// shader name/text, the number of input sets, and up to three boolean input
+// tables (this table declares no InputSet4, unlike the Int/Uint variants).
+// Only InputSet1 is flagged true.
+// Each entry is { column name, value type, flag }; the flag is presumably
+// TableParameter::m_required (see ParseTableRow) -- confirm against the
+// TableParameter definition.
+static TableParameter WaveIntrinsicsActiveBoolParameters[] = {
+  { L"ShaderOp.Name", TableParameter::STRING, true },
+  { L"ShaderOp.Text", TableParameter::STRING, true },
+  { L"Validation.NumInputSet", TableParameter::UINT, true },
+  { L"Validation.InputSet1", TableParameter::BOOL_TABLE, true },
+  { L"Validation.InputSet2", TableParameter::BOOL_TABLE, false },
+  { L"Validation.InputSet3", TableParameter::BOOL_TABLE, false },
+};
+
+// Parses one data-table token into a float.
+//
+// All spaces are stripped first. These case-insensitive keywords are handled
+// explicitly: "NaN", "inf" / "-inf", "denorm" / "-denorm" (mapped to
+// +/-(FLT_MIN / 2)) and the zero spellings "0", "0.0", "0.0f" with an optional
+// leading '-' (which yields negative zero). Any other token is converted with
+// std::wcstod.
+//
+// str   - null-terminated wide string holding the token.
+// value - receives the parsed number; left untouched on failure.
+// Returns S_OK on success, or E_FAIL (after logging) when the token does not
+// start with anything std::wcstod can convert.
+static HRESULT ParseDataToFloat(PCWSTR str, float &value) {
+  std::wstring wString(str);
+  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
+  PCWSTR wstr = wString.c_str();
+  if (_wcsicmp(wstr, L"NaN") == 0) {
+    value = NAN;
+  } else if (_wcsicmp(wstr, L"-inf") == 0) {
+    value = -(INFINITY);
+  } else if (_wcsicmp(wstr, L"inf") == 0) {
+    value = INFINITY;
+  } else if (_wcsicmp(wstr, L"-denorm") == 0) {
+    value = -(FLT_MIN / 2);
+  } else if (_wcsicmp(wstr, L"denorm") == 0) {
+    value = FLT_MIN / 2;
+  } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 ||
+             _wcsicmp(wstr, L"-0") == 0) {
+    value = -0.0f;
+  } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 ||
+             _wcsicmp(wstr, L"0") == 0) {
+    value = 0.0f;
+  } else {
+    // A result of 0 is not proof of failure: "0.00" or "-0.00" are valid
+    // zeros that are not in the keyword list above. Detect failure through the
+    // end pointer instead, which stays at the start when nothing was consumed.
+    wchar_t *end = nullptr;
+    const double val = std::wcstod(wstr, &end);
+    if (end == wstr) {
+      LogErrorFmt(L"Failed to parse parameter %s to float", wstr);
+      return E_FAIL;
+    }
+    value = static_cast<float>(val);
+  }
+  return S_OK;
+}
+
+// Parses one data-table token into a signed int (base 10).
+//
+// All spaces are stripped first. Conversion stops at the first character that
+// is not part of a decimal integer, so "0.0" still yields 0.
+//
+// str   - null-terminated wide string holding the token.
+// value - receives the parsed number; left untouched on failure.
+// Returns S_OK on success, or E_FAIL (after logging) when the token does not
+// start with a decimal integer.
+static HRESULT ParseDataToInt(PCWSTR str, int &value) {
+  std::wstring wString(str);
+  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
+  PCWSTR wstr = wString.c_str();
+  // A result of 0 cannot be used as the failure signal because spellings such
+  // as "00" or "-0" are valid zeros; the end pointer tells us whether any
+  // digits were actually consumed.
+  wchar_t *end = nullptr;
+  const long val = std::wcstol(wstr, &end, 10);
+  if (end == wstr) {
+      LogErrorFmt(L"Failed to parse parameter %s to int", wstr);
+      return E_FAIL;
+  }
+  value = static_cast<int>(val);
+  return S_OK;
+}
+
+// Parses one data-table token into an unsigned int.
+//
+// All spaces are stripped first. The base is auto-detected by std::wcstoul
+// (base argument 0), so "0x..." hexadecimal and leading-zero octal spellings
+// are accepted in addition to decimal.
+//
+// str   - null-terminated wide string holding the token.
+// value - receives the parsed number; left untouched on failure.
+// Returns S_OK on success, or E_FAIL (after logging) when the token does not
+// start with an integer.
+static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) {
+    std::wstring wString(str);
+    wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
+    PCWSTR wstr = wString.c_str();
+    // Use the end pointer, not a zero result, to detect failure: "0x0" or "00"
+    // legitimately parse to 0.
+    wchar_t *end = nullptr;
+    const unsigned long val = std::wcstoul(wstr, &end, 0);
+    if (end == wstr) {
+        LogErrorFmt(L"Failed to parse parameter %s to uint", wstr);
+        return E_FAIL;
+    }
+    value = static_cast<unsigned int>(val);
+    return S_OK;
+}
+
+// Parses a comma-separated list of float tokens into an array.
+//
+// str   - null-terminated wide string such as L"1.0, 2.0, NaN".
+// ptr   - destination array with room for at least `count` floats.
+// count - number of elements expected in the string.
+// Returns S_OK when exactly the first `count` elements were parsed, or E_FAIL
+// when an element fails ParseDataToFloat or the string holds fewer than
+// `count` elements.
+static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) {
+    std::wstring wstr(str);
+    size_t curPosition = 0;
+    for (size_t i = 0; i < count; ++i) {
+        // npos here means the previous element was the last one in the string.
+        // Without this check `npos + 1` wraps to 0 and parsing would silently
+        // start over from the first element.
+        if (curPosition == std::wstring::npos) {
+            LogErrorFmt(L"Too few comma-separated values in %s", wstr.c_str());
+            return E_FAIL;
+        }
+        size_t nextPosition = wstr.find(L',', curPosition);
+        // When no further comma exists, substr clamps the length and returns
+        // the remainder of the string.
+        if (FAILED(ParseDataToFloat(
+            wstr.substr(curPosition, nextPosition - curPosition).c_str(),
+            ptr[i]))) {
+            return E_FAIL;
+        }
+        curPosition = (nextPosition == std::wstring::npos)
+                          ? std::wstring::npos
+                          : nextPosition + 1;
+    }
+    return S_OK;
+}
+
+// Parses a comma-separated list of unsigned-integer tokens into an array.
+//
+// str   - null-terminated wide string such as L"1, 0x10, 3".
+// ptr   - destination array with room for at least `count` values.
+// count - number of elements expected in the string.
+// Returns S_OK when exactly the first `count` elements were parsed, or E_FAIL
+// when an element fails ParseDataToUint or the string holds fewer than
+// `count` elements.
+static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr, size_t count) {
+    std::wstring wstr(str);
+    size_t curPosition = 0;
+    for (size_t i = 0; i < count; ++i) {
+        // npos here means the previous element was the last one in the string.
+        // Without this check `npos + 1` wraps to 0 and parsing would silently
+        // start over from the first element.
+        if (curPosition == std::wstring::npos) {
+            LogErrorFmt(L"Too few comma-separated values in %s", wstr.c_str());
+            return E_FAIL;
+        }
+        size_t nextPosition = wstr.find(L',', curPosition);
+        // When no further comma exists, substr clamps the length and returns
+        // the remainder of the string.
+        if (FAILED(ParseDataToUint(
+            wstr.substr(curPosition, nextPosition - curPosition).c_str(),
+            ptr[i]))) {
+            return E_FAIL;
+        }
+        curPosition = (nextPosition == std::wstring::npos)
+                          ? std::wstring::npos
+                          : nextPosition + 1;
+    }
+    return S_OK;
+}
+
+// Reads the column named param.m_name from the current test-data row into
+// `dest` via WEX::TestExecution::TestData::TryGetValue.
+// Returns E_FAIL (after logging) only when the lookup fails and the parameter
+// is flagged m_required; a failed lookup of a non-required column returns S_OK.
+template <typename T>
+static HRESULT FetchTableValue(const TableParameter &param, T &dest) {
+  if (FAILED(WEX::TestExecution::TestData::TryGetValue(param.m_name, dest)) &&
+      param.m_required) {
+    LogErrorFmt(L"Failed to get %s", param.m_name);
+    return E_FAIL;
+  }
+  return S_OK;
+}
+
+// Fills every entry of a parameter table from the current test-data row.
+//
+// table - array of parameter descriptors; the member matching each entry's
+//         m_type receives the value.
+// size  - number of entries in `table`.
+// Returns S_OK when every required column was read, or E_FAIL as soon as a
+// required column cannot be read.
+static HRESULT ParseTableRow(TableParameter *table, unsigned int size) {
+  for (unsigned int i = 0; i < size; ++i) {
+    TableParameter &param = table[i];
+    HRESULT hr = S_OK;
+    switch (param.m_type) {
+    case TableParameter::INT:
+      hr = FetchTableValue(param, param.m_int);
+      break;
+    case TableParameter::UINT:
+      hr = FetchTableValue(param, param.m_uint);
+      break;
+    case TableParameter::DOUBLE:
+      hr = FetchTableValue(param, param.m_double);
+      break;
+    case TableParameter::STRING:
+      hr = FetchTableValue(param, param.m_str);
+      break;
+    case TableParameter::BOOL:
+      // Store into m_bool and gate the error on m_required, like every other
+      // case. This arm previously wrote to m_str and tested m_bool.
+      hr = FetchTableValue(param, param.m_bool);
+      break;
+    case TableParameter::INT_TABLE:
+      hr = FetchTableValue(param, param.m_intTable);
+      break;
+    case TableParameter::UINT_TABLE:
+      hr = FetchTableValue(param, param.m_uintTable);
+      break;
+    case TableParameter::DOUBLE_TABLE:
+      hr = FetchTableValue(param, param.m_doubleTable);
+      break;
+    case TableParameter::BOOL_TABLE:
+      hr = FetchTableValue(param, param.m_boolTable);
+      break;
+    case TableParameter::STRING_TABLE:
+      hr = FetchTableValue(param, param.m_StringTable);
+      break;
+    default:
+      // NOTE(review): a string literal is always non-null, so if
+      // DXASSERT_NOMSG asserts on its argument being true this can never
+      // fire -- confirm and consider asserting on `false` instead.
+      DXASSERT_NOMSG("Invalid Parameter Type");
+    }
+    if (FAILED(hr)) {
+      return hr;
+    }
+  }
+  return S_OK;
+}
+
+// Verifies that `output` lies within `tolerance` of `ref`, i.e. that
+// |output - ref| <= tolerance.
+// The difference is computed in a wider type: subtracting two ints directly
+// can overflow (e.g. INT_MAX - INT_MIN), which is undefined behaviour and in
+// practice can wrap to a small value and let the check pass.
+static void VerifyOutputWithExpectedValueInt(int output, int ref, int tolerance) {
+    const long long diff = static_cast<long long>(output) - ref;
+    VERIFY_IS_TRUE(diff <= tolerance && -diff <= tolerance);
+}
+
+// Verifies a float result against its expected value using the comparison
+// named by `type` (case-insensitive):
+//   "Relative" - CompareFloatRelativeEpsilon(output, ref, tolerance)
+//   "Epsilon"  - CompareFloatEpsilon(output, ref, tolerance)
+//   "ULP"      - CompareFloatULP(output, ref, (int)tolerance)
+// Any other name only logs an error; no comparison is performed.
+static void VerifyOutputWithExpectedValueFloat(float output, float ref, LPCWSTR type, double tolerance) {
+    if (_wcsicmp(type, L"Relative") == 0) {
+        VERIFY_IS_TRUE(CompareFloatRelativeEpsilon(output, ref, tolerance));
+    }
+    else if (_wcsicmp(type, L"Epsilon") == 0) {
+        VERIFY_IS_TRUE(CompareFloatEpsilon(output, ref, tolerance));
+    }
+    else if (_wcsicmp(type, L"ULP") == 0) {
+        VERIFY_IS_TRUE(CompareFloatULP(output, ref, (int)tolerance));
+    }
+    else {
+        // `type` is a wide string, so the wide format needs %s. %S would
+        // treat it as a narrow string and print garbage.
+        LogErrorFmt(L"Failed to read comparison type %s", type);
+    }
+}
+
+// Data-driven test for single-input float shader ops. Runs the "UnaryFPOp"
+// entry from ShaderOpArith.xml with the shader taken from the current data
+// row, then compares every output element with the expected value using the
+// row's comparison type and tolerance.
+TEST_F(ExecutionTest, UnaryFloatOpTest) {
+  WEX::TestExecution::SetVerifyOutput logOnlyFailures(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> shaderOpStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &shaderOpStream);
+
+  CComPtr<ID3D12Device> device;
+  if (!CreateDevice(&device))
+    return;
+
+  // Load the current data row into the parameter table.
+  const int paramCount = sizeof(UnaryFPOpParameters) / sizeof(TableParameter);
+  TableParameterHandler handler(UnaryFPOpParameters, paramCount);
+  handler.clearTableParameter();
+  VERIFY_SUCCEEDED(ParseTableRow(UnaryFPOpParameters, paramCount));
+
+  // The narrow-string conversions own the buffers that `shader` points to, so
+  // they must stay alive until the shader op has run.
+  st::ShaderOpShader shader;
+  CW2A narrowName(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+  CW2A narrowTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+  CW2A narrowEntry(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+  CW2A narrowText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+  shader.Name = narrowName.m_psz;
+  shader.Target = narrowTarget.m_psz;
+  shader.EntryPoint = narrowEntry.m_psz;
+  shader.Text = narrowText.m_psz;
+
+  // Skip the row when running on WARP with a version the row does not accept.
+  const unsigned int warpVersion =
+      handler.GetTableParamByName(L"Warp.Version")->m_uint;
+  if (GetTestParamUseWARP(true) && !IsValidWarpDllVersion(warpVersion))
+    return;
+
+  typedef WEX::TestExecution::TestDataArray<WEX::Common::String> StringTable;
+  StringTable &inputs =
+      handler.GetTableParamByName(L"Validation.Input")->m_StringTable;
+  StringTable &expected =
+      handler.GetTableParamByName(L"Validation.Expected")->m_StringTable;
+
+  LPCWSTR compareType = handler.GetTableParamByName(L"Validation.Type")->m_str;
+  double tolerance =
+      handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
+  size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
+
+  std::shared_ptr<ShaderOpTestResult> result = RunShaderOpTest(
+      device, m_support, shaderOpStream, "UnaryFPOp",
+      // Invoked while the test creates its resources: fills the input buffer
+      // (cycling through the input table) and swaps in the row's shader.
+      [&](LPCSTR resourceName, std::vector<BYTE> &bytes,
+          st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(resourceName, "SUnaryFPOp"));
+        bytes.resize(sizeof(SUnaryFPOp) * count);
+        SUnaryFPOp *elements = (SUnaryFPOp *)bytes.data();
+        for (size_t i = 0; i < count; ++i) {
+          PCWSTR text = inputs[i % inputs.GetSize()];
+          float parsed;
+          VERIFY_SUCCEEDED(ParseDataToFloat(text, parsed));
+          elements[i].input = parsed;
+        }
+        auto &target = pShaderOp->Shaders.at(0);
+        target.Target = shader.Target;
+        target.EntryPoint = shader.EntryPoint;
+        target.Text = shader.Text;
+      });
+
+  MappedData readback;
+  result->Test->GetReadBackData("SUnaryFPOp", &readback);
+
+  SUnaryFPOp *outputs = (SUnaryFPOp *)readback.data();
+  WEX::TestExecution::DisableVerifyExceptions dve;
+  for (unsigned i = 0; i < count; ++i) {
+    SUnaryFPOp &element = outputs[i];
+    LPCWSTR text = expected[i % expected.GetSize()];
+    float want;
+    VERIFY_SUCCEEDED(ParseDataToFloat(text, want));
+    LogCommentFmt(
+        L"element #%u, input = %10f, output = %10f, expected = %10f", i,
+        element.input, element.output, want);
+    VerifyOutputWithExpectedValueFloat(element.output, want, compareType,
+                                       tolerance);
+  }
+}
+
+// Data-driven test for two-input float shader ops with two outputs. Runs the
+// "BinaryFPOp" entry from ShaderOpArith.xml with the shader taken from the
+// current data row, then compares both outputs of every element with the
+// Expected1/Expected2 tables using the row's comparison type and tolerance.
+TEST_F(ExecutionTest, BinaryFloatOpTest) {
+  WEX::TestExecution::SetVerifyOutput logOnlyFailures(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> shaderOpStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &shaderOpStream);
+
+  CComPtr<ID3D12Device> device;
+  if (!CreateDevice(&device))
+    return;
+
+  // Load the current data row into the parameter table.
+  const int paramCount = sizeof(BinaryFPOpParameters) / sizeof(TableParameter);
+  TableParameterHandler handler(BinaryFPOpParameters, paramCount);
+  handler.clearTableParameter();
+  VERIFY_SUCCEEDED(ParseTableRow(BinaryFPOpParameters, paramCount));
+
+  // The narrow-string conversions own the buffers that `shader` points to, so
+  // they must stay alive until the shader op has run.
+  st::ShaderOpShader shader;
+  CW2A narrowName(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+  CW2A narrowTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+  CW2A narrowEntry(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+  CW2A narrowText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+  shader.Name = narrowName.m_psz;
+  shader.Target = narrowTarget.m_psz;
+  shader.EntryPoint = narrowEntry.m_psz;
+  shader.Text = narrowText.m_psz;
+
+  typedef WEX::TestExecution::TestDataArray<WEX::Common::String> StringTable;
+  StringTable &inputs1 =
+      handler.GetTableParamByName(L"Validation.Input1")->m_StringTable;
+  StringTable &inputs2 =
+      handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
+  StringTable &expected1 =
+      handler.GetTableParamByName(L"Validation.Expected1")->m_StringTable;
+  StringTable &expected2 =
+      handler.GetTableParamByName(L"Validation.Expected2")->m_StringTable;
+
+  LPCWSTR compareType = handler.GetTableParamByName(L"Validation.Type")->m_str;
+  double tolerance =
+      handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
+  size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
+
+  std::shared_ptr<ShaderOpTestResult> result = RunShaderOpTest(
+      device, m_support, shaderOpStream, "BinaryFPOp",
+      // Invoked while the test creates its resources: fills the input buffer
+      // (cycling through the input tables) and swaps in the row's shader.
+      [&](LPCSTR resourceName, std::vector<BYTE> &bytes,
+          st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(resourceName, "SBinaryFPOp"));
+        bytes.resize(sizeof(SBinaryFPOp) * count);
+        SBinaryFPOp *elements = (SBinaryFPOp *)bytes.data();
+        for (size_t i = 0; i < count; ++i) {
+          PCWSTR text1 = inputs1[i % inputs1.GetSize()];
+          PCWSTR text2 = inputs2[i % inputs2.GetSize()];
+          float parsed1, parsed2;
+          VERIFY_SUCCEEDED(ParseDataToFloat(text1, parsed1));
+          VERIFY_SUCCEEDED(ParseDataToFloat(text2, parsed2));
+          elements[i].input1 = parsed1;
+          elements[i].input2 = parsed2;
+        }
+        auto &target = pShaderOp->Shaders.at(0);
+        target.Target = shader.Target;
+        target.EntryPoint = shader.EntryPoint;
+        target.Text = shader.Text;
+      });
+
+  MappedData readback;
+  result->Test->GetReadBackData("SBinaryFPOp", &readback);
+
+  SBinaryFPOp *outputs = (SBinaryFPOp *)readback.data();
+  WEX::TestExecution::DisableVerifyExceptions dve;
+
+  for (unsigned i = 0; i < count; ++i) {
+    SBinaryFPOp &element = outputs[i];
+    LPCWSTR text1 = expected1[i % expected1.GetSize()];
+    LPCWSTR text2 = expected2[i % expected2.GetSize()];
+    float want1, want2;
+    VERIFY_SUCCEEDED(ParseDataToFloat(text1, want1));
+    VERIFY_SUCCEEDED(ParseDataToFloat(text2, want2));
+    LogCommentFmt(L"element #%u, input1 = %10f, input2 = %10f, output1 = "
+                  L"%10f, expected1 = %10f, output2 = %10f, expected2 = %10f",
+                  i, element.input1, element.input2, element.output1, want1,
+                  element.output2, want2);
+    VerifyOutputWithExpectedValueFloat(element.output1, want1, compareType,
+                                       tolerance);
+    VerifyOutputWithExpectedValueFloat(element.output2, want2, compareType,
+                                       tolerance);
+  }
+}
+
+// Data-driven test for three-input float shader ops. Runs the "TertiaryFPOp"
+// entry from ShaderOpArith.xml with the shader taken from the current data
+// row, then compares every output element with the expected value using the
+// row's comparison type and tolerance.
+TEST_F(ExecutionTest, TertiaryFloatOpTest) {
+  WEX::TestExecution::SetVerifyOutput logOnlyFailures(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> shaderOpStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &shaderOpStream);
+
+  CComPtr<ID3D12Device> device;
+  if (!CreateDevice(&device))
+    return;
+
+  // Load the current data row into the parameter table.
+  const int paramCount =
+      sizeof(TertiaryFPOpParameters) / sizeof(TableParameter);
+  TableParameterHandler handler(TertiaryFPOpParameters, paramCount);
+  handler.clearTableParameter();
+  VERIFY_SUCCEEDED(ParseTableRow(TertiaryFPOpParameters, paramCount));
+
+  // The narrow-string conversions own the buffers that `shader` points to, so
+  // they must stay alive until the shader op has run.
+  st::ShaderOpShader shader;
+  CW2A narrowName(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+  CW2A narrowTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+  CW2A narrowEntry(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+  CW2A narrowText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+  shader.Name = narrowName.m_psz;
+  shader.Target = narrowTarget.m_psz;
+  shader.EntryPoint = narrowEntry.m_psz;
+  shader.Text = narrowText.m_psz;
+
+  typedef WEX::TestExecution::TestDataArray<WEX::Common::String> StringTable;
+  StringTable &inputs1 =
+      handler.GetTableParamByName(L"Validation.Input1")->m_StringTable;
+  StringTable &inputs2 =
+      handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
+  StringTable &inputs3 =
+      handler.GetTableParamByName(L"Validation.Input3")->m_StringTable;
+  StringTable &expected =
+      handler.GetTableParamByName(L"Validation.Expected")->m_StringTable;
+
+  LPCWSTR compareType = handler.GetTableParamByName(L"Validation.Type")->m_str;
+  double tolerance =
+      handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
+  size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
+
+  std::shared_ptr<ShaderOpTestResult> result = RunShaderOpTest(
+      device, m_support, shaderOpStream, "TertiaryFPOp",
+      // Invoked while the test creates its resources: fills the input buffer
+      // (cycling through the input tables) and swaps in the row's shader.
+      [&](LPCSTR resourceName, std::vector<BYTE> &bytes,
+          st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(resourceName, "STertiaryFPOp"));
+        bytes.resize(sizeof(STertiaryFPOp) * count);
+        STertiaryFPOp *elements = (STertiaryFPOp *)bytes.data();
+        for (size_t i = 0; i < count; ++i) {
+          PCWSTR text1 = inputs1[i % inputs1.GetSize()];
+          PCWSTR text2 = inputs2[i % inputs2.GetSize()];
+          PCWSTR text3 = inputs3[i % inputs3.GetSize()];
+          float parsed1, parsed2, parsed3;
+          VERIFY_SUCCEEDED(ParseDataToFloat(text1, parsed1));
+          VERIFY_SUCCEEDED(ParseDataToFloat(text2, parsed2));
+          VERIFY_SUCCEEDED(ParseDataToFloat(text3, parsed3));
+          elements[i].input1 = parsed1;
+          elements[i].input2 = parsed2;
+          elements[i].input3 = parsed3;
+        }
+        auto &target = pShaderOp->Shaders.at(0);
+        target.Target = shader.Target;
+        target.EntryPoint = shader.EntryPoint;
+        target.Text = shader.Text;
+      });
+
+  MappedData readback;
+  result->Test->GetReadBackData("STertiaryFPOp", &readback);
+
+  STertiaryFPOp *outputs = (STertiaryFPOp *)readback.data();
+  WEX::TestExecution::DisableVerifyExceptions dve;
+
+  for (unsigned i = 0; i < count; ++i) {
+    STertiaryFPOp &element = outputs[i];
+    LPCWSTR text = expected[i % expected.GetSize()];
+    float want;
+    VERIFY_SUCCEEDED(ParseDataToFloat(text, want));
+    LogCommentFmt(L"element #%u, input1 = %10f, input2 = %10f, input3 = %10f, output1 = "
+                  L"%10f, expected = %10f",
+                  i, element.input1, element.input2, element.input3,
+                  element.output, want);
+    VerifyOutputWithExpectedValueFloat(element.output, want, compareType,
+                                       tolerance);
+  }
+}
+
+// Data-driven test for single-input signed-integer shader ops. Runs the
+// "UnaryIntOp" entry from ShaderOpArith.xml with the shader taken from the
+// current data row, then checks every output element against the expected
+// table within the row's integer tolerance.
+TEST_F(ExecutionTest, UnaryIntOpTest) {
+  WEX::TestExecution::SetVerifyOutput logOnlyFailures(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> shaderOpStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &shaderOpStream);
+
+  CComPtr<ID3D12Device> device;
+  if (!CreateDevice(&device))
+    return;
+
+  // Load the current data row into the parameter table.
+  const int paramCount = sizeof(UnaryIntOpParameters) / sizeof(TableParameter);
+  TableParameterHandler handler(UnaryIntOpParameters, paramCount);
+  handler.clearTableParameter();
+  VERIFY_SUCCEEDED(ParseTableRow(UnaryIntOpParameters, paramCount));
+
+  // The narrow-string conversions own the buffers that `shader` points to, so
+  // they must stay alive until the shader op has run.
+  st::ShaderOpShader shader;
+  CW2A narrowName(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+  CW2A narrowTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+  CW2A narrowEntry(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+  CW2A narrowText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+  shader.Name = narrowName.m_psz;
+  shader.Target = narrowTarget.m_psz;
+  shader.EntryPoint = narrowEntry.m_psz;
+  shader.Text = narrowText.m_psz;
+
+  typedef WEX::TestExecution::TestDataArray<int> IntTable;
+  IntTable &inputs =
+      handler.GetTableParamByName(L"Validation.Input")->m_intTable;
+  IntTable &expected =
+      handler.GetTableParamByName(L"Validation.Expected")->m_intTable;
+  int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
+  size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
+
+  std::shared_ptr<ShaderOpTestResult> result = RunShaderOpTest(
+      device, m_support, shaderOpStream, "UnaryIntOp",
+      // Invoked while the test creates its resources: fills the input buffer
+      // (cycling through the input table) and swaps in the row's shader.
+      [&](LPCSTR resourceName, std::vector<BYTE> &bytes,
+          st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(resourceName, "SUnaryIntOp"));
+        bytes.resize(sizeof(SUnaryIntOp) * count);
+        SUnaryIntOp *elements = (SUnaryIntOp *)bytes.data();
+        for (size_t i = 0; i < count; ++i) {
+          int source = inputs[i % inputs.GetSize()];
+          elements[i].input = source;
+        }
+        auto &target = pShaderOp->Shaders.at(0);
+        target.Target = shader.Target;
+        target.EntryPoint = shader.EntryPoint;
+        target.Text = shader.Text;
+      });
+
+  MappedData readback;
+  result->Test->GetReadBackData("SUnaryIntOp", &readback);
+
+  SUnaryIntOp *outputs = (SUnaryIntOp *)readback.data();
+  WEX::TestExecution::DisableVerifyExceptions dve;
+  for (unsigned i = 0; i < count; ++i) {
+    SUnaryIntOp &element = outputs[i];
+    int want = expected[i % expected.GetSize()];
+    LogCommentFmt(L"element #%u, input = %11i(0x%08x), output = %11i(0x%08x), "
+                  L"expected = %11i(0x%08x)",
+                  i, element.input, element.input, element.output,
+                  element.output, want, want);
+    VerifyOutputWithExpectedValueInt(element.output, want, tolerance);
+  }
+}
+
+// Data-driven test for single-input unsigned-integer shader ops. Runs the
+// "UnaryUintOp" entry from ShaderOpArith.xml with the shader taken from the
+// current data row, then checks every output element against the expected
+// table within the row's integer tolerance.
+TEST_F(ExecutionTest, UnaryUintOpTest) {
+  WEX::TestExecution::SetVerifyOutput logOnlyFailures(
+      WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  CComPtr<IStream> shaderOpStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &shaderOpStream);
+
+  CComPtr<ID3D12Device> device;
+  if (!CreateDevice(&device))
+    return;
+
+  // Load the current data row into the parameter table.
+  const int paramCount = sizeof(UnaryUintOpParameters) / sizeof(TableParameter);
+  TableParameterHandler handler(UnaryUintOpParameters, paramCount);
+  handler.clearTableParameter();
+  VERIFY_SUCCEEDED(ParseTableRow(UnaryUintOpParameters, paramCount));
+
+  // The narrow-string conversions own the buffers that `shader` points to, so
+  // they must stay alive until the shader op has run.
+  st::ShaderOpShader shader;
+  CW2A narrowName(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+  CW2A narrowTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+  CW2A narrowEntry(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+  CW2A narrowText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+  shader.Name = narrowName.m_psz;
+  shader.Target = narrowTarget.m_psz;
+  shader.EntryPoint = narrowEntry.m_psz;
+  shader.Text = narrowText.m_psz;
+
+  typedef WEX::TestExecution::TestDataArray<unsigned int> UintTable;
+  UintTable &inputs =
+      handler.GetTableParamByName(L"Validation.Input")->m_uintTable;
+  UintTable &expected =
+      handler.GetTableParamByName(L"Validation.Expected")->m_uintTable;
+  int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
+  size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
+
+  std::shared_ptr<ShaderOpTestResult> result = RunShaderOpTest(
+      device, m_support, shaderOpStream, "UnaryUintOp",
+      // Invoked while the test creates its resources: fills the input buffer
+      // (cycling through the input table) and swaps in the row's shader.
+      [&](LPCSTR resourceName, std::vector<BYTE> &bytes,
+          st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(resourceName, "SUnaryUintOp"));
+        bytes.resize(sizeof(SUnaryUintOp) * count);
+        SUnaryUintOp *elements = (SUnaryUintOp *)bytes.data();
+        for (size_t i = 0; i < count; ++i) {
+          unsigned int source = inputs[i % inputs.GetSize()];
+          elements[i].input = source;
+        }
+        auto &target = pShaderOp->Shaders.at(0);
+        target.Target = shader.Target;
+        target.EntryPoint = shader.EntryPoint;
+        target.Text = shader.Text;
+      });
+
+  MappedData readback;
+  result->Test->GetReadBackData("SUnaryUintOp", &readback);
+
+  SUnaryUintOp *outputs = (SUnaryUintOp *)readback.data();
+  WEX::TestExecution::DisableVerifyExceptions dve;
+  for (unsigned i = 0; i < count; ++i) {
+    SUnaryUintOp &element = outputs[i];
+    unsigned int want = expected[i % expected.GetSize()];
+    LogCommentFmt(L"element #%u, input = %11u(0x%08x), output = %11u(0x%08x), "
+                  L"expected = %11u(0x%08x)",
+                  i, element.input, element.input, element.output,
+                  element.output, want, want);
+    // NOTE(review): the comparison helper takes int parameters, so the
+    // expected value (and presumably the output) is converted from unsigned
+    // here; values above INT_MAX would be compared as negatives -- confirm
+    // this is intended.
+    VerifyOutputWithExpectedValueInt(element.output, want, tolerance);
+  }
+}
+
+TEST_F(ExecutionTest, BinaryIntOpTest) {
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice)) {
+      return;
+    }
+    // Read data from the table
+    size_t tableSize = sizeof(BinaryIntOpParameters) / sizeof(TableParameter);
+    TableParameterHandler handler(BinaryIntOpParameters, tableSize);
+    handler.clearTableParameter();
+    VERIFY_SUCCEEDED(ParseTableRow(BinaryIntOpParameters,tableSize));
+
+    st::ShaderOpShader shader;
+
+    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+    shader.Name = Name.m_psz;
+    shader.Target = Target.m_psz;
+    shader.EntryPoint = EntryPoint.m_psz;
+    shader.Text = Text.m_psz;
+
+    int numExpected = handler.GetTableParamByName(L"Validation.NumExpected")->m_int;
+
+    WEX::TestExecution::TestDataArray<int> *Validation_Input1 =
+        &handler.GetTableParamByName(L"Validation.Input1")->m_intTable;
+    WEX::TestExecution::TestDataArray<int> *Validation_Input2 =
+        &handler.GetTableParamByName(L"Validation.Input2")->m_intTable;
+    WEX::TestExecution::TestDataArray<int> *Validation_Expected1 =
+        &handler.GetTableParamByName(L"Validation.Expected1")->m_intTable;
+    WEX::TestExecution::TestDataArray<int> *Validation_Expected2 =
+        &handler.GetTableParamByName(L"Validation.Expected2")->m_intTable;
+    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
+    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint;
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "BinaryIntOp",
+        // this callbacked is called when the test
+        // is creating the resource to run the test
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+          VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryIntOp"));
+          size_t size = sizeof(SBinaryIntOp) * count;
+          Data.resize(size);
+          SBinaryIntOp *pPrimitives = (SBinaryIntOp *)Data.data();
+          for (size_t i = 0; i < count; ++i) {
+            SBinaryIntOp *p = &pPrimitives[i];
+            int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()];
+            int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
+            p->input1 = val1;
+            p->input2 = val2;
+          }
+
+          // use shader from data table
+          pShaderOp->Shaders.at(0).Target = shader.Target;
+          pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
+          pShaderOp->Shaders.at(0).Text = shader.Text;
+        });
+
+    MappedData data;
+    test->Test->GetReadBackData("SBinaryIntOp", &data);
+
+    SBinaryIntOp *pPrimitives = (SBinaryIntOp *)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve;
+
+    if (numExpected == 2) {
+        for (unsigned i = 0; i < count; ++i) {
+            SBinaryIntOp *p = &pPrimitives[i];
+            int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
+            int val2 = (*Validation_Expected2)[i % Validation_Expected2->GetSize()];
+            LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
+                L"%11i(0x%08x), output1 = "
+                L"%11i(0x%08x), expected1 = %11i(0x%08x), output2 = "
+                L"%11i(0x%08x), expected2 = %11i(0x%08x)",
+                i, p->input1, p->input1, p->input2, p->input2, p->output1,
+                p->output1, val1, val1, p->output2, p->output2, val2,
+                val2);
+            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
+            VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
+        }
+    }
+    else if (numExpected == 1) {
+        for (unsigned i = 0; i < count; ++i) {
+            SBinaryIntOp *p = &pPrimitives[i];
+            int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
+            LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
+                          L"%11i(0x%08x), output = "
+                          L"%11i(0x%08x), expected = %11i(0x%08x)", i,
+                          p->input1, p->input1, p->input2, p->input2,
+                          p->output1, p->output1, val1, val1);
+            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
+        }
+    }
+    else {
+        LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
+    }
+}
+
+TEST_F(ExecutionTest, TertiaryIntOpTest) { // table-driven check of three-input int shader ops
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice)) {
+        return; // no device was created, so there is nothing to run
+    }
+    // Read data from the table
+    size_t tableSize = sizeof(TertiaryIntOpParameters) / sizeof(TableParameter); // entries in the parameter array
+    TableParameterHandler handler(TertiaryIntOpParameters, tableSize);
+    handler.clearTableParameter();
+    VERIFY_SUCCEEDED(ParseTableRow(TertiaryIntOpParameters, tableSize));
+
+    st::ShaderOpShader shader;
+
+    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+    shader.Name = Name.m_psz; // NOTE(review): if these fields are raw pointers they borrow the CW2A buffers above
+    shader.Target = Target.m_psz;
+    shader.EntryPoint = EntryPoint.m_psz;
+    shader.Text = Text.m_psz;
+
+    WEX::TestExecution::TestDataArray<int> *Validation_Input1 =
+        &handler.GetTableParamByName(L"Validation.Input1")->m_intTable;
+    WEX::TestExecution::TestDataArray<int> *Validation_Input2 =
+        &handler.GetTableParamByName(L"Validation.Input2")->m_intTable;
+    WEX::TestExecution::TestDataArray<int> *Validation_Input3 =
+        &handler.GetTableParamByName(L"Validation.Input3")->m_intTable;
+    WEX::TestExecution::TestDataArray<int> *Validation_Expected =
+        &handler.GetTableParamByName(L"Validation.Expected")->m_intTable;
+    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
+    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint; // number of elements to fill and verify
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "TertiaryIntOp",
+        // this callback is called when the test
+        // is creating the resource to run the test; it fills Data with the inputs
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryIntOp"));
+        size_t size = sizeof(STertiaryIntOp) * count; // one STertiaryIntOp record per element
+        Data.resize(size);
+        STertiaryIntOp *pPrimitives = (STertiaryIntOp *)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            STertiaryIntOp *p = &pPrimitives[i];
+            int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()]; // wraps when count exceeds the table length
+            int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
+            int val3 = (*Validation_Input3)[i % Validation_Input3->GetSize()];
+            p->input1 = val1;
+            p->input2 = val2;
+            p->input3 = val3;
+        }
+
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Target = shader.Target;
+        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
+        pShaderOp->Shaders.at(0).Text = shader.Text;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("STertiaryIntOp", &data);
+
+    STertiaryIntOp *pPrimitives = (STertiaryIntOp *)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve; // presumably lets a failed verify log and continue - confirm in TAEF docs
+    for (unsigned i = 0; i < count; ++i) {
+        STertiaryIntOp *p = &pPrimitives[i];
+        int val1 = (*Validation_Expected)[i % Validation_Expected->GetSize()]; // expected output, same wrap-around as the inputs
+        LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = " // every value is passed twice: decimal and hex
+            L"%11i(0x%08x), input3= %11i(0x%08x), output = "
+            L"%11i(0x%08x), expected = %11i(0x%08x)",
+            i, p->input1, p->input1, p->input2, p->input2,
+            p->input3, p->input3, p->output, p->output, val1,
+            val1);
+        VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
+    }
+}
+
+TEST_F(ExecutionTest, BinaryUintOpTest) { // table-driven check of two-input uint shader ops
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice)) {
+        return; // no device was created, so there is nothing to run
+    }
+    // Read data from the table
+    size_t tableSize = sizeof(BinaryUintOpParameters) / sizeof(TableParameter); // entries in the parameter array
+    TableParameterHandler handler(BinaryUintOpParameters, tableSize);
+    handler.clearTableParameter();
+    VERIFY_SUCCEEDED(ParseTableRow(BinaryUintOpParameters, tableSize));
+
+    st::ShaderOpShader shader;
+
+    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+    shader.Name = Name.m_psz; // NOTE(review): if these fields are raw pointers they borrow the CW2A buffers above
+    shader.Target = Target.m_psz;
+    shader.EntryPoint = EntryPoint.m_psz;
+    shader.Text = Text.m_psz;
+
+    int numExpected = handler.GetTableParamByName(L"Validation.NumExpected")->m_int; // 1 or 2 outputs are checked below; other values log an error
+
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input1 =
+        &handler.GetTableParamByName(L"Validation.Input1")->m_uintTable;
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input2 =
+        &handler.GetTableParamByName(L"Validation.Input2")->m_uintTable;
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Expected1 =
+        &handler.GetTableParamByName(L"Validation.Expected1")->m_uintTable;
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Expected2 =
+        &handler.GetTableParamByName(L"Validation.Expected2")->m_uintTable;
+    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
+    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint; // number of elements to fill and verify
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "BinaryUintOp",
+        // this callback is called when the test
+        // is creating the resource to run the test; it fills Data with the inputs
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryUintOp"));
+        size_t size = sizeof(SBinaryUintOp) * count; // one SBinaryUintOp record per element
+        Data.resize(size);
+        SBinaryUintOp *pPrimitives = (SBinaryUintOp *)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            SBinaryUintOp *p = &pPrimitives[i];
+            unsigned int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()]; // wraps when count exceeds the table length
+            unsigned int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
+            p->input1 = val1;
+            p->input2 = val2;
+        }
+
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Target = shader.Target;
+        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
+        pShaderOp->Shaders.at(0).Text = shader.Text;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("SBinaryUintOp", &data);
+
+    SBinaryUintOp *pPrimitives = (SBinaryUintOp *)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve; // presumably lets a failed verify log and continue - confirm in TAEF docs
+    if (numExpected == 2) { // operation produces two outputs: check output1 and output2
+        for (unsigned i = 0; i < count; ++i) {
+            SBinaryUintOp *p = &pPrimitives[i];
+            unsigned int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
+            unsigned int val2 = (*Validation_Expected2)[i % Validation_Expected2->GetSize()];
+            LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = " // every value is passed twice: decimal and hex
+                L"%11u(0x%08x), output1 = "
+                L"%11u(0x%08x), expected1 = %11u(0x%08x), output2 = "
+                L"%11u(0x%08x), expected2 = %11u(0x%08x)",
+                i, p->input1, p->input1, p->input2, p->input2, p->output1,
+                p->output1, val1, val1, p->output2, p->output2, val2,
+                val2);
+            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance); // NOTE(review): unsigned values go through the *Int verifier - confirm its parameter types
+            VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
+        }
+    }
+    else if (numExpected == 1) { // single output: only output1 is checked
+        for (unsigned i = 0; i < count; ++i) {
+            SBinaryUintOp *p = &pPrimitives[i];
+            unsigned int val1 = (*Validation_Expected1)[i % Validation_Expected1->GetSize()];
+            LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
+                L"%11u(0x%08x), output = "
+                L"%11u(0x%08x), expected = %11u(0x%08x)", i,
+                p->input1, p->input1, p->input2, p->input2,
+                p->output1, p->output1, val1, val1);
+            VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
+        }
+    }
+    else {
+        LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
+    }
+}
+
+TEST_F(ExecutionTest, TertiaryUintOpTest) { // table-driven check of three-input uint shader ops
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice)) {
+        return; // no device was created, so there is nothing to run
+    }
+    // Read data from the table
+    size_t tableSize = sizeof(TertiaryUintOpParameters) / sizeof(TableParameter); // entries in the parameter array
+    TableParameterHandler handler(TertiaryUintOpParameters, tableSize);
+    handler.clearTableParameter();
+    VERIFY_SUCCEEDED(ParseTableRow(TertiaryUintOpParameters, tableSize));
+
+    st::ShaderOpShader shader;
+
+    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+    shader.Name = Name.m_psz; // NOTE(review): if these fields are raw pointers they borrow the CW2A buffers above
+    shader.Target = Target.m_psz;
+    shader.EntryPoint = EntryPoint.m_psz;
+    shader.Text = Text.m_psz;
+
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input1 =
+        &handler.GetTableParamByName(L"Validation.Input1")->m_uintTable;
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input2 =
+        &handler.GetTableParamByName(L"Validation.Input2")->m_uintTable;
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Input3 =
+        &handler.GetTableParamByName(L"Validation.Input3")->m_uintTable;
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Expected =
+        &handler.GetTableParamByName(L"Validation.Expected")->m_uintTable;
+    int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int;
+    size_t count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint; // number of elements to fill and verify
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "TertiaryUintOp",
+        // this callback is called when the test
+        // is creating the resource to run the test; it fills Data with the inputs
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryUintOp"));
+        size_t size = sizeof(STertiaryUintOp) * count; // one STertiaryUintOp record per element
+        Data.resize(size);
+        STertiaryUintOp *pPrimitives = (STertiaryUintOp *)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            STertiaryUintOp *p = &pPrimitives[i];
+            unsigned int val1 = (*Validation_Input1)[i % Validation_Input1->GetSize()]; // wraps when count exceeds the table length
+            unsigned int val2 = (*Validation_Input2)[i % Validation_Input2->GetSize()];
+            unsigned int val3 = (*Validation_Input3)[i % Validation_Input3->GetSize()];
+            p->input1 = val1;
+            p->input2 = val2;
+            p->input3 = val3;
+        }
+
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Target = shader.Target;
+        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
+        pShaderOp->Shaders.at(0).Text = shader.Text;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("STertiaryUintOp", &data);
+
+    STertiaryUintOp *pPrimitives = (STertiaryUintOp *)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve; // presumably lets a failed verify log and continue - confirm in TAEF docs
+    for (unsigned i = 0; i < count; ++i) {
+        STertiaryUintOp *p = &pPrimitives[i];
+        unsigned int val1 = (*Validation_Expected)[i % Validation_Expected->GetSize()]; // expected output, same wrap-around as the inputs
+        LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = " // every value is passed twice: decimal and hex
+            L"%11u(0x%08x), input3 = %11u(0x%08x), output = "
+            L"%11u(0x%08x), expected = %11u(0x%08x)", i,
+            p->input1, p->input1, p->input2, p->input2, p->input3, p->input3,
+            p->output, p->output, val1, val1);
+        VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance); // NOTE(review): unsigned values go through the *Int verifier - confirm its parameter types
+    }
+}
+
+TEST_F(ExecutionTest, DotTest) { // table-driven check of the dot2/dot3/dot4 outputs of the "DotOp" shader op
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice)) {
+        return; // no device was created, so there is nothing to run
+    }
+
+    int tableSize = sizeof(DotOpParameters) / sizeof(TableParameter); // entries in the parameter array
+    TableParameterHandler handler(DotOpParameters, tableSize);
+    handler.clearTableParameter();
+    VERIFY_SUCCEEDED(ParseTableRow(DotOpParameters, tableSize));
+
+    st::ShaderOpShader shader;
+
+    CW2A Name(handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+    CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
+    CW2A EntryPoint(handler.GetTableParamByName(L"ShaderOp.EntryPoint")->m_str);
+    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
+    shader.Name = Name.m_psz; // NOTE(review): if these fields are raw pointers they borrow the CW2A buffers above
+    shader.Target = Target.m_psz;
+    shader.EntryPoint = EntryPoint.m_psz;
+    shader.Text = Text.m_psz;
+
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input1 = // vectors and expected values are stored as strings and parsed below
+        &handler.GetTableParamByName(L"Validation.Input1")->m_StringTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Input2 =
+        &handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_dot2 =
+        &handler.GetTableParamByName(L"Validation.dot2")->m_StringTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_dot3 =
+        &handler.GetTableParamByName(L"Validation.dot3")->m_StringTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_dot4 =
+        &handler.GetTableParamByName(L"Validation.dot4")->m_StringTable;
+
+    PCWSTR Validation_type = handler.GetTableParamByName(L"Validation.Type")->m_str; // forwarded unchanged to the float verifier
+    double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
+    unsigned int count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint; // number of elements to fill and verify
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "DotOp",
+        // this callback is called when the test
+        // is creating the resource to run the test; it fills Data with the inputs
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "SDotOp"));
+        size_t size = sizeof(SDotOp) * count; // one SDotOp record per element
+        Data.resize(size);
+        SDotOp *pPrimitives = (SDotOp*)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            SDotOp *p = &pPrimitives[i];
+            XMFLOAT4 val1,val2;
+            VERIFY_SUCCEEDED(ParseDataToVectorFloat((*Validation_Input1)[i], // NOTE(review): indexed by i with no wrap-around - presumably each table needs >= count rows
+                                                    (float *)&val1, 4));
+            VERIFY_SUCCEEDED(ParseDataToVectorFloat((*Validation_Input2)[i],
+                                                    (float *)&val2, 4));
+            p->input1 = val1;
+            p->input2 = val2;
+        }
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Target = shader.Target;
+        pShaderOp->Shaders.at(0).EntryPoint = shader.EntryPoint;
+        pShaderOp->Shaders.at(0).Text = shader.Text;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("SDotOp", &data);
+
+    SDotOp *pPrimitives = (SDotOp*)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve; // presumably lets a failed verify log and continue - confirm in TAEF docs
+    for (size_t i = 0; i < count; ++i) {
+        SDotOp *p = &pPrimitives[i];
+        float dot2, dot3, dot4; // expected results parsed from the table row for this element
+        VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot2)[i], dot2));
+        VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot3)[i], dot3));
+        VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot4)[i], dot4));
+        LogCommentFmt(
+            L"element #%u, input1 = (%f, %f, %f, %f), input2 = (%f, %f, " // NOTE(review): i is size_t but is formatted with %u
+            L"%f, %f), \n dot2 = %f, dot2_expected = %f, dot3 = %f, "
+            L"dot3_expected = %f, dot4 = %f, dot4_expected = %f",
+            i, p->input1.x, p->input1.y, p->input1.z, p->input1.w, p->input2.x,
+            p->input2.y, p->input2.z, p->input2.w, p->o_dot2, dot2, p->o_dot3, dot3,
+            p->o_dot4, dot4);
+        VerifyOutputWithExpectedValueFloat(p->o_dot2, dot2, Validation_type,
+                                           tolerance);
+        VerifyOutputWithExpectedValueFloat(p->o_dot3, dot3, Validation_type,
+                                           tolerance);
+        VerifyOutputWithExpectedValueFloat(p->o_dot4, dot4, Validation_type,
+                                           tolerance);
+    }
+}
+
+TEST_F(ExecutionTest, Msad4Test) { // table-driven check of the "Msad4" shader op (ref, src, accum -> result)
+    WEX::TestExecution::SetVerifyOutput verifySettings(
+        WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+    CComPtr<IStream> pStream;
+    ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+    CComPtr<ID3D12Device> pDevice;
+    if (!CreateDevice(&pDevice)) {
+        return; // no device was created, so there is nothing to run
+    }
+    size_t tableSize = sizeof(Msad4OpParameters) / sizeof(TableParameter); // entries in the parameter array
+    TableParameterHandler handler(Msad4OpParameters, tableSize);
+    handler.clearTableParameter();
+    VERIFY_SUCCEEDED(ParseTableRow(Msad4OpParameters, tableSize));
+
+    CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str); // only the shader text comes from the table in this test
+    double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double; // NOTE(review): read as double, later passed to the *Int verifier
+    unsigned int count = handler.GetTableParamByName(L"Validation.NumInput")->m_uint; // number of elements to fill and verify
+
+    WEX::TestExecution::TestDataArray<unsigned int> *Validation_Reference =
+        &handler.GetTableParamByName(L"Validation.Reference")->m_uintTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Source = // vector columns are stored as strings and parsed below
+        &handler.GetTableParamByName(L"Validation.Source")->m_StringTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Accum =
+        &handler.GetTableParamByName(L"Validation.Accum")->m_StringTable;
+    WEX::TestExecution::TestDataArray<WEX::Common::String> *Validation_Expected =
+        &handler.GetTableParamByName(L"Validation.Expected")->m_StringTable;
+
+    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
+        pDevice, m_support, pStream, "Msad4",
+        // this callback is called when the test
+        // is creating the resource to run the test; it fills Data with the inputs
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "SMsad4"));
+        size_t size = sizeof(SMsad4) * count; // one SMsad4 record per element
+        Data.resize(size);
+        SMsad4 *pPrimitives = (SMsad4*)Data.data();
+        for (size_t i = 0; i < count; ++i) {
+            SMsad4 *p = &pPrimitives[i];
+            XMUINT2 src;
+            XMUINT4 accum;
+            VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Source)[i], (unsigned int*)&src, 2)); // NOTE(review): indexed by i with no wrap-around - presumably each table needs >= count rows
+            VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Accum)[i], (unsigned int*)&accum, 4));
+            p->ref = (*Validation_Reference)[i];
+            p->src = src;
+            p->accum = accum;
+        }
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Text = Text.m_psz;
+    });
+
+    MappedData data;
+    test->Test->GetReadBackData("SMsad4", &data);
+
+    SMsad4 *pPrimitives = (SMsad4*)data.data();
+    WEX::TestExecution::DisableVerifyExceptions dve; // presumably lets a failed verify log and continue - confirm in TAEF docs
+    for (size_t i = 0; i < count; ++i) {
+        SMsad4 *p = &pPrimitives[i];
+        XMUINT4 result; // expected result parsed from the table row for this element
+        VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Expected)[i],
+                                               (unsigned int *)&result, 4));
+        LogCommentFmt(
+            L"element #%u, ref = %u(0x%08x), src = %u(0x%08x), %u(0x%08x), " // NOTE(review): i is size_t but is formatted with %u
+            L"accum = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
+            L"result = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
+            L"expected = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x)", i,
+            p->ref, p->ref, p->src.x, p->src.x, p->src.y, p->src.y, p->accum.x,
+            p->accum.x, p->accum.y, p->accum.y, p->accum.z, p->accum.z,
+            p->accum.w, p->accum.w, p->result.x, p->result.x, p->result.y,
+            p->result.y, p->result.z, p->result.z, p->result.w, p->result.w,
+            result.x, result.x, result.y, result.y, result.z, result.z,
+            result.w, result.w);
+
+        VerifyOutputWithExpectedValueInt(p->result.x, result.x, tolerance); // each of the four result components is checked separately
+        VerifyOutputWithExpectedValueInt(p->result.y, result.y, tolerance);
+        VerifyOutputWithExpectedValueInt(p->result.z, result.z, tolerance);
+        VerifyOutputWithExpectedValueInt(p->result.w, result.w, tolerance);
+    }
+}
+
+template <class T1, class T2> // T1: per-thread input type, T2: per-thread output type (see PerThreadData)
+void ExecutionTest::WaveIntrinsicsActivePrefixTest(
+    TableParameter *pParameterList, size_t numParameter, bool isPrefix) { // isPrefix selects prefix-style vs whole-wave expected values
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+
+  // Resource representation for compute shader
+  // firstLaneId is used to group different waves
+  struct PerThreadData {
+      int firstLaneId;
+      int mask;
+      T1 input;
+      T2 output;
+  };
+
+  unsigned int NumThreadsX = 8;
+  unsigned int NumThreadsY = 12;
+  unsigned int NumThreadsZ = 1;
+
+  static const unsigned int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ; // 8 * 12 * 1 = 96
+  static const unsigned int DispatchGroupCount = 1;
+  static const unsigned int ThreadCount = ThreadsPerGroup * DispatchGroupCount; // number of PerThreadData records
+  CComPtr<IStream> pStream;
+  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
+
+  CComPtr<ID3D12Device> pDevice;
+  if (!CreateDevice(&pDevice)) {
+    return; // no device was created, so there is nothing to run
+  }
+  if (!DoesDeviceSupportWaveOps(pDevice)) {
+    // Optional feature, so it's correct to not support it if declared as such.
+    WEX::Logging::Log::Comment(L"Device does not support wave operations.");
+    return;
+  }
+
+  TableParameterHandler handler(pParameterList, numParameter);
+  handler.clearTableParameter();
+  VERIFY_SUCCEEDED(ParseTableRow(pParameterList, numParameter));
+
+  unsigned int numInputSet = handler.GetTableParamByName(L"Validation.NumInputSet")->m_uint;
+
+  // Obtain the list of input lists
+  typedef WEX::TestExecution::TestDataArray<T1> DataArray;
+  std::vector<DataArray*> InputList;
+  for (unsigned int i = 0;
+    i < numInputSet; ++i) {
+    std::wstring inputName = L"Validation.InputSet"; // columns are named Validation.InputSet1 .. InputSet<numInputSet>
+    inputName.append(std::to_wstring(i + 1));
+    InputList.push_back(handler.GetDataArray<T1>(inputName.data()));
+  }
+  CW2A Text(handler.GetTableParamByName(L"ShaderOp.text")->m_str); // NOTE(review): key is "ShaderOp.text" here but "ShaderOp.Text" in other tests - confirm the lookup ignores case
+
+  std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
+  st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get()); // parsed once here and handed to every run below
+
+  // Running compute shader for each input set with different masks
+  for (size_t setIndex = 0; setIndex < numInputSet; ++setIndex) {
+    for (size_t maskIndex = 0; maskIndex < sizeof(MaskFunctionTable) / sizeof(MaskFunction); ++maskIndex) {
+      std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
+        pDevice, m_support, pStream, "WaveIntrinsicsOp",
+        // this callback is called when the test
+        // is creating the resource to run the test; it fills Data with the inputs
+        [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
+        VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp"));
+        size_t size = sizeof(PerThreadData) * ThreadCount;
+        Data.resize(size);
+        PerThreadData *pPrimitives = (PerThreadData*)Data.data();
+        // 4 different inputs for each operation test
+        size_t index = 0;
+        while (index < ThreadCount) {
+          PerThreadData *p = &pPrimitives[index];
+          DataArray *IntList = InputList[setIndex];
+          p->mask = MaskFunctionTable[maskIndex](index); // mask function maps the thread index to that thread's mask value
+          p->input = (*IntList)[index % IntList->GetSize()]; // wraps when ThreadCount exceeds the input set length
+          p->output = 0xFFFFBFFF; // pre-fill with a fixed bit pattern (presumably a "not written" sentinel)
+          index++;
+        }
+        // use shader from data table
+        pShaderOp->Shaders.at(0).Text = Text.m_psz;
+      }, ShaderOpSet);
+
+      // Check the value
+      MappedData data;
+      test->Test->GetReadBackData("SWaveIntrinsicsOp", &data);
+
+      PerThreadData *pPrimitives = (PerThreadData*)data.data();
+      WEX::TestExecution::DisableVerifyExceptions dve; // presumably lets a failed verify log and continue - confirm in TAEF docs
+
+      // Grouping data by waves
+      std::vector<int> firstLaneIds; // distinct firstLaneId values, in order of first appearance
+      for (size_t i = 0; i < ThreadCount; ++i) {
+        PerThreadData *p = &pPrimitives[i];
+        int firstLaneId = p->firstLaneId;
+        if (!contains(firstLaneIds, firstLaneId)) {
+          firstLaneIds.push_back(firstLaneId);
+        }
+      }
+
+      std::map<int, std::unique_ptr<std::vector<PerThreadData *>>> waves; // firstLaneId -> threads that reported that id
+      for (size_t i = 0; i < firstLaneIds.size(); ++i) {
+        waves[firstLaneIds.at(i)] = std::make_unique<std::vector<PerThreadData*>>(std::vector<PerThreadData*>());
+      }
+
+      for (size_t i = 0; i < ThreadCount; ++i) {
+        PerThreadData *p = &pPrimitives[i];
+        waves[p->firstLaneId].get()->push_back(p); // threads keep their read-back order within a wave
+      }
+
+      // validate for each wave
+      for (size_t i = 0; i < firstLaneIds.size(); ++i) {
+        // collect inputs and masks for a given wave
+        std::vector<PerThreadData *> *waveData = waves[firstLaneIds.at(i)].get();
+        std::vector<T1> inputList(waveData->size());
+        std::vector<int> maskList(waveData->size());
+        std::wstring inputStr = L"Wave Inputs: ";
+        std::wstring maskStr =  L"Wave Mask:   ";
+        for (size_t j = 0; j < waveData->size(); ++j) {
+          inputList.at(j) = (waveData->at(j)->input);
+          maskList.at(j) = (waveData->at(j)->mask);
+          inputStr.append(std::to_wstring(waveData->at(j)->input));
+          inputStr.append(L" ");
+          maskStr.append(std::to_wstring(waveData->at(j)->mask));
+          maskStr.append(L" ");
+        }
+        LogCommentFmt(inputStr.data()); // NOTE(review): the built string is passed as the format argument itself
+        LogCommentFmt(maskStr.data());
+        // Compute expected output for a given inputs, masks, and index
+        for (size_t laneIndex = 0; laneIndex < waveData->size(); ++laneIndex) {
+          T2 expected;
+          // WaveActive is equivalent to WavePrefix lane # lane count
+          unsigned int index = isPrefix ? laneIndex : waveData->size(); // prefix: this lane's index; otherwise: the wave's lane count
+          if (waveData->at(laneIndex)->mask == 1) { // the two branches differ only in the third argument (1 vs 0)
+            expected = computeExpectedWithShaderOp<T1, T2>(
+              inputList, maskList, 1, index,
+              handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+          }
+          else {
+            expected = computeExpectedWithShaderOp<T1, T2>(
+              inputList, maskList, 0, index,
+              handler.GetTableParamByName(L"ShaderOp.Name")->m_str);
+          }
+          // TODO: use different comparison for floating point inputs
+          bool equal = waveData->at(laneIndex)->output == expected;
+          if (!equal) {
+            LogCommentFmt(L"lane%d: %4d, Expected : %4d", laneIndex, waveData->at(laneIndex)->output, expected); // NOTE(review): laneIndex is size_t but is formatted with %d
+          }
+          VERIFY_IS_TRUE(equal);
+        }
+      }
+    }
+  }
+}
+
+static const unsigned int MinWarpVersionForWaveIntrinsics = 16202; // passed to IsValidWarpDllVersion by the wave intrinsics tests below
+
+TEST_F(ExecutionTest, WaveIntrinsicsActiveIntTest) { // int inputs and outputs, isPrefix = false
+  if (GetTestParamUseWARP(true) &&
+      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
+    return; // WARP is selected but its DLL version check failed: skip without running
+  }
+  WaveIntrinsicsActivePrefixTest<int, int>(
+      WaveIntrinsicsActiveIntParameters,
+      sizeof(WaveIntrinsicsActiveIntParameters) / sizeof(TableParameter), // entries in the parameter array
+      /*isPrefix*/ false);
+}
+
+TEST_F(ExecutionTest, WaveIntrinsicsActiveUintTest) { // unsigned int inputs and outputs, isPrefix = false
+  if (GetTestParamUseWARP(true) &&
+      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
+    return; // WARP is selected but its DLL version check failed: skip without running
+  }
+  WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
+      WaveIntrinsicsActiveUintParameters,
+      sizeof(WaveIntrinsicsActiveUintParameters) / sizeof(TableParameter), // entries in the parameter array
+      /*isPrefix*/ false);
+}
+
+TEST_F(ExecutionTest, WaveIntrinsicsPrefixIntTest) { // int inputs and outputs, isPrefix = true
+  if (GetTestParamUseWARP(true) &&
+      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
+    return; // WARP is selected but its DLL version check failed: skip without running
+  }
+  WaveIntrinsicsActivePrefixTest<int, int>(
+      WaveIntrinsicsPrefixIntParameters,
+      sizeof(WaveIntrinsicsPrefixIntParameters) / sizeof(TableParameter), // entries in the parameter array
+      /*isPrefix*/ true);
+}
+
+TEST_F(ExecutionTest, WaveIntrinsicsPrefixUintTest) { // unsigned int inputs and outputs, isPrefix = true
+  if (GetTestParamUseWARP(true) &&
+      !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
+    return; // WARP is selected but its DLL version check failed: skip without running
+  }
+  WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
+      WaveIntrinsicsPrefixUintParameters,
+      sizeof(WaveIntrinsicsPrefixUintParameters) / sizeof(TableParameter), // entries in the parameter array
+      /*isPrefix*/ true);
+}
+
+static void WriteReadBackDump(st::ShaderOp *pShaderOp, st::ShaderOpTest *pTest, // builds a text dump of every resource flagged ReadBack
+                              char **pReadBackDump) { // out: NUL-terminated dump; ownership passes to the caller via Detach()
+  std::stringstream str;
+
+  unsigned count = 0; // number of ReadBack resources visited
+  for (auto &R : pShaderOp->Resources) {
+    if (!R.ReadBack)
+      continue;
+    ++count;
+    str << "Resource: " << R.Name << "\r\n";
+    // Find a descriptor that can tell us how to dump this resource.
+    bool found = false; // set once a descriptor whose ResName matches R.Name is seen
+    for (auto &Heaps : pShaderOp->DescriptorHeaps) {
+      for (auto &D : Heaps.Descriptors) {
+        if (_stricmp(D.ResName, R.Name) != 0) { // case-insensitive name comparison
+          continue;
+        }
+        found = true;
+        if (_stricmp(D.Kind, "UAV") != 0) {
+          str << "Resource dump for kind " << D.Kind << " not implemented yet.\r\n";
+          break;
+        }
+        if (D.UavDesc.ViewDimension != D3D12_UAV_DIMENSION_BUFFER) {
+          str << "Resource dump for this kind of view dimension not implemented yet.\r\n";
+          break;
+        }
+        // We can map back to the structure if a structured buffer via the shader, but
+        // we'll keep this simple and simply dump out 32-bit uint/float representations.
+        MappedData data;
+        pTest->GetReadBackData(R.Name, &data);
+        uint32_t *pData = (uint32_t *)data.data();
+        size_t u32_count = R.Desc.Width / sizeof(uint32_t); // resource width expressed in 32-bit words
+        for (size_t i = 0; i < u32_count; ++i) {
+          float f = *(float *)pData; // reinterpret the same 32 bits as a float
+          str << i << ": 0n" << *pData << "   0x" << std::hex << *pData // one line per word: index, decimal, hex, float
+              << std::dec << "   " << f << "\r\n";
+          ++pData;
+        }
+        break; // only the first matching descriptor is used
+      }
+      if (found) break; // stop scanning the remaining heaps once a match was handled
+    }
+    if (!found) {
+      str << "Unable to find a view for the resource.\r\n";
+    }
+  }
+
+  str << "Resources read back: " << count << "\r\n";
+
+  std::string s(str.str());
+  CComHeapPtr<char> pDump;
+  if (!pDump.Allocate(s.size() + 1)) // +1 for the terminating NUL written below
+    throw std::bad_alloc();
+  memcpy(pDump.m_pData, s.data(), s.size());
+  pDump.m_pData[s.size()] = '\0';
+  *pReadBackDump = pDump.Detach(); // presumably released by the caller with CComHeapPtr's matching free routine - confirm
+}
+
+// This is the exported interface by use from HLSLHost.exe.
+// It's exclusive with the use of the DLL as a TAEF target.
+extern "C" {
+  __declspec(dllexport) HRESULT WINAPI InitializeOpTests(void *pStrCtx, st::OutputStringFn pOutputStrFn) {
+    HRESULT hr = EnableExperimentalShaderModels();
+    if (FAILED(hr)) {
+      pOutputStrFn(pStrCtx, L"Unable to enable experimental shader models.\r\n.");
+    }
+    return S_OK;
+  }
+
+  __declspec(dllexport) HRESULT WINAPI
+      RunOpTest(void *pStrCtx, st::OutputStringFn pOutputStrFn, LPCSTR pText,
+                ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue,
+                ID3D12Resource *pRenderTarget, char **pReadBackDump) {
+
+    HRESULT hr;
+    if (pReadBackDump) *pReadBackDump = nullptr;
+    st::SetOutputFn(pStrCtx, pOutputStrFn);
+    CComPtr<ID3D12InfoQueue> pInfoQueue;
+    CComHeapPtr<char> pDump;
+    bool FilterCreation = false;
+    if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
+      // Creation is largely driven by inputs, so don't log create/destroy messages.
+      pInfoQueue->PushEmptyStorageFilter();
+      pInfoQueue->PushEmptyRetrievalFilter();
+      if (FilterCreation) {
+        D3D12_INFO_QUEUE_FILTER filter;
+        D3D12_MESSAGE_CATEGORY denyCategories[] = { D3D12_MESSAGE_CATEGORY_STATE_CREATION };
+        ZeroMemory(&filter, sizeof(filter));
+        filter.DenyList.NumCategories = _countof(denyCategories);
+        filter.DenyList.pCategoryList = denyCategories;
+        pInfoQueue->PushStorageFilter(&filter);
+      }
+    }
+    else {
+      pOutputStrFn(pStrCtx, L"Unable to enable info queue for D3D.\r\n.");
+    }
+    try {
+      dxc::DxcDllSupport m_support;
+      m_support.Initialize();
+
+      const char *pName = nullptr;
+      CComPtr<IStream> pStream = SHCreateMemStream((BYTE *)pText, strlen(pText));
+      std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
+        std::make_shared<st::ShaderOpSet>();
+      st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
+      st::ShaderOp *pShaderOp;
+      if (pName == nullptr) {
+        if (ShaderOpSet->ShaderOps.size() != 1) {
+          pOutputStrFn(pStrCtx, L"Expected a single shader operation.\r\n");
+          return E_FAIL;
+        }
+        pShaderOp = ShaderOpSet->ShaderOps[0].get();
+      }
+      else {
+        pShaderOp = ShaderOpSet->GetShaderOp(pName);
+      }
+      if (pShaderOp == nullptr) {
+        std::string msg = "Unable to find shader op ";
+        msg += pName;
+        msg += "; available ops";
+        const char sep = ':';
+        for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
+          msg += sep;
+          msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
+        }
+        CA2W msgWide(msg.c_str());
+        pOutputStrFn(pStrCtx, msgWide);
+        return E_FAIL;
+      }
+
+      std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
+      test->SetupRenderTarget(pShaderOp, pDevice, pCommandQueue, pRenderTarget);
+      test->SetDxcSupport(&m_support);
+      test->RunShaderOp(pShaderOp);
+      test->PresentRenderTarget(pShaderOp, pCommandQueue, pRenderTarget);
+
+      pOutputStrFn(pStrCtx, L"Rendering complete.\r\n");
+
+      if (!pShaderOp->IsCompute()) {
+        D3D12_QUERY_DATA_PIPELINE_STATISTICS stats;
+        test->GetPipelineStats(&stats);
+        wchar_t statsText[400];
+        StringCchPrintfW(statsText, _countof(statsText),
+          L"Vertices/primitives read by input assembler: %I64u/%I64u\r\n"
+          L"Vertex shader invocations: %I64u\r\n"
+          L"Geometry shader invocations/output primitive: %I64u/%I64u\r\n"
+          L"Primitives sent to rasterizer/rendered: %I64u/%I64u\r\n"
+          L"PS/HS/DS/CS invocations: %I64u/%I64u/%I64u/%I64u\r\n",
+          stats.IAVertices, stats.IAPrimitives, stats.VSInvocations,
+          stats.GSInvocations, stats.GSPrimitives, stats.CInvocations,
+          stats.CPrimitives, stats.PSInvocations, stats.HSInvocations,
+          stats.DSInvocations, stats.CSInvocations);
+        pOutputStrFn(pStrCtx, statsText);
+      }
+
+      if (pReadBackDump) {
+        WriteReadBackDump(pShaderOp, test.get(), &pDump);
+      }
+
+      hr = S_OK;
+    }
+    catch (const CAtlException &E)
+    {
+      hr = E.m_hr;
+    }
+    catch (const std::bad_alloc &)
+    {
+      hr = E_OUTOFMEMORY;
+    }
+    catch (const std::exception &)
+    {
+      hr = E_FAIL;
+    }
+
+    // Drain the device message queue if available.
+    if (pInfoQueue != nullptr) {
+      wchar_t buf[200];
+      StringCchPrintfW(buf, _countof(buf),
+        L"NumStoredMessages=%u limit/discarded by limit=%u/%u "
+        L"allowed/denied by storage filter=%u/%u "
+        L"NumStoredMessagesAllowedByRetrievalFilter=%u\r\n",
+        (unsigned)pInfoQueue->GetNumStoredMessages(),
+        (unsigned)pInfoQueue->GetMessageCountLimit(),
+        (unsigned)pInfoQueue->GetNumMessagesDiscardedByMessageCountLimit(),
+        (unsigned)pInfoQueue->GetNumMessagesAllowedByStorageFilter(),
+        (unsigned)pInfoQueue->GetNumMessagesDeniedByStorageFilter(),
+        (unsigned)pInfoQueue->GetNumStoredMessagesAllowedByRetrievalFilter());
+      pOutputStrFn(pStrCtx, buf);
+
+      WriteInfoQueueMessages(pStrCtx, pOutputStrFn, pInfoQueue);
+
+      pInfoQueue->ClearStoredMessages();
+      pInfoQueue->PopRetrievalFilter();
+      pInfoQueue->PopStorageFilter();
+      if (FilterCreation) {
+        pInfoQueue->PopStorageFilter();
+      }
+    }
+
+    if (pReadBackDump) *pReadBackDump = pDump.Detach();
+
+    return hr;
+  }
+}
diff --git a/tools/clang/unittests/HLSL/HlslTestUtils.h b/tools/clang/unittests/HLSL/HlslTestUtils.h
index afd2ef103..7b4d99c43 100644
--- a/tools/clang/unittests/HLSL/HlslTestUtils.h
+++ b/tools/clang/unittests/HLSL/HlslTestUtils.h
@@ -1,333 +1,333 @@
-///////////////////////////////////////////////////////////////////////////////
-//                                                                           //
-// HlslTestUtils.h                                                           //
-// Copyright (C) Microsoft Corporation. All rights reserved.                 //
-// This file is distributed under the University of Illinois Open Source     //
-// License. See LICENSE.TXT for details.                                     //
-//                                                                           //
-// Provides utility functions for HLSL tests.                                //
-//                                                                           //
-///////////////////////////////////////////////////////////////////////////////
-
-#include <string>
-#include <sstream>
-#include <fstream>
-#include "dxc/Support/Unicode.h"
-#include <dxgiformat.h>
-
-// If TAEF verify macros are available, use them to alias other legacy
-// comparison macros that don't have a direct translation.
-//
-// Other common replacements are as follows.
-//
-// EXPECT_EQ -> VERIFY_ARE_EQUAL
-// ASSERT_EQ -> VERIFY_ARE_EQUAL
-//
-// Note that whether verification throws or continues depends on
-// preprocessor settings.
-
-#ifdef VERIFY_ARE_EQUAL
-#define EXPECT_STREQ(a, b) VERIFY_ARE_EQUAL(0, strcmp(a, b))
-#define EXPECT_STREQW(a, b) VERIFY_ARE_EQUAL(0, wcscmp(a, b))
-#define VERIFY_ARE_EQUAL_CMP(a, b, ...) VERIFY_IS_TRUE(a == b, __VA_ARGS__)
-#define VERIFY_ARE_EQUAL_STR(a, b, ...) { \
-  const char *pTmpA = (a);\
-  const char *pTmpB = (b);\
-  if (0 != strcmp(pTmpA, pTmpB)) {\
-    CA2W conv(pTmpB, CP_UTF8); WEX::Logging::Log::Comment(conv);\
-    const char *pA = pTmpA; const char *pB = pTmpB; \
-    while(*pA == *pB) { pA++; pB++; } \
-    wchar_t diffMsg[32]; swprintf_s(diffMsg, _countof(diffMsg), L"diff at %u", (unsigned)(pA-pTmpA)); \
-    WEX::Logging::Log::Comment(diffMsg); \
-  } \
-  VERIFY_ARE_EQUAL(0, strcmp(pTmpA, pTmpB), __VA_ARGS__); \
-}
-#define VERIFY_ARE_EQUAL_WSTR(a, b, ...) { \
-  if (0 != wcscmp(a, b)) { WEX::Logging::Log::Comment(b);} \
-  VERIFY_ARE_EQUAL(0, wcscmp(a, b), __VA_ARGS__); \
-}
-#define ASSERT_EQ(expected, actual) VERIFY_ARE_EQUAL(expected, actual)
-#define ASSERT_NE(expected, actual) VERIFY_ARE_NOT_EQUAL(expected, actual)
-#define TEST_F(typeName, functionName) void typeName::functionName()
-#define ASSERT_HRESULT_SUCCEEDED VERIFY_SUCCEEDED
-#define EXPECT_EQ(expected, actual) VERIFY_ARE_EQUAL(expected, actual)
-#endif
-
-namespace hlsl_test {
-
-inline std::wstring
-vFormatToWString(_In_z_ _Printf_format_string_ const wchar_t *fmt, va_list argptr) {
-  std::wstring result;
-  int len = _vscwprintf(fmt, argptr);
-  result.resize(len + 1);
-  vswprintf_s((wchar_t *)result.data(), len + 1, fmt, argptr);
-  return result;
-}
-
-inline std::wstring
-FormatToWString(_In_z_ _Printf_format_string_ const wchar_t *fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-  std::wstring result(vFormatToWString(fmt, args));
-  va_end(args);
-  return result;
-}
-
-inline void LogCommentFmt(_In_z_ _Printf_format_string_ const wchar_t *fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-  std::wstring buf(vFormatToWString(fmt, args));
-  va_end(args);
-  WEX::Logging::Log::Comment(buf.data());
-}
-
-inline void LogErrorFmt(_In_z_ _Printf_format_string_ const wchar_t *fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    std::wstring buf(vFormatToWString(fmt, args));
-    va_end(args);
-    WEX::Logging::Log::Error(buf.data());
-}
-
-inline std::wstring GetPathToHlslDataFile(const wchar_t* relative) {
-  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
-  WEX::Common::String HlslDataDirValue;
-  ASSERT_HRESULT_SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue(L"HlslDataDir", HlslDataDirValue));
-
-  wchar_t envPath[MAX_PATH];
-  wchar_t expanded[MAX_PATH];
-  swprintf_s(envPath, _countof(envPath), L"%s\\%s", reinterpret_cast<wchar_t*>(HlslDataDirValue.GetBuffer()), relative);
-  VERIFY_WIN32_BOOL_SUCCEEDED(ExpandEnvironmentStringsW(envPath, expanded, _countof(expanded)));
-  return std::wstring(expanded);
-}
-
-inline bool PathLooksAbsolute(LPCWSTR name) {
-  // Very simplified, only for the cases we care about in the test suite.
-  return name && *name && ((*name == L'\\') || (name[1] == L':'));
-}
-
-inline std::string GetFirstLine(LPCWSTR name) {
-  char firstLine[300];
-  memset(firstLine, 0, sizeof(firstLine));
-
-  const std::wstring path = PathLooksAbsolute(name)
-                                ? std::wstring(name)
-                                : hlsl_test::GetPathToHlslDataFile(name);
-  std::ifstream infile(path);
-  if (infile.bad()) {
-    std::wstring errMsg(L"Unable to read file ");
-    errMsg += path;
-    WEX::Logging::Log::Error(errMsg.c_str());
-    VERIFY_FAIL();
-  }
-
-  infile.getline(firstLine, _countof(firstLine));
-  return firstLine;
-}
-
-inline HANDLE CreateFileForReading(LPCWSTR path) {
-  HANDLE sourceHandle = CreateFileW(path, GENERIC_READ, 0, 0, OPEN_EXISTING, 0, 0);
-  if (sourceHandle == INVALID_HANDLE_VALUE) {
-    DWORD err = GetLastError();
-    std::wstring errorMessage(FormatToWString(L"Unable to open file '%s', err=%u", path, err).c_str());
-    VERIFY_SUCCEEDED(HRESULT_FROM_WIN32(err), errorMessage.c_str());
-  }
-  return sourceHandle;
-}
-
-inline HANDLE CreateNewFileForReadWrite(LPCWSTR path) {
-  HANDLE sourceHandle = CreateFileW(path, GENERIC_READ | GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0);
-  if (sourceHandle == INVALID_HANDLE_VALUE) {
-    DWORD err = GetLastError();
-    std::wstring errorMessage(FormatToWString(L"Unable to create file '%s', err=%u", path, err).c_str());
-    VERIFY_SUCCEEDED(HRESULT_FROM_WIN32(err), errorMessage.c_str());
-  }
-  return sourceHandle;
-}
-
-inline bool GetTestParamBool(LPCWSTR name) {
-  WEX::Common::String ParamValue;
-  WEX::Common::String NameValue;
-  if (FAILED(WEX::TestExecution::RuntimeParameters::TryGetValue(name,
-                                                                ParamValue))) {
-    return false;
-  }
-  if (ParamValue.IsEmpty()) {
-    return false;
-  }
-  if (0 == wcscmp(ParamValue, L"*")) {
-    return true;
-  }
-  VERIFY_SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue(
-      L"TestName", NameValue));
-  if (NameValue.IsEmpty()) {
-    return false;
-  }
-  return Unicode::IsStarMatchUTF16(ParamValue, ParamValue.GetLength(),
-                                   NameValue, NameValue.GetLength());
-}
-
-inline bool GetTestParamUseWARP(bool defaultVal) {
-  WEX::Common::String AdapterValue;
-  if (FAILED(WEX::TestExecution::RuntimeParameters::TryGetValue(
-          L"Adapter", AdapterValue))) {
-    return defaultVal;
-  }
-  if (defaultVal && AdapterValue.IsEmpty() ||
-      AdapterValue.CompareNoCase(L"WARP") == 0) {
-    return true;
-  }
-  return false;
-}
-
-}
-
-inline bool isdenorm(float f) {
-  return FP_SUBNORMAL == fpclassify(f);
-}
-
-inline bool isdenorm(double d) {
-  return FP_SUBNORMAL == fpclassify(d);
-}
-
-inline float ifdenorm_flushf(float a) {
-  return isdenorm(a) ? copysign(0.0f, a) : a;
-}
-
-inline bool ifdenorm_flushf_eq(float a, float b) {
-  return ifdenorm_flushf(a) == ifdenorm_flushf(b);
-}
-
-inline bool ifdenorm_flushf_eq_or_nans(float a, float b) {
-  if (isnan(a) && isnan(b)) return true;
-  return ifdenorm_flushf(a) == ifdenorm_flushf(b);
-}
-
-inline bool CompareFloatULP(const float &fsrc, const float &fref, int ULPTolerance) {
-    if (isnan(fsrc)) {
-        return isnan(fref);
-    }
-    if (isdenorm(fref)) { // Arithmetic operations of denorm may flush to sign-preserved zero
-        return (isdenorm(fsrc) || fsrc == 0) && (signbit(fsrc) == signbit(fref));
-    }
-    if (fsrc == fref) {
-        return true;
-    }
-    int diff = *((DWORD *)&fsrc) - *((DWORD *)&fref);
-    unsigned int uDiff = diff < 0 ? -diff : diff;
-    return uDiff <= (unsigned int)ULPTolerance;
-}
-
-inline bool CompareFloatEpsilon(const float &fsrc, const float &fref, float epsilon) {
-    if (isnan(fsrc)) {
-        return isnan(fref);
-    }
-    if (isdenorm(fref)) { // Arithmetic operations of denorm may flush to sign-preserved zero
-        return (isdenorm(fsrc) || fsrc == 0) && (signbit(fsrc) == signbit(fref));
-    }
-    return fsrc == fref || fabsf(fsrc - fref) < epsilon;
-}
-
-// Compare using relative error (relative error < 2^{nRelativeExp})
-inline bool CompareFloatRelativeEpsilon(const float &fsrc, const float &fref, int nRelativeExp) {
-    return CompareFloatULP(fsrc, fref, 23 - nRelativeExp);
-}
-
-// returns the number of bytes per pixel for a given dxgi format
-// add more cases if different format needed to copy back resources
-inline UINT GetByteSizeForFormat(DXGI_FORMAT value) {
-    switch (value) {
-    case DXGI_FORMAT_R32G32B32A32_TYPELESS: return 16;
-    case DXGI_FORMAT_R32G32B32A32_FLOAT: return 16;
-    case DXGI_FORMAT_R32G32B32A32_UINT: return 16;
-    case DXGI_FORMAT_R32G32B32A32_SINT: return 16;
-    case DXGI_FORMAT_R32G32B32_TYPELESS: return 12;
-    case DXGI_FORMAT_R32G32B32_FLOAT: return 12;
-    case DXGI_FORMAT_R32G32B32_UINT: return 12;
-    case DXGI_FORMAT_R32G32B32_SINT: return 12;
-    case DXGI_FORMAT_R16G16B16A16_TYPELESS: return 8;
-    case DXGI_FORMAT_R16G16B16A16_FLOAT: return 8;
-    case DXGI_FORMAT_R16G16B16A16_UNORM: return 8;
-    case DXGI_FORMAT_R16G16B16A16_UINT: return 8;
-    case DXGI_FORMAT_R16G16B16A16_SNORM: return 8;
-    case DXGI_FORMAT_R16G16B16A16_SINT: return 8;
-    case DXGI_FORMAT_R32G32_TYPELESS: return 8;
-    case DXGI_FORMAT_R32G32_FLOAT: return 8;
-    case DXGI_FORMAT_R32G32_UINT: return 8;
-    case DXGI_FORMAT_R32G32_SINT: return 8;
-    case DXGI_FORMAT_R32G8X24_TYPELESS: return 8;
-    case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: return 4;
-    case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS: return 4;
-    case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT: return 4;
-    case DXGI_FORMAT_R10G10B10A2_TYPELESS: return 4;
-    case DXGI_FORMAT_R10G10B10A2_UNORM: return 4;
-    case DXGI_FORMAT_R10G10B10A2_UINT: return 4;
-    case DXGI_FORMAT_R11G11B10_FLOAT: return 4;
-    case DXGI_FORMAT_R8G8B8A8_TYPELESS: return 4;
-    case DXGI_FORMAT_R8G8B8A8_UNORM: return 4;
-    case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: return 4;
-    case DXGI_FORMAT_R8G8B8A8_UINT: return 4;
-    case DXGI_FORMAT_R8G8B8A8_SNORM: return 4;
-    case DXGI_FORMAT_R8G8B8A8_SINT: return 4;
-    case DXGI_FORMAT_R16G16_TYPELESS: return 4;
-    case DXGI_FORMAT_R16G16_FLOAT: return 4;
-    case DXGI_FORMAT_R16G16_UNORM: return 4;
-    case DXGI_FORMAT_R16G16_UINT: return 4;
-    case DXGI_FORMAT_R16G16_SNORM: return 4;
-    case DXGI_FORMAT_R16G16_SINT: return 4;
-    case DXGI_FORMAT_R32_TYPELESS: return 4;
-    case DXGI_FORMAT_D32_FLOAT: return 4;
-    case DXGI_FORMAT_R32_FLOAT: return 4;
-    case DXGI_FORMAT_R32_UINT: return 4;
-    case DXGI_FORMAT_R32_SINT: return 4;
-    case DXGI_FORMAT_R24G8_TYPELESS: return 4;
-    case DXGI_FORMAT_D24_UNORM_S8_UINT: return 4;
-    case DXGI_FORMAT_R24_UNORM_X8_TYPELESS: return 4;
-    case DXGI_FORMAT_X24_TYPELESS_G8_UINT: return 4;
-    case DXGI_FORMAT_R8G8_TYPELESS: return 2;
-    case DXGI_FORMAT_R8G8_UNORM: return 2;
-    case DXGI_FORMAT_R8G8_UINT: return 2;
-    case DXGI_FORMAT_R8G8_SNORM: return 2;
-    case DXGI_FORMAT_R8G8_SINT: return 2;
-    case DXGI_FORMAT_R16_TYPELESS: return 2;
-    case DXGI_FORMAT_R16_FLOAT: return 2;
-    case DXGI_FORMAT_D16_UNORM: return 2;
-    case DXGI_FORMAT_R16_UNORM: return 2;
-    case DXGI_FORMAT_R16_UINT: return 2;
-    case DXGI_FORMAT_R16_SNORM: return 2;
-    case DXGI_FORMAT_R16_SINT: return 2;
-    case DXGI_FORMAT_R8_TYPELESS: return 1;
-    case DXGI_FORMAT_R8_UNORM: return 1;
-    case DXGI_FORMAT_R8_UINT: return 1;
-    case DXGI_FORMAT_R8_SNORM: return 1;
-    case DXGI_FORMAT_R8_SINT: return 1;
-    case DXGI_FORMAT_A8_UNORM: return 1;
-    case DXGI_FORMAT_R1_UNORM: return 1;
-    default:
-        VERIFY_FAILED(E_INVALIDARG);
-        return 0;
-    }
-}
-
-
-#define SIMPLE_IUNKNOWN_IMPL1(_IFACE_) \
-  private: volatile ULONG m_dwRef; \
-  public:\
-  ULONG STDMETHODCALLTYPE AddRef() { return InterlockedIncrement(&m_dwRef); } \
-  ULONG STDMETHODCALLTYPE Release() { \
-    ULONG result = InterlockedDecrement(&m_dwRef); \
-    if (result == 0) delete this; \
-    return result; \
-  } \
-  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void** ppvObject) { \
-    if (ppvObject == nullptr) return E_POINTER; \
-    if (IsEqualIID(iid, __uuidof(IUnknown)) || \
-      IsEqualIID(iid, __uuidof(INoMarshal)) || \
-      IsEqualIID(iid, __uuidof(_IFACE_))) { \
-      *ppvObject = reinterpret_cast<_IFACE_*>(this); \
-      reinterpret_cast<_IFACE_*>(this)->AddRef(); \
-      return S_OK; \
-    } \
-    return E_NOINTERFACE; \
-  }
-
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// HlslTestUtils.h                                                           //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// Provides utility functions for HLSL tests.                                //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include <string>
+#include <sstream>
+#include <fstream>
+#include "dxc/Support/Unicode.h"
+#include <dxgiformat.h>
+
+// If TAEF verify macros are available, use them to alias other legacy
+// comparison macros that don't have a direct translation.
+//
+// Other common replacements are as follows.
+//
+// EXPECT_EQ -> VERIFY_ARE_EQUAL
+// ASSERT_EQ -> VERIFY_ARE_EQUAL
+//
+// Note that whether verification throws or continues depends on
+// preprocessor settings.
+
+#ifdef VERIFY_ARE_EQUAL
+#define EXPECT_STREQ(a, b) VERIFY_ARE_EQUAL(0, strcmp(a, b))
+#define EXPECT_STREQW(a, b) VERIFY_ARE_EQUAL(0, wcscmp(a, b))
+#define VERIFY_ARE_EQUAL_CMP(a, b, ...) VERIFY_IS_TRUE(a == b, __VA_ARGS__)
+#define VERIFY_ARE_EQUAL_STR(a, b, ...) { \
+  const char *pTmpA = (a);\
+  const char *pTmpB = (b);\
+  if (0 != strcmp(pTmpA, pTmpB)) {\
+    CA2W conv(pTmpB, CP_UTF8); WEX::Logging::Log::Comment(conv);\
+    const char *pA = pTmpA; const char *pB = pTmpB; \
+    while(*pA == *pB) { pA++; pB++; } \
+    wchar_t diffMsg[32]; swprintf_s(diffMsg, _countof(diffMsg), L"diff at %u", (unsigned)(pA-pTmpA)); \
+    WEX::Logging::Log::Comment(diffMsg); \
+  } \
+  VERIFY_ARE_EQUAL(0, strcmp(pTmpA, pTmpB), __VA_ARGS__); \
+}
+#define VERIFY_ARE_EQUAL_WSTR(a, b, ...) { \
+  if (0 != wcscmp(a, b)) { WEX::Logging::Log::Comment(b);} \
+  VERIFY_ARE_EQUAL(0, wcscmp(a, b), __VA_ARGS__); \
+}
+#define ASSERT_EQ(expected, actual) VERIFY_ARE_EQUAL(expected, actual)
+#define ASSERT_NE(expected, actual) VERIFY_ARE_NOT_EQUAL(expected, actual)
+#define TEST_F(typeName, functionName) void typeName::functionName()
+#define ASSERT_HRESULT_SUCCEEDED VERIFY_SUCCEEDED
+#define EXPECT_EQ(expected, actual) VERIFY_ARE_EQUAL(expected, actual)
+#endif
+
+namespace hlsl_test {
+
+inline std::wstring
+vFormatToWString(_In_z_ _Printf_format_string_ const wchar_t *fmt, va_list argptr) {
+  std::wstring result;
+  int len = _vscwprintf(fmt, argptr);
+  result.resize(len + 1);
+  vswprintf_s((wchar_t *)result.data(), len + 1, fmt, argptr);
+  return result;
+}
+
+inline std::wstring
+FormatToWString(_In_z_ _Printf_format_string_ const wchar_t *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  std::wstring result(vFormatToWString(fmt, args));
+  va_end(args);
+  return result;
+}
+
+inline void LogCommentFmt(_In_z_ _Printf_format_string_ const wchar_t *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  std::wstring buf(vFormatToWString(fmt, args));
+  va_end(args);
+  WEX::Logging::Log::Comment(buf.data());
+}
+
+inline void LogErrorFmt(_In_z_ _Printf_format_string_ const wchar_t *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    std::wstring buf(vFormatToWString(fmt, args));
+    va_end(args);
+    WEX::Logging::Log::Error(buf.data());
+}
+
+inline std::wstring GetPathToHlslDataFile(const wchar_t* relative) {
+  WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
+  WEX::Common::String HlslDataDirValue;
+  ASSERT_HRESULT_SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue(L"HlslDataDir", HlslDataDirValue));
+
+  wchar_t envPath[MAX_PATH];
+  wchar_t expanded[MAX_PATH];
+  swprintf_s(envPath, _countof(envPath), L"%s\\%s", reinterpret_cast<wchar_t*>(HlslDataDirValue.GetBuffer()), relative);
+  VERIFY_WIN32_BOOL_SUCCEEDED(ExpandEnvironmentStringsW(envPath, expanded, _countof(expanded)));
+  return std::wstring(expanded);
+}
+
+inline bool PathLooksAbsolute(LPCWSTR name) {
+  // Very simplified, only for the cases we care about in the test suite.
+  return name && *name && ((*name == L'\\') || (name[1] == L':'));
+}
+
+inline std::string GetFirstLine(LPCWSTR name) {
+  char firstLine[300];
+  memset(firstLine, 0, sizeof(firstLine));
+
+  const std::wstring path = PathLooksAbsolute(name)
+                                ? std::wstring(name)
+                                : hlsl_test::GetPathToHlslDataFile(name);
+  std::ifstream infile(path);
+  if (infile.bad()) {
+    std::wstring errMsg(L"Unable to read file ");
+    errMsg += path;
+    WEX::Logging::Log::Error(errMsg.c_str());
+    VERIFY_FAIL();
+  }
+
+  infile.getline(firstLine, _countof(firstLine));
+  return firstLine;
+}
+
+inline HANDLE CreateFileForReading(LPCWSTR path) {
+  HANDLE sourceHandle = CreateFileW(path, GENERIC_READ, 0, 0, OPEN_EXISTING, 0, 0);
+  if (sourceHandle == INVALID_HANDLE_VALUE) {
+    DWORD err = GetLastError();
+    std::wstring errorMessage(FormatToWString(L"Unable to open file '%s', err=%u", path, err).c_str());
+    VERIFY_SUCCEEDED(HRESULT_FROM_WIN32(err), errorMessage.c_str());
+  }
+  return sourceHandle;
+}
+
+inline HANDLE CreateNewFileForReadWrite(LPCWSTR path) {
+  HANDLE sourceHandle = CreateFileW(path, GENERIC_READ | GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0);
+  if (sourceHandle == INVALID_HANDLE_VALUE) {
+    DWORD err = GetLastError();
+    std::wstring errorMessage(FormatToWString(L"Unable to create file '%s', err=%u", path, err).c_str());
+    VERIFY_SUCCEEDED(HRESULT_FROM_WIN32(err), errorMessage.c_str());
+  }
+  return sourceHandle;
+}
+
+inline bool GetTestParamBool(LPCWSTR name) {
+  WEX::Common::String ParamValue;
+  WEX::Common::String NameValue;
+  if (FAILED(WEX::TestExecution::RuntimeParameters::TryGetValue(name,
+                                                                ParamValue))) {
+    return false;
+  }
+  if (ParamValue.IsEmpty()) {
+    return false;
+  }
+  if (0 == wcscmp(ParamValue, L"*")) {
+    return true;
+  }
+  VERIFY_SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue(
+      L"TestName", NameValue));
+  if (NameValue.IsEmpty()) {
+    return false;
+  }
+  return Unicode::IsStarMatchUTF16(ParamValue, ParamValue.GetLength(),
+                                   NameValue, NameValue.GetLength());
+}
+
+inline bool GetTestParamUseWARP(bool defaultVal) {
+  WEX::Common::String AdapterValue;
+  if (FAILED(WEX::TestExecution::RuntimeParameters::TryGetValue(
+          L"Adapter", AdapterValue))) {
+    return defaultVal;
+  }
+  if (defaultVal && AdapterValue.IsEmpty() ||
+      AdapterValue.CompareNoCase(L"WARP") == 0) {
+    return true;
+  }
+  return false;
+}
+
+}
+
+inline bool isdenorm(float f) {
+  return FP_SUBNORMAL == fpclassify(f);
+}
+
+inline bool isdenorm(double d) {
+  return FP_SUBNORMAL == fpclassify(d);
+}
+
+inline float ifdenorm_flushf(float a) {
+  return isdenorm(a) ? copysign(0.0f, a) : a;
+}
+
+inline bool ifdenorm_flushf_eq(float a, float b) {
+  return ifdenorm_flushf(a) == ifdenorm_flushf(b);
+}
+
+inline bool ifdenorm_flushf_eq_or_nans(float a, float b) {
+  if (isnan(a) && isnan(b)) return true;
+  return ifdenorm_flushf(a) == ifdenorm_flushf(b);
+}
+
+inline bool CompareFloatULP(const float &fsrc, const float &fref, int ULPTolerance) {
+    if (isnan(fsrc)) {
+        return isnan(fref);
+    }
+    if (isdenorm(fref)) { // Arithmetic operations of denorm may flush to sign-preserved zero
+        return (isdenorm(fsrc) || fsrc == 0) && (signbit(fsrc) == signbit(fref));
+    }
+    if (fsrc == fref) {
+        return true;
+    }
+    int diff = *((DWORD *)&fsrc) - *((DWORD *)&fref);
+    unsigned int uDiff = diff < 0 ? -diff : diff;
+    return uDiff <= (unsigned int)ULPTolerance;
+}
+
+inline bool CompareFloatEpsilon(const float &fsrc, const float &fref, float epsilon) {
+    if (isnan(fsrc)) {
+        return isnan(fref);
+    }
+    if (isdenorm(fref)) { // Arithmetic operations of denorm may flush to sign-preserved zero
+        return (isdenorm(fsrc) || fsrc == 0) && (signbit(fsrc) == signbit(fref));
+    }
+    return fsrc == fref || fabsf(fsrc - fref) < epsilon;
+}
+
+// Compare using relative error (relative error < 2^{nRelativeExp})
+inline bool CompareFloatRelativeEpsilon(const float &fsrc, const float &fref, int nRelativeExp) {
+    return CompareFloatULP(fsrc, fref, 23 - nRelativeExp);
+}
+
+// returns the number of bytes per pixel for a given dxgi format
+// add more cases if different format needed to copy back resources
+inline UINT GetByteSizeForFormat(DXGI_FORMAT value) {
+    switch (value) {
+    case DXGI_FORMAT_R32G32B32A32_TYPELESS: return 16;
+    case DXGI_FORMAT_R32G32B32A32_FLOAT: return 16;
+    case DXGI_FORMAT_R32G32B32A32_UINT: return 16;
+    case DXGI_FORMAT_R32G32B32A32_SINT: return 16;
+    case DXGI_FORMAT_R32G32B32_TYPELESS: return 12;
+    case DXGI_FORMAT_R32G32B32_FLOAT: return 12;
+    case DXGI_FORMAT_R32G32B32_UINT: return 12;
+    case DXGI_FORMAT_R32G32B32_SINT: return 12;
+    case DXGI_FORMAT_R16G16B16A16_TYPELESS: return 8;
+    case DXGI_FORMAT_R16G16B16A16_FLOAT: return 8;
+    case DXGI_FORMAT_R16G16B16A16_UNORM: return 8;
+    case DXGI_FORMAT_R16G16B16A16_UINT: return 8;
+    case DXGI_FORMAT_R16G16B16A16_SNORM: return 8;
+    case DXGI_FORMAT_R16G16B16A16_SINT: return 8;
+    case DXGI_FORMAT_R32G32_TYPELESS: return 8;
+    case DXGI_FORMAT_R32G32_FLOAT: return 8;
+    case DXGI_FORMAT_R32G32_UINT: return 8;
+    case DXGI_FORMAT_R32G32_SINT: return 8;
+    case DXGI_FORMAT_R32G8X24_TYPELESS: return 8;
+    case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: return 4;
+    case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS: return 4;
+    case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT: return 4;
+    case DXGI_FORMAT_R10G10B10A2_TYPELESS: return 4;
+    case DXGI_FORMAT_R10G10B10A2_UNORM: return 4;
+    case DXGI_FORMAT_R10G10B10A2_UINT: return 4;
+    case DXGI_FORMAT_R11G11B10_FLOAT: return 4;
+    case DXGI_FORMAT_R8G8B8A8_TYPELESS: return 4;
+    case DXGI_FORMAT_R8G8B8A8_UNORM: return 4;
+    case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: return 4;
+    case DXGI_FORMAT_R8G8B8A8_UINT: return 4;
+    case DXGI_FORMAT_R8G8B8A8_SNORM: return 4;
+    case DXGI_FORMAT_R8G8B8A8_SINT: return 4;
+    case DXGI_FORMAT_R16G16_TYPELESS: return 4;
+    case DXGI_FORMAT_R16G16_FLOAT: return 4;
+    case DXGI_FORMAT_R16G16_UNORM: return 4;
+    case DXGI_FORMAT_R16G16_UINT: return 4;
+    case DXGI_FORMAT_R16G16_SNORM: return 4;
+    case DXGI_FORMAT_R16G16_SINT: return 4;
+    case DXGI_FORMAT_R32_TYPELESS: return 4;
+    case DXGI_FORMAT_D32_FLOAT: return 4;
+    case DXGI_FORMAT_R32_FLOAT: return 4;
+    case DXGI_FORMAT_R32_UINT: return 4;
+    case DXGI_FORMAT_R32_SINT: return 4;
+    case DXGI_FORMAT_R24G8_TYPELESS: return 4;
+    case DXGI_FORMAT_D24_UNORM_S8_UINT: return 4;
+    case DXGI_FORMAT_R24_UNORM_X8_TYPELESS: return 4;
+    case DXGI_FORMAT_X24_TYPELESS_G8_UINT: return 4;
+    case DXGI_FORMAT_R8G8_TYPELESS: return 2;
+    case DXGI_FORMAT_R8G8_UNORM: return 2;
+    case DXGI_FORMAT_R8G8_UINT: return 2;
+    case DXGI_FORMAT_R8G8_SNORM: return 2;
+    case DXGI_FORMAT_R8G8_SINT: return 2;
+    case DXGI_FORMAT_R16_TYPELESS: return 2;
+    case DXGI_FORMAT_R16_FLOAT: return 2;
+    case DXGI_FORMAT_D16_UNORM: return 2;
+    case DXGI_FORMAT_R16_UNORM: return 2;
+    case DXGI_FORMAT_R16_UINT: return 2;
+    case DXGI_FORMAT_R16_SNORM: return 2;
+    case DXGI_FORMAT_R16_SINT: return 2;
+    case DXGI_FORMAT_R8_TYPELESS: return 1;
+    case DXGI_FORMAT_R8_UNORM: return 1;
+    case DXGI_FORMAT_R8_UINT: return 1;
+    case DXGI_FORMAT_R8_SNORM: return 1;
+    case DXGI_FORMAT_R8_SINT: return 1;
+    case DXGI_FORMAT_A8_UNORM: return 1;
+    case DXGI_FORMAT_R1_UNORM: return 1;
+    default:
+        VERIFY_FAILED(E_INVALIDARG);
+        return 0;
+    }
+}
+
+
+#define SIMPLE_IUNKNOWN_IMPL1(_IFACE_) \
+  private: volatile ULONG m_dwRef; \
+  public:\
+  ULONG STDMETHODCALLTYPE AddRef() { return InterlockedIncrement(&m_dwRef); } \
+  ULONG STDMETHODCALLTYPE Release() { \
+    ULONG result = InterlockedDecrement(&m_dwRef); \
+    if (result == 0) delete this; \
+    return result; \
+  } \
+  HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void** ppvObject) { \
+    if (ppvObject == nullptr) return E_POINTER; \
+    if (IsEqualIID(iid, __uuidof(IUnknown)) || \
+      IsEqualIID(iid, __uuidof(INoMarshal)) || \
+      IsEqualIID(iid, __uuidof(_IFACE_))) { \
+      *ppvObject = reinterpret_cast<_IFACE_*>(this); \
+      reinterpret_cast<_IFACE_*>(this)->AddRef(); \
+      return S_OK; \
+    } \
+    return E_NOINTERFACE; \
+  }
+
diff --git a/tools/clang/unittests/HLSL/ShaderOpArithTable.xml b/tools/clang/unittests/HLSL/ShaderOpArithTable.xml
index 075122f4a..2f63a6edb 100644
--- a/tools/clang/unittests/HLSL/ShaderOpArithTable.xml
+++ b/tools/clang/unittests/HLSL/ShaderOpArithTable.xml
@@ -1,3420 +1,3420 @@
-<?xml version="1.0"?>
-<Data>
-    <Table Id="UnaryFloatOpTable">
-      <ParameterTypes>
-        <ParameterType Name="Warp.Version">unsigned int</ParameterType>
-        <ParameterType Name="Validation.Type">String</ParameterType>
-        <ParameterType Name="Validation.Tolerance">double</ParameterType>
-        <ParameterType Name="Validation.Input" Array="true">String</ParameterType>
-        <ParameterType Name="Validation.Expected" Array="true">String</ParameterType>
-        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-        <ParameterType Name="ShaderOp.Name">String</ParameterType>
-        <ParameterType Name="ShaderOp.Target">String</ParameterType>
-        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-        <ParameterType Name="ShaderOp.Text">String</ParameterType>
-      </ParameterTypes>
-
-      <Row Name="sin">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-314.16</Value>
-          <Value>314.16</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>-0</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>NaN</Value>
-          <Value>-0.0007346401</Value>
-          <Value>0.0007346401</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">sin</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            [RootSignature("RootFlags(0), UAV(u0)")]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = sin(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="cos">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-314.16</Value>
-          <Value>314.16</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>1.0</Value>
-          <Value>1.0</Value>
-          <Value>1.0</Value>
-          <Value>1.0</Value>
-          <Value>NaN</Value>
-          <Value>0.99999973015</Value>
-          <Value>0.99999973015</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">cos</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = cos(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="tan">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-314.16</Value>
-          <Value>314.16</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>-0.0</Value>
-          <Value>-0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>NaN</Value>
-          <Value>-0.000735</Value>
-          <Value>0.000735</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">tan</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = tan(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="Hcos">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1</Value>
-          <Value>-1</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>Inf</Value>
-          <Value>1.0</Value>
-          <Value>1.0</Value>
-          <Value>1.0</Value>
-          <Value>1.0</Value>
-          <Value>Inf</Value>
-          <Value>1.543081</Value>
-          <Value>1.543081</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">hcos</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = cosh(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="Hsin">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1</Value>
-          <Value>-1</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>Inf</Value>
-          <Value>1.175201</Value>
-          <Value>-1.175201</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">hsin</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = sinh(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="Htan">
-        <Parameter Name="Warp.Version">16202</Parameter>
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1</Value>
-          <Value>-1</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>-1</Value>
-          <Value>-0.0</Value>
-          <Value>-0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>1</Value>
-          <Value>0.761594</Value>
-          <Value>-0.761594</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">htan</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = tanh(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="acos">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">11</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1</Value>
-          <Value>-1</Value>
-          <Value>1.5</Value>
-          <Value>-1.5</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>1.570796</Value>
-          <Value>1.570796</Value>
-          <Value>1.570796</Value>
-          <Value>1.570796</Value>
-          <Value>NaN</Value>
-          <Value>0</Value>
-          <Value>3.1415926</Value>
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">acos</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = acos(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="asin">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">11</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1</Value>
-          <Value>-1</Value>
-          <Value>1.5</Value>
-          <Value>-1.5</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>NaN</Value>
-          <Value>1.570796</Value>
-          <Value>-1.570796</Value>
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">asin</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = asin(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="atan">
-        <Parameter Name="Warp.Version">16202</Parameter>
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">8</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1</Value>
-          <Value>-1</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>-1.570796</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>0.0</Value>
-          <Value>1.570796</Value>
-          <Value>0.785398163</Value>
-          <Value>-0.785398163</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">atan</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = atan(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="exp">
-        <Parameter Name="Validation.Type">Relative</Parameter>
-        <Parameter Name="Validation.Tolerance">21</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-1</Value>
-          <Value>10</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>0</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-          <Value>Inf</Value>
-          <Value>0.367879441</Value>
-          <Value>22026.46579</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">exp</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = exp(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="frc">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-        <Parameter Name="Validation.NumInput">11</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-1</Value>
-          <Value>2.718280</Value>
-          <Value>1000.599976</Value>
-          <Value>-7.389</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>NaN</Value>
-          <Value>0</Value>
-          <Value>0.718280</Value>
-          <Value>0.599976</Value>
-          <Value>0.611</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">frc</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = frac(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="log">
-        <Parameter Name="Validation.Type">Relative</Parameter>
-        <Parameter Name="Validation.Tolerance">21</Parameter>
-        <Parameter Name="Validation.NumInput">11</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-1</Value>
-          <Value>2.718281828</Value>
-          <Value>7.389056</Value>
-          <Value>100</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-Inf</Value>
-          <Value>-Inf</Value>
-          <Value>-Inf</Value>
-          <Value>Inf</Value>
-          <Value>NaN</Value>
-          <Value>1.0</Value>
-          <Value>1.99999998</Value>
-          <Value>4.6051701</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">log</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = log(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-      <Row Name="sqrt">
-        <Parameter Name="Validation.Type">ulp</Parameter>
-        <Parameter Name="Validation.Tolerance">1</Parameter>
-        <Parameter Name="Validation.NumInput">11</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-1</Value>
-          <Value>2</Value>
-          <Value>16.0</Value>
-          <Value>256.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>-0</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>Inf</Value>
-          <Value>NaN</Value>
-          <Value>1.41421356237</Value>
-          <Value>4.0</Value>
-          <Value>16.0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">sqrt</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = sqrt(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="rsqrt">
-        <Parameter Name="Validation.Type">ulp</Parameter>
-        <Parameter Name="Validation.Tolerance">1</Parameter>
-        <Parameter Name="Validation.NumInput">11</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-1</Value>
-          <Value>16.0</Value>
-          <Value>256.0</Value>
-          <Value>65536.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-Inf</Value>
-          <Value>Inf</Value>
-          <Value>Inf</Value>
-          <Value>0</Value>
-          <Value>NaN</Value>
-          <Value>0.25</Value>
-          <Value>0.0625</Value>
-          <Value>0.00390625</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">rsqrt</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = rsqrt(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-       <Row Name="rsqrt">
-        <Parameter Name="Validation.Type">ulp</Parameter>
-        <Parameter Name="Validation.Tolerance">1</Parameter>
-        <Parameter Name="Validation.NumInput">11</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>-1</Value>
-          <Value>16.0</Value>
-          <Value>256.0</Value>
-          <Value>65536.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-Inf</Value>
-          <Value>Inf</Value>
-          <Value>Inf</Value>
-          <Value>0</Value>
-          <Value>NaN</Value>
-          <Value>0.25</Value>
-          <Value>0.0625</Value>
-          <Value>0.00390625</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">rsqrt</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = rsqrt(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="round_ne">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">16</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>10.4</Value>
-          <Value>10.5</Value>
-          <Value>10.6</Value>
-          <Value>11.5</Value>
-          <Value>-10.0</Value>
-          <Value>-10.4</Value>
-          <Value>-10.5</Value>
-          <Value>-10.6</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-0</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>11.0</Value>
-          <Value>12.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-11.0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">round_ne</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = round(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="round_ni">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">15</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>10.4</Value>
-          <Value>10.5</Value>
-          <Value>10.6</Value>
-          <Value>-10.0</Value>
-          <Value>-10.4</Value>
-          <Value>-10.5</Value>
-          <Value>-10.6</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-0</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-11.0</Value>
-          <Value>-11.0</Value>
-          <Value>-11.0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">round_ni</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = floor(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-       </Row>
-
-       <Row Name="round_pi">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">15</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>10.4</Value>
-          <Value>10.5</Value>
-          <Value>10.6</Value>
-          <Value>-10.0</Value>
-          <Value>-10.4</Value>
-          <Value>-10.5</Value>
-          <Value>-10.6</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-0</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>11.0</Value>
-          <Value>11.0</Value>
-          <Value>11.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">round_pi</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = ceil(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="round_z">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">15</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>10.4</Value>
-          <Value>10.5</Value>
-          <Value>10.6</Value>
-          <Value>-10.0</Value>
-          <Value>-10.4</Value>
-          <Value>-10.5</Value>
-          <Value>-10.6</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-0</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>Inf</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-          <Value>-10.0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">round_z</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = trunc(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="IsNaN">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1.0</Value>
-          <Value>-1.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>1</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">IsNaN</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                if (isnan(l.input))
-                    l.output = 1;
-                else
-                    l.output = 0;
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-      <Row Name="IsInf">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1.0</Value>
-          <Value>-1.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>0</Value>
-          <Value>1</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>1</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">IsInf</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                if (isinf(l.input))
-                    l.output = 1;
-                else
-                    l.output = 0;
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-      <Row Name="IsFinite">
-        <Parameter Name="Warp.Version">16202</Parameter>
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">8</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1.0</Value>
-          <Value>-1.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-          <Value>0</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">IsFinite</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                if (isfinite(l.input))
-                    l.output = 1;
-                else
-                    l.output = 0;
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-
-      <Row Name="FAbs">
-        <Parameter Name="Validation.Type">Epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-          <Value>NaN</Value>
-          <Value>-Inf</Value>
-          <Value>-denorm</Value>
-          <Value>-0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1.0</Value>
-          <Value>-1.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-          <Value>NaN</Value>
-          <Value>Inf</Value>
-          <Value>denorm</Value>
-          <Value>0</Value>
-          <Value>0</Value>
-          <Value>denorm</Value>
-          <Value>Inf</Value>
-          <Value>1</Value>
-          <Value>1</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">FAbs</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text"><![CDATA[
-            struct SUnaryFPOp {
-                float input;
-                float output;
-            };
-            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryFPOp l = g_buf[GI];
-                l.output = abs(l.input);
-                g_buf[GI] = l;
-            }
-           ]]></Parameter>
-      </Row>
-    </Table>
-    <Table Id="BinaryFloatOpTable">
-      <ParameterTypes>
-        <ParameterType Name="Validation.Type">String</ParameterType>
-        <ParameterType Name="Validation.Tolerance">double</ParameterType>
-        <ParameterType Name="Validation.Input1" Array="true">String</ParameterType>
-        <ParameterType Name="Validation.Input2" Array="true">String</ParameterType>
-        <ParameterType Name="Validation.Expected1" Array="true">String</ParameterType>
-        <ParameterType Name="Validation.Expected2" Array="true">String</ParameterType>
-        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-        <ParameterType Name="ShaderOp.Name">String</ParameterType>
-        <ParameterType Name="ShaderOp.Target">String</ParameterType>
-        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-        <ParameterType Name="ShaderOp.Text">String</ParameterType>
-      </ParameterTypes>
-
-      <Row Name="MinMax">
-        <Parameter Name="Validation.Type">epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">17</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>-inf</Value>
-            <Value>-inf</Value>
-            <Value>-inf</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>inf</Value>
-            <Value>inf</Value>
-            <Value>inf</Value>
-            <Value>NaN</Value>
-            <Value>NaN</Value>
-            <Value>NaN</Value>
-            <Value>NaN</Value>
-            <Value>1.0</Value>
-            <Value>1.0</Value>
-            <Value>-1.0</Value>
-            <Value>-1.0</Value>
-            <Value>1.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>NaN</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>NaN</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>NaN</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>NaN</Value>
-            <Value>-1.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>-inf</Value>
-            <Value>-inf</Value>
-            <Value>-inf</Value>
-            <Value>-inf</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>inf</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>NaN</Value>
-            <Value>-inf</Value>
-            <Value>1.0</Value>
-            <Value>-1.0</Value>
-            <Value>-1.0</Value>
-            <Value>-1.0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected2">
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>inf</Value>
-            <Value>inf</Value>
-            <Value>inf</Value>
-            <Value>-inf</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>NaN</Value>
-            <Value>1.0</Value>
-            <Value>inf</Value>
-            <Value>1.0</Value>
-            <Value>-1.0</Value>
-            <Value>1.0</Value>
-        </Parameter>
-        <Parameter Name="ShaderOp.Name">MinMax</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-            <![CDATA[
-            struct SBinaryFPOp {
-                float input1;
-                float input2;
-                float output1;
-                float output2;
-            };
-            RWStructuredBuffer<SBinaryFPOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryFPOp l = g_buf[GI];
-                l.output1 = min(l.input1, l.input2);
-                l.output2 = max(l.input1, l.input2);
-                g_buf[GI] = l;
-            };
-            ]]>
-        </Parameter>
-      </Row>
-
-      </Table>
-
-      <Table Id="TertiaryFloatOpTable">
-        <ParameterTypes>
-            <ParameterType Name="Description">String</ParameterType>
-            <ParameterType Name="ShaderOp.Name">String</ParameterType>
-            <ParameterType Name="ShaderOp.Target">String</ParameterType>
-            <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.Type">String</ParameterType>
-            <ParameterType Name="Validation.Tolerance">int</ParameterType>
-            <ParameterType Name="Validation.Input1" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.Input2" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.Input3" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.Expected" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-        </ParameterTypes>
-
-        <Row Name="FMad">
-            <Parameter Name="ShaderOp.Name">FMad</Parameter>
-            <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-            <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-            <Parameter Name="ShaderOp.Text">
-            <![CDATA[
-                struct STertiaryFloatOp {
-                    float input1;
-                    float input2;
-                    float input3;
-                    float output;
-                };
-                RWStructuredBuffer<STertiaryFloatOp> g_buf : register(u0);
-                [numthreads(8,8,1)]
-                void main(uint GI : SV_GroupIndex) {
-                    STertiaryFloatOp l = g_buf[GI];
-                    l.output = mad(l.input1, l.input2, l.input3);
-                    g_buf[GI] = l;
-                };
-            ]]>
-            </Parameter>
-            <Parameter Name="Validation.Type">epsilon</Parameter>
-            <Parameter Name="Validation.Tolerance">0.0008</Parameter>
-            <Parameter Name="Validation.Input1">
-                <Value>NaN</Value>
-                <Value>-Inf</Value>
-                <Value>-denorm</Value>
-                <Value>-0</Value>
-                <Value>0</Value>
-                <Value>denorm</Value>
-                <Value>Inf</Value>
-                <Value>1.0</Value>
-                <Value>-1.0</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>1.5</Value>
-            </Parameter>
-            <Parameter Name="Validation.Input2">
-                <Value>NaN</Value>
-                <Value>-Inf</Value>
-                <Value>-denorm</Value>
-                <Value>-0</Value>
-                <Value>0</Value>
-                <Value>denorm</Value>
-                <Value>Inf</Value>
-                <Value>1.0</Value>
-                <Value>-1.0</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>10</Value>
-            </Parameter>
-            <Parameter Name="Validation.Input3">
-                <Value>NaN</Value>
-                <Value>-Inf</Value>
-                <Value>-denorm</Value>
-                <Value>-0</Value>
-                <Value>0</Value>
-                <Value>denorm</Value>
-                <Value>Inf</Value>
-                <Value>1.0</Value>
-                <Value>-1.0</Value>
-                <Value>1</Value>
-                <Value>0</Value>
-                <Value>-5.5</Value>
-            </Parameter>
-
-            <Parameter Name="Validation.Expected">
-                <Value>NaN</Value>
-                <Value>NaN</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>Inf</Value>
-                <Value>2</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>1</Value>
-                <Value>9.5</Value>
-            </Parameter>
-            <Parameter Name="Validation.NumInput">12</Parameter>
-      </Row>
-      </Table>
-
-    <Table Id="UnaryIntOpTable">
-      <ParameterTypes>
-        <ParameterType Name="ShaderOp.Name">String</ParameterType>
-        <ParameterType Name="ShaderOp.Target">String</ParameterType>
-        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-        <ParameterType Name="ShaderOp.Text">String</ParameterType>
-        <ParameterType Name="Validation.Input" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Expected" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Tolerance">int</ParameterType>
-        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-      </ParameterTypes>
-
-      <Row Name="Bfrev">
-        <Parameter Name="ShaderOp.Name">bfrev</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SUnaryIntOp {
-                int input;
-                int output;
-            };
-            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryIntOp l = g_buf[GI];
-                l.output = reversebits(l.input);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-            <Value>-2147483648</Value>
-            <Value>-65536</Value>
-            <Value>-8</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>8</Value>
-            <Value>65536</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-            <Value>1</Value>
-            <Value>65535</Value>
-            <Value>536870911</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>-2147483648</Value>
-            <Value>268435456</Value>
-            <Value>32768</Value>
-            <Value>-2</Value>
-        </Parameter>
-      </Row>
-
-      <Row Name="FirstbitSHi">
-        <Parameter Name="ShaderOp.Name">Firstbithi</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SUnaryIntOp {
-                int input;
-                int output;
-            };
-            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryIntOp l = g_buf[GI];
-                l.output = firstbithigh(l.input);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-            <Value>-2147483648</Value>
-            <Value>-65536</Value>
-            <Value>-8</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>8</Value>
-            <Value>65536</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-            <Value>30</Value>
-            <Value>15</Value>
-            <Value>2</Value>
-            <Value>-1</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>3</Value>
-            <Value>16</Value>
-            <Value>30</Value>
-        </Parameter>
-      </Row>
-
-      <Row Name="FirstBitLo">
-        <Parameter Name="ShaderOp.Name">Firstbitlo</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SUnaryIntOp {
-                int input;
-                int output;
-            };
-            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryIntOp l = g_buf[GI];
-                l.output = firstbitlow(l.input);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-            <Value>-2147483648</Value>
-            <Value>-65536</Value>
-            <Value>-8</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>8</Value>
-            <Value>65536</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-            <Value>31</Value>
-            <Value>16</Value>
-            <Value>3</Value>
-            <Value>0</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>3</Value>
-            <Value>16</Value>
-            <Value>0</Value>
-        </Parameter>
-      </Row>
-
-      <Row Name="Countbits">
-        <Parameter Name="ShaderOp.Name">Countbits</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SUnaryIntOp {
-                int input;
-                int output;
-            };
-            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryIntOp l = g_buf[GI];
-                l.output = countbits(l.input);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input">
-            <Value>-2147483648</Value>
-            <Value>-65536</Value>
-            <Value>-8</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>8</Value>
-            <Value>65536</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-            <Value>1</Value>
-            <Value>16</Value>
-            <Value>29</Value>
-            <Value>32</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>1</Value>
-            <Value>1</Value>
-            <Value>31</Value>
-        </Parameter>
-      </Row>
-    </Table>
-
-    <Table Id="UnaryUintOpTable">
-      <ParameterTypes>
-        <ParameterType Name="ShaderOp.Name">String</ParameterType>
-        <ParameterType Name="ShaderOp.Target">String</ParameterType>
-        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-        <ParameterType Name="ShaderOp.Text">String</ParameterType>
-        <ParameterType Name="Validation.Input" Array="true">unsigned int</ParameterType>
-        <ParameterType Name="Validation.Expected" Array="true">unsigned int</ParameterType>
-        <ParameterType Name="Validation.Tolerance">int</ParameterType>
-        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-      </ParameterTypes>
-
-      <Row Name="FirstbitHi">
-        <Parameter Name="ShaderOp.Name">Firstbithi</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SUnaryUintOp {
-                uint input;
-                uint output;
-            };
-            RWStructuredBuffer<SUnaryUintOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SUnaryUintOp l = g_buf[GI];
-                l.output = firstbithigh(l.input);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.NumInput">6</Parameter>
-        <Parameter Name="Validation.Input">
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>8</Value>
-            <Value>65536</Value>
-            <Value>2147483647</Value>
-            <Value>4294967295</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected">
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>3</Value>
-            <Value>16</Value>
-            <Value>30</Value>
-            <Value>31</Value>
-        </Parameter>
-      </Row>
-    </Table>
-
-    <Table Id="BinaryIntOpTable">
-      <ParameterTypes>
-        <ParameterType Name="ShaderOp.Name">String</ParameterType>
-        <ParameterType Name="ShaderOp.Target">String</ParameterType>
-        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-        <ParameterType Name="ShaderOp.Text">String</ParameterType>
-        <ParameterType Name="Validation.NumExpected">int</ParameterType>
-        <ParameterType Name="Validation.Input1" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Input2" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Expected1" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Expected2" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Tolerance">int</ParameterType>
-        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-      </ParameterTypes>
-
-      <Row Name="IMax">
-        <Parameter Name="ShaderOp.Name">IMax</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryIntOp {
-                int input1;
-                int input2;
-                int output1;
-                int output2;
-            };
-            RWStructuredBuffer<SBinaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryIntOp l = g_buf[GI];
-                l.output1 = max(l.input1, l.input2);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.NumExpected">1</Parameter>
-        <Parameter Name="Validation.NumInput">6</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>-2147483648</Value>
-            <Value>-10</Value>
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>-10</Value>
-            <Value>10</Value>
-            <Value>10</Value>
-            <Value>0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>10</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-      <Row Name="IMin">
-        <Parameter Name="ShaderOp.Name">IMin</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryIntOp {
-                int input1;
-                int input2;
-                int output1;
-                int output2;
-            };
-            RWStructuredBuffer<SBinaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryIntOp l = g_buf[GI];
-                l.output1 = min(l.input1, l.input2);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.NumExpected">1</Parameter>
-        <Parameter Name="Validation.NumInput">6</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>-2147483648</Value>
-            <Value>-10</Value>
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>-10</Value>
-            <Value>10</Value>
-            <Value>10</Value>
-            <Value>0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>-2147483648</Value>
-            <Value>-10</Value>
-            <Value>-10</Value>
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-      <Row Name="IMul">
-        <Parameter Name="ShaderOp.Name">Mul</Parameter>
-        <Parameter Name="ShaderOp.Description">integer multiplication. Note that this calls llvm "mul" operation and not IMul</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryIntOp {
-                int input1;
-                int input2;
-                int output1;
-                int output2;
-            };
-            RWStructuredBuffer<SBinaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryIntOp l = g_buf[GI];
-                l.output1 = l.input1 * l.input2;
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.NumExpected">1</Parameter>
-        <Parameter Name="Validation.NumInput">9</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>-2147483648</Value>
-            <Value>-10</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>10</Value>
-            <Value>10000</Value>
-            <Value>2147483647</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>-10</Value>
-            <Value>-10</Value>
-            <Value>10</Value>
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>4</Value>
-            <Value>10001</Value>
-            <Value>0</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>0</Value>
-            <Value>100</Value>
-            <Value>-10</Value>
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>40</Value>
-            <Value>100010000</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-    </Table>
-
-    <Table Id="BinaryUintOpTable">
-      <ParameterTypes>
-        <ParameterType Name="ShaderOp.Name">String</ParameterType>
-        <ParameterType Name="ShaderOp.Target">String</ParameterType>
-        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-        <ParameterType Name="ShaderOp.Text">String</ParameterType>
-        <ParameterType Name="Validation.NumExpected">int</ParameterType>
-        <ParameterType Name="Validation.Input1" Array="true">unsigned int</ParameterType>
-        <ParameterType Name="Validation.Input2" Array="true">unsigned int</ParameterType>
-        <ParameterType Name="Validation.Expected1" Array="true">unsigned int</ParameterType>
-        <ParameterType Name="Validation.Expected2" Array="true">unsigned int</ParameterType>
-        <ParameterType Name="Validation.Tolerance">int</ParameterType>
-        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-      </ParameterTypes>
-      <Row Name="UMax">
-        <Parameter Name="ShaderOp.Name">UMax</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryUintOp {
-                uint input1;
-                uint input2;
-                uint output1;
-                uint output2;
-            };
-            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryUintOp l = g_buf[GI];
-                l.output1 = max(l.input1, l.input2);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.NumExpected">1</Parameter>
-        <Parameter Name="Validation.NumInput">6</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>10000</Value>
-            <Value>2147483647</Value>
-            <Value>4294967295</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>4</Value>
-            <Value>10001</Value>
-            <Value>0</Value>
-            <Value>4294967295</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>10</Value>
-            <Value>10001</Value>
-            <Value>2147483647</Value>
-            <Value>4294967295</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-      <Row Name="UMin">
-        <Parameter Name="ShaderOp.Name">UMin</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryUintOp {
-                uint input1;
-                uint input2;
-                uint output1;
-                uint output2;
-            };
-            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryUintOp l = g_buf[GI];
-                l.output1 = min(l.input1, l.input2);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.NumExpected">1</Parameter>
-        <Parameter Name="Validation.NumInput">6</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>10</Value>
-            <Value>10000</Value>
-            <Value>2147483647</Value>
-            <Value>4294967295</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>4</Value>
-            <Value>10001</Value>
-            <Value>0</Value>
-            <Value>4294967295</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>4</Value>
-            <Value>10000</Value>
-            <Value>0</Value>
-            <Value>4294967295</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-
-      <Row Name="UMul">
-        <Parameter Name="ShaderOp.Name">Mul</Parameter>
-        <Parameter Name="ShaderOp.Description">integer multiplication. Note that this calls llvm "mul" operation and not IMul</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryUintOp {
-                uint input1;
-                uint input2;
-                uint output1;
-                uint output2;
-            };
-            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryUintOp l = g_buf[GI];
-                l.output1 = l.input1 * l.input2;
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.NumExpected">1</Parameter>
-        <Parameter Name="Validation.NumInput">5</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>10</Value>
-            <Value>10000</Value>
-            <Value>2147483647</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>4</Value>
-            <Value>10001</Value>
-            <Value>0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>40</Value>
-            <Value>100010000</Value>
-            <Value>0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-
-      <Row Name="UDiv">
-        <Parameter Name="ShaderOp.Name">UDiv</Parameter>
-        <Parameter Name="ShaderOp.Description">integer division. Note that this calls llvm "div" and "rem" operations and not UDiv</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryUintOp {
-                uint input1;
-                uint input2;
-                uint output1;
-                uint output2;
-            };
-            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryUintOp l = g_buf[GI];
-                l.output1 = l.input1 / l.input2;
-                l.output2 = l.input1 % l.input2;
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Operation.Unsigned">true</Parameter>
-        <Parameter Name="Validation.NumExpected">2</Parameter>
-        <Parameter Name="Validation.NumInput">7</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>1</Value>
-            <Value>1</Value>
-            <Value>10</Value>
-            <Value>10000</Value>
-            <Value>2147483647</Value>
-            <Value>2147483647</Value>
-            <Value>0xffffffff</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>4</Value>
-            <Value>10001</Value>
-            <Value>0</Value>
-            <Value>2147483647</Value>
-            <Value>1</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>0xffffffff</Value>
-            <Value>0</Value>
-            <Value>2</Value>
-            <Value>0</Value>
-            <Value>0xffffffff</Value>
-            <Value>1</Value>
-            <Value>0xffffffff</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected2">
-            <Value>0xffffffff</Value>
-            <Value>1</Value>
-            <Value>2</Value>
-            <Value>10000</Value>
-            <Value>0xffffffff</Value>
-            <Value>0</Value>
-            <Value>0</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-
-       <Row Name="UAddc">
-        <Parameter Name="ShaderOp.Name">UAddc</Parameter>
-        <Parameter Name="ShaderOp.Description">UAddc is called through AddUint64 intrinsic</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct SBinaryUintOp {
-                uint input1;
-                uint input2;
-                uint output1;
-                uint output2;
-            };
-            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                SBinaryUintOp l = g_buf[GI];
-                uint2 x = uint2(l.input1, l.input2);
-                uint2 y = AddUint64(x, x);
-                l.output1 = y.x;
-                l.output2 = y.y;
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Operation.Unsigned">true</Parameter>
-        <Parameter Name="Validation.NumExpected">2</Parameter>
-        <Parameter Name="Validation.NumInput">6</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>1</Value>
-            <Value>1</Value>
-            <Value>10000</Value>
-            <Value>0x80000000</Value>
-            <Value>0x7fffffff</Value>
-            <Value>0xffffffff</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>0</Value>
-            <Value>256</Value>
-            <Value>10001</Value>
-            <Value>1</Value>
-            <Value>0x7fffffff</Value>
-            <Value>0x7fffffff</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected1">
-            <Value>2</Value>
-            <Value>2</Value>
-            <Value>20000</Value>
-            <Value>0</Value>
-            <Value>0xfffffffe</Value>
-            <Value>0xfffffffe</Value>
-        </Parameter>
-        <Parameter Name="Validation.Expected2">
-            <Value>0</Value>
-            <Value>512</Value>
-            <Value>20002</Value>
-            <Value>3</Value>
-            <Value>0xfffffffe</Value>
-            <Value>0xffffffff</Value>
-        </Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-      </Row>
-    </Table>
-
-    <Table Id="TertiaryIntOpTable">
-      <ParameterTypes>
-        <ParameterType Name="Description">String</ParameterType>
-        <ParameterType Name="ShaderOp.Name">String</ParameterType>
-        <ParameterType Name="ShaderOp.Target">String</ParameterType>
-        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-        <ParameterType Name="ShaderOp.Text">String</ParameterType>
-        <ParameterType Name="Validation.Type">String</ParameterType>
-        <ParameterType Name="Validation.Tolerance">int</ParameterType>
-        <ParameterType Name="Validation.Input1" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Input2" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Input3" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.Expected" Array="true">int</ParameterType>
-        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-      </ParameterTypes>
-
-      <Row Name="IMad">
-        <Parameter Name="ShaderOp.Name">msad4</Parameter>
-        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-        <Parameter Name="ShaderOp.Text">
-        <![CDATA[
-            struct STertiaryIntOp {
-                int input1;
-                int input2;
-                int input3;
-                int output;
-            };
-            RWStructuredBuffer<STertiaryIntOp> g_buf : register(u0);
-            [numthreads(8,8,1)]
-            void main(uint GI : SV_GroupIndex) {
-                STertiaryIntOp l = g_buf[GI];
-                l.output = mad(l.input1, l.input2, l.input3);
-                g_buf[GI] = l;
-            };
-        ]]>
-        </Parameter>
-        <Parameter Name="Validation.Type">epsilon</Parameter>
-        <Parameter Name="Validation.Tolerance">0</Parameter>
-        <Parameter Name="Validation.Input1">
-            <Value>-2147483647</Value>
-            <Value>-256</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>2</Value>
-            <Value>16</Value>
-            <Value>2147483647</Value>
-            <Value>1</Value>
-            <Value>-1</Value>
-            <Value>1</Value>
-            <Value>10</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input2">
-            <Value>1</Value>
-            <Value>-256</Value>
-            <Value>-1</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>3</Value>
-            <Value>16</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>-1</Value>
-            <Value>10</Value>
-            <Value>100</Value>
-        </Parameter>
-        <Parameter Name="Validation.Input3">
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>0</Value>
-            <Value>1</Value>
-            <Value>3</Value>
-            <Value>1</Value>
-            <Value>255</Value>
-            <Value>2147483646</Value>
-            <Value>-2147483647</Value>
-            <Value>-10</Value>
-            <Value>-2000</Value>
-        </Parameter>
-
-        <Parameter Name="Validation.Expected">
-            <Value>-2147483647</Value>
-            <Value>65536</Value>
-            <Value>1</Value>
-            <Value>0</Value>
-            <Value>2</Value>
-            <Value>9</Value>
-            <Value>257</Value>
-            <Value>255</Value>
-            <Value>2147483647</Value>
-            <Value>-2147483646</Value>
-            <Value>0</Value>
-            <Value>-1000</Value>
-        </Parameter>
-        <Parameter Name="Validation.NumInput">12</Parameter>
-      </Row>
-      </Table>
-
-      <Table Id="TertiaryUintOpTable">
-        <ParameterTypes>
-            <ParameterType Name="Description">String</ParameterType>
-            <ParameterType Name="ShaderOp.Name">String</ParameterType>
-            <ParameterType Name="ShaderOp.Target">String</ParameterType>
-            <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.Type">String</ParameterType>
-            <ParameterType Name="Validation.Tolerance">int</ParameterType>
-            <ParameterType Name="Validation.Input1" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.Input2" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.Input3" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.Expected" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-        </ParameterTypes>
-
-        <Row Name="UMad">
-            <Parameter Name="ShaderOp.Name">UMad</Parameter>
-            <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-            <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-            <Parameter Name="ShaderOp.Text">
-            <![CDATA[
-                struct STertiaryUintOp {
-                    uint input1;
-                    uint input2;
-                    uint input3;
-                    uint output;
-                };
-                RWStructuredBuffer<STertiaryUintOp> g_buf : register(u0);
-                [numthreads(8,8,1)]
-                void main(uint GI : SV_GroupIndex) {
-                    STertiaryUintOp l = g_buf[GI];
-                    l.output = mad(l.input1, l.input2, l.input3);
-                    g_buf[GI] = l;
-                };
-            ]]>
-            </Parameter>
-            <Parameter Name="Validation.Type">epsilon</Parameter>
-            <Parameter Name="Validation.Tolerance">0</Parameter>
-            <Parameter Name="Validation.Input1">
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>16</Value>
-                <Value>2147483647</Value>
-                <Value>0</Value>
-                <Value>10</Value>
-            </Parameter>
-            <Parameter Name="Validation.Input2">
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>16</Value>
-                <Value>1</Value>
-                <Value>0</Value>
-                <Value>10</Value>
-            </Parameter>
-            <Parameter Name="Validation.Input3">
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>15</Value>
-                <Value>0</Value>
-                <Value>10</Value>
-                <Value>10</Value>
-            </Parameter>
-
-            <Parameter Name="Validation.Expected">
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>5</Value>
-                <Value>271</Value>
-                <Value>2147483647</Value>
-                <Value>10</Value>
-                <Value>110</Value>
-            </Parameter>
-            <Parameter Name="Validation.NumInput">7</Parameter>
-      </Row>
-      </Table>
-
-      <Table Id="DotOpTable">
-        <ParameterTypes>
-            <ParameterType Name="Description">String</ParameterType>
-            <ParameterType Name="ShaderOp.Name">String</ParameterType>
-            <ParameterType Name="ShaderOp.Target">String</ParameterType>
-            <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.Type">String</ParameterType>
-            <ParameterType Name="Validation.Tolerance">int</ParameterType>
-            <ParameterType Name="Validation.Input1" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.Input2" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.dot2" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.dot3" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.dot4" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-        </ParameterTypes>
-
-        <Row Name="Dot">
-            <Parameter Name="ShaderOp.Name">Dot</Parameter>
-            <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
-            <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
-            <Parameter Name="ShaderOp.Text">
-            <![CDATA[
-                struct SDotOp {
-                   float4 input1;
-                   float4 input2;
-                   float o_dot2;
-                   float o_dot3;
-                   float o_dot4;
-                };
-                RWStructuredBuffer<SDotOp> g_buf : register(u0);
-                [numthreads(8,8,1)]
-                void main(uint GI : SV_GroupIndex) {
-                    SDotOp l = g_buf[GI];
-                    l.o_dot2 = dot(l.input1.xy, l.input2.xy);
-                    l.o_dot3 = dot(l.input1.xyz, l.input2.xyz);
-                    l.o_dot4 = dot(l.input1.xyzw, l.input2.xyzw);
-                    g_buf[GI] = l;
-                };
-            ]]>
-            </Parameter>
-            <Parameter Name="Validation.Type">epsilon</Parameter>
-            <Parameter Name="Validation.Tolerance">0.008</Parameter>
-            <Parameter Name="Validation.Input1">
-                <Value>NaN,NaN,NaN,NaN</Value>
-                <Value>-Inf,-Inf,-Inf,-Inf</Value>
-                <Value>-denorm,-denorm,-denorm,-denorm</Value>
-                <Value>-0,-0,-0,-0</Value>
-                <Value>0,0,0,0</Value>
-                <Value>denorm,denorm,denorm,denorm</Value>
-                <Value>Inf,Inf,Inf,Inf</Value>
-                <Value>1,1,1,1</Value>
-                <Value>-10,0,0,10</Value>
-                <Value>Inf,Inf,Inf,-Inf</Value>
-            </Parameter>
-            <Parameter Name="Validation.Input2">
-                <Value>NaN,NaN,NaN,NaN</Value>
-                <Value>-Inf,-Inf,-Inf,-Inf</Value>
-                <Value>-denorm,-denorm,-denorm,-denorm</Value>
-                <Value>-0,-0,-0,-0</Value>
-                <Value>0,0,0,0</Value>
-                <Value>denorm,denorm,denorm,denorm</Value>
-                <Value>Inf,Inf,Inf,Inf</Value>
-                <Value>1,1,1,1</Value>
-                <Value>10,0,0,10</Value>
-                <Value>Inf,Inf,Inf,Inf</Value>
-            </Parameter>
-            <Parameter Name="Validation.dot2">
-                <Value>NaN</Value>
-                <Value>Inf</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>Inf</Value>
-                <Value>2</Value>
-                <Value>-100</Value>
-                <Value>Inf</Value>
-            </Parameter>
-            <Parameter Name="Validation.dot3">
-                <Value>NaN</Value>
-                <Value>Inf</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>Inf</Value>
-                <Value>3</Value>
-                <Value>-100</Value>
-                <Value>Inf</Value>
-            </Parameter>
-            <Parameter Name="Validation.dot4">
-                <Value>NaN</Value>
-                <Value>Inf</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>0</Value>
-                <Value>Inf</Value>
-                <Value>4</Value>
-                <Value>0</Value>
-                <Value>NaN</Value>
-            </Parameter>
-            <Parameter Name="Validation.NumInput">10</Parameter>
-        </Row>
-      </Table>
-
-      <Table Id="MSad4Table">
-        <ParameterTypes>
-            <ParameterType Name="Description">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.Tolerance">int</ParameterType>
-            <ParameterType Name="Validation.Reference" Array="true">unsigned int</ParameterType>
-            <ParameterType Name="Validation.Source" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.Accum" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.Expected" Array="true">String</ParameterType>
-            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
-        </ParameterTypes>
-        <Row Name="MSad4">
-            <Parameter Name="Description">Msad4 intrinsic calls both Bfi and Msad</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct SMsad4 {
-                        uint ref;
-                        uint2 source;
-                        uint4 accum;
-                        uint4 result;
-                    };
-                    RWStructuredBuffer<SMsad4> g_buf : register(u0);
-                    [numthreads(8,8,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        SMsad4 l = g_buf[GI];
-                        l.result = msad4(l.ref, l.source, l.accum);
-                        g_buf[GI] = l;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.Tolerance">0</Parameter>
-            <Parameter Name="Validation.NumInput">4</Parameter>
-            <Parameter Name="Validation.Reference">
-                <Value>0xA100B2C3</Value>
-                <Value>0x00000000</Value>
-                <Value>0xFFFF01C1</Value>
-                <Value>0xFFFFFFFF</Value>
-            </Parameter>
-            <Parameter Name="Validation.Source">
-                <Value>0xD7B0C372, 0x4F57C2A3</Value>
-                <Value>0xFFFFFFFF, 0x00000000</Value>
-                <Value>0x38A03AEF, 0x38194DA3</Value>
-                <Value>0xFFFFFFFF, 0x00000000</Value>
-            </Parameter>
-            <Parameter Name="Validation.Accum">
-                <Value>1,2,3,4</Value>
-                <Value>1,2,3,4</Value>
-                <Value>0,0,0,0</Value>
-                <Value>10,10,10,10</Value>
-            </Parameter>
-            <Parameter Name="Validation.Expected">
-                <Value>153,6,92,113</Value>
-                <Value>1,2,3,4</Value>
-                <Value>397,585,358,707</Value>
-                <Value>10,265,520,775</Value>
-            </Parameter>
-        </Row>
-       </Table>
-
-       <Table Id="WaveIntrinsicsActiveIntTable">
-        <ParameterTypes>
-            <ParameterType Name="ShaderOp.Name">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet1" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.InputSet2" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.InputSet3" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.InputSet4" Array="true">int</ParameterType>
-        </ParameterTypes>
-
-        <Row Name="WaveActiveSum">
-            <Parameter Name="ShaderOp.Name">WaveActiveSum</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveSum(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveSum(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>8</Value>
-                <Value>-64</Value>
-            </Parameter>
-        </Row>
-
-         <Row Name="WaveActiveProduct">
-            <Parameter Name="ShaderOp.Name">WaveActiveProduct</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveProduct(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveProduct(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>-64</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveCountBits">
-            <Parameter Name="ShaderOp.Name">WaveActiveCountBits</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveCountBits(pts.input > 3);
-                        }
-                        else {
-                            pts.output = WaveActiveCountBits(pts.input > 3);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">4</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>10</Value>
-                <Value>-4</Value>
-                <Value>-64</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>-100</Value>
-                <Value>-1000</Value>
-                <Value>300</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveMax">
-            <Parameter Name="ShaderOp.Name">WaveActiveMax</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveMax(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveMax(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">4</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>10</Value>
-                <Value>-4</Value>
-                <Value>-64</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>-100</Value>
-                <Value>-1000</Value>
-                <Value>300</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveMin">
-            <Parameter Name="ShaderOp.Name">WaveActiveMin</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveMin(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveMin(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">4</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-                <Value>6</Value>
-                <Value>7</Value>
-                <Value>8</Value>
-                <Value>9</Value>
-                <Value>10</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>10</Value>
-                <Value>-4</Value>
-                <Value>-64</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>-100</Value>
-                <Value>-1000</Value>
-                <Value>300</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveAllEqual">
-            <Parameter Name="ShaderOp.Name">WaveActiveAllEqual</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveAllEqual(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveAllEqual(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>1</Value>
-                <Value>1</Value>
-                <Value>1</Value>
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>3</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>-10</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveAnyTrue">
-            <Parameter Name="ShaderOp.Name">WaveActiveAnyTrue</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        bool input;
-                        bool output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveAnyTrue(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveAnyTrue(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>0</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveAllTrue">
-            <Parameter Name="ShaderOp.Name">WaveActiveAllTrue</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        bool input;
-                        bool output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveAllTrue(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveAllTrue(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-                <Value>0</Value>
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-            </Parameter>
-        </Row>
-      </Table>
-
-      <Table Id="WaveIntrinsicsActiveUintTable">
-        <ParameterTypes>
-            <ParameterType Name="ShaderOp.Name">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet1" Array="true">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet2" Array="true">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet3" Array="true">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet4" Array="true">unsigned int</ParameterType>
-        </ParameterTypes>
-
-       <Row Name="WaveActiveUSum">
-            <Parameter Name="ShaderOp.Name">WaveActiveUSum</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveSum(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveSum(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>8</Value>
-                <Value>64</Value>
-            </Parameter>
-        </Row>
-
-         <Row Name="WaveActiveUProduct">
-            <Parameter Name="ShaderOp.Name">WaveActiveUProduct</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveProduct(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveProduct(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>64</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveUMax">
-            <Parameter Name="ShaderOp.Name">WaveActiveUMax</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveMax(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveMax(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>10</Value>
-                <Value>4</Value>
-                <Value>64</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveUMin">
-            <Parameter Name="ShaderOp.Name">WaveActiveUMin</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveMin(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveMin(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-                <Value>6</Value>
-                <Value>7</Value>
-                <Value>8</Value>
-                <Value>9</Value>
-                <Value>10</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>10</Value>
-                <Value>4</Value>
-                <Value>64</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveBitOr">
-            <Parameter Name="ShaderOp.Name">WaveActiveBitOr</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveBitOr(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveBitOr(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">4</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>0xe0000000</Value>
-                <Value>0x0d000000</Value>
-                <Value>0x00b00000</Value>
-                <Value>0x00070000</Value>
-                <Value>0x0000e000</Value>
-                <Value>0x00000d00</Value>
-                <Value>0x000000b0</Value>
-                <Value>0x00000007</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0xedb7edb7</Value>
-                <Value>0xdb7edb7e</Value>
-                <Value>0xb7edb7ed</Value>
-                <Value>0x7edb7edb</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>0x12481248</Value>
-                <Value>0x24812481</Value>
-                <Value>0x48124812</Value>
-                <Value>0x81248124</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>0x00000000</Value>
-                <Value>0xffffffff</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveBitAnd">
-            <Parameter Name="ShaderOp.Name">WaveActiveBitAnd</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveBitAnd(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveBitAnd(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">4</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>0xefffffff</Value>
-                <Value>0xfdffffff</Value>
-                <Value>0xffbfffff</Value>
-                <Value>0xfff7ffff</Value>
-                <Value>0xffffefff</Value>
-                <Value>0xfffffdff</Value>
-                <Value>0xffffffbf</Value>
-                <Value>0xfffffff7</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0xedb7edb7</Value>
-                <Value>0xdb7edb7e</Value>
-                <Value>0xb7edb7ed</Value>
-                <Value>0x7edb7edb</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>0x12481248</Value>
-                <Value>0x24812481</Value>
-                <Value>0x48124812</Value>
-                <Value>0x81248124</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>0x00000000</Value>
-                <Value>0xffffffff</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WaveActiveBitXor">
-            <Parameter Name="ShaderOp.Name">WaveActiveBitXor</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WaveActiveBitXor(pts.input);
-                        }
-                        else {
-                            pts.output = WaveActiveBitXor(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>0xe0000000</Value>
-                <Value>0x0d000000</Value>
-                <Value>0x00b00000</Value>
-                <Value>0x00070000</Value>
-                <Value>0x0000e000</Value>
-                <Value>0x00000d00</Value>
-                <Value>0x000000b0</Value>
-                <Value>0x00000007</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0xedb7edb7</Value>
-                <Value>0xdb7edb7e</Value>
-                <Value>0xb7edb7ed</Value>
-                <Value>0x7edb7edb</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>0x12481248</Value>
-                <Value>0x24812481</Value>
-                <Value>0x48124812</Value>
-                <Value>0x81248124</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>0x00000000</Value>
-                <Value>0xffffffff</Value>
-            </Parameter>
-        </Row>
-    </Table>
-
-    <Table Id="WaveIntrinsicsPrefixIntTable">
-        <ParameterTypes>
-            <ParameterType Name="ShaderOp.Name">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet1" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.InputSet2" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.InputSet3" Array="true">int</ParameterType>
-            <ParameterType Name="Validation.InputSet4" Array="true">int</ParameterType>
-        </ParameterTypes>
-        <Row Name="WavePrefixCountBits">
-            <Parameter Name="ShaderOp.Name">WavePrefixCountBits</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WavePrefixCountBits(pts.input > 3);
-                        }
-                        else {
-                            pts.output = WavePrefixCountBits(pts.input > 3);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">4</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>10</Value>
-                <Value>-4</Value>
-                <Value>-64</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>-100</Value>
-                <Value>-1000</Value>
-                <Value>300</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WavePrefixSum">
-            <Parameter Name="ShaderOp.Name">WavePrefixSum</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WavePrefixSum(pts.input);
-                        }
-                        else {
-                            pts.output = WavePrefixSum(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>-64</Value>
-                <Value>128</Value>
-            </Parameter>
-        </Row>
-
-         <Row Name="WavePrefixProduct">
-            <Parameter Name="ShaderOp.Name">WavePrefixProduct</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        int input;
-                        int output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WavePrefixProduct(pts.input);
-                        }
-                        else {
-                            pts.output = WavePrefixProduct(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>-64</Value>
-                <Value>128</Value>
-            </Parameter>
-        </Row>
-    </Table>
-
-    <Table Id="WaveIntrinsicsPrefixUintTable">
-        <ParameterTypes>
-            <ParameterType Name="ShaderOp.Name">String</ParameterType>
-            <ParameterType Name="ShaderOp.Text">String</ParameterType>
-            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet1" Array="true">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet2" Array="true">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet3" Array="true">unsigned int</ParameterType>
-            <ParameterType Name="Validation.InputSet4" Array="true">unsigned int</ParameterType>
-        </ParameterTypes>
-        <Row Name="WavePrefixCountBits">
-            <Parameter Name="ShaderOp.Name">WavePrefixCountBits</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WavePrefixCountBits(pts.input > 3);
-                        }
-                        else {
-                            pts.output = WavePrefixCountBits(pts.input > 3);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>10</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet4">
-                <Value>100</Value>
-                <Value>300</Value>
-            </Parameter>
-        </Row>
-
-        <Row Name="WavePrefixSum">
-            <Parameter Name="ShaderOp.Name">WavePrefixSum</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WavePrefixSum(pts.input);
-                        }
-                        else {
-                            pts.output = WavePrefixSum(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>128</Value>
-            </Parameter>
-        </Row>
-
-         <Row Name="WavePrefixProduct">
-            <Parameter Name="ShaderOp.Name">WavePrefixProduct</Parameter>
-            <Parameter Name="ShaderOp.Text">
-                <![CDATA[
-                    struct PerThreadData {
-                        int firstLaneId;
-                        int mask;
-                        uint input;
-                        uint output;
-                    };
-                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
-                    [numthreads(8,12,1)]
-                    void main(uint GI : SV_GroupIndex) {
-                        PerThreadData pts = g_sb[GI];
-                        pts.firstLaneId = WaveReadLaneFirst(GI);
-                        if (pts.mask != 0) {
-                            pts.output = WavePrefixProduct(pts.input);
-                        }
-                        else {
-                            pts.output = WavePrefixProduct(pts.input);
-                        }
-                        g_sb[GI] = pts;
-                    }
-                ]]>
-            </Parameter>
-            <Parameter Name="Validation.NumInputSet">3</Parameter>
-            <Parameter Name="Validation.InputSet1">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>3</Value>
-                <Value>4</Value>
-                <Value>5</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet2">
-                <Value>0</Value>
-                <Value>1</Value>
-            </Parameter>
-            <Parameter Name="Validation.InputSet3">
-                <Value>1</Value>
-                <Value>2</Value>
-                <Value>4</Value>
-                <Value>128</Value>
-            </Parameter>
-        </Row>
-    </Table>
-</Data>
+<?xml version="1.0"?>
+<Data>
+    <Table Id="UnaryFloatOpTable">
+      <ParameterTypes>
+        <ParameterType Name="Warp.Version">unsigned int</ParameterType>
+        <ParameterType Name="Validation.Type">String</ParameterType>
+        <ParameterType Name="Validation.Tolerance">double</ParameterType>
+        <ParameterType Name="Validation.Input" Array="true">String</ParameterType>
+        <ParameterType Name="Validation.Expected" Array="true">String</ParameterType>
+        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+        <ParameterType Name="ShaderOp.Name">String</ParameterType>
+        <ParameterType Name="ShaderOp.Target">String</ParameterType>
+        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+        <ParameterType Name="ShaderOp.Text">String</ParameterType>
+      </ParameterTypes>
+
+      <Row Name="sin">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-314.16</Value>
+          <Value>314.16</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>-0</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>NaN</Value>
+          <Value>-0.0007346401</Value>
+          <Value>0.0007346401</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">sin</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            [RootSignature("RootFlags(0), UAV(u0)")]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = sin(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="cos">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-314.16</Value>
+          <Value>314.16</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>1.0</Value>
+          <Value>1.0</Value>
+          <Value>1.0</Value>
+          <Value>1.0</Value>
+          <Value>NaN</Value>
+          <Value>0.99999973015</Value>
+          <Value>0.99999973015</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">cos</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = cos(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="tan">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-314.16</Value>
+          <Value>314.16</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>-0.0</Value>
+          <Value>-0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>NaN</Value>
+          <Value>-0.000735</Value>
+          <Value>0.000735</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">tan</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = tan(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="Hcos">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1</Value>
+          <Value>-1</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>Inf</Value>
+          <Value>1.0</Value>
+          <Value>1.0</Value>
+          <Value>1.0</Value>
+          <Value>1.0</Value>
+          <Value>Inf</Value>
+          <Value>1.543081</Value>
+          <Value>1.543081</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">hcos</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = cosh(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="Hsin">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1</Value>
+          <Value>-1</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>Inf</Value>
+          <Value>1.175201</Value>
+          <Value>-1.175201</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">hsin</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = sinh(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="Htan">
+        <Parameter Name="Warp.Version">16202</Parameter>
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1</Value>
+          <Value>-1</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>-1</Value>
+          <Value>-0.0</Value>
+          <Value>-0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>1</Value>
+          <Value>0.761594</Value>
+          <Value>-0.761594</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">htan</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = tanh(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="acos">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">11</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1</Value>
+          <Value>-1</Value>
+          <Value>1.5</Value>
+          <Value>-1.5</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>1.570796</Value>
+          <Value>1.570796</Value>
+          <Value>1.570796</Value>
+          <Value>1.570796</Value>
+          <Value>NaN</Value>
+          <Value>0</Value>
+          <Value>3.1415926</Value>
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">acos</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = acos(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="asin">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">11</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1</Value>
+          <Value>-1</Value>
+          <Value>1.5</Value>
+          <Value>-1.5</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>NaN</Value>
+          <Value>1.570796</Value>
+          <Value>-1.570796</Value>
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">asin</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = asin(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="atan">
+        <Parameter Name="Warp.Version">16202</Parameter>
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">8</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1</Value>
+          <Value>-1</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>-1.570796</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>0.0</Value>
+          <Value>1.570796</Value>
+          <Value>0.785398163</Value>
+          <Value>-0.785398163</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">atan</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = atan(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="exp">
+        <Parameter Name="Validation.Type">Relative</Parameter>
+        <Parameter Name="Validation.Tolerance">21</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-1</Value>
+          <Value>10</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>0</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+          <Value>Inf</Value>
+          <Value>0.367879441</Value>
+          <Value>22026.46579</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">exp</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = exp(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="frc">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+        <Parameter Name="Validation.NumInput">11</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-1</Value>
+          <Value>2.718280</Value>
+          <Value>1000.599976</Value>
+          <Value>-7.389</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>NaN</Value>
+          <Value>0</Value>
+          <Value>0.718280</Value>
+          <Value>0.599976</Value>
+          <Value>0.611</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">frc</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = frac(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="log">
+        <Parameter Name="Validation.Type">Relative</Parameter>
+        <Parameter Name="Validation.Tolerance">21</Parameter>
+        <Parameter Name="Validation.NumInput">11</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-1</Value>
+          <Value>2.718281828</Value>
+          <Value>7.389056</Value>
+          <Value>100</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-Inf</Value>
+          <Value>-Inf</Value>
+          <Value>-Inf</Value>
+          <Value>Inf</Value>
+          <Value>NaN</Value>
+          <Value>1.0</Value>
+          <Value>1.99999998</Value>
+          <Value>4.6051701</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">log</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = log(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+      <Row Name="sqrt">
+        <Parameter Name="Validation.Type">ulp</Parameter>
+        <Parameter Name="Validation.Tolerance">1</Parameter>
+        <Parameter Name="Validation.NumInput">11</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-1</Value>
+          <Value>2</Value>
+          <Value>16.0</Value>
+          <Value>256.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>-0</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>Inf</Value>
+          <Value>NaN</Value>
+          <Value>1.41421356237</Value>
+          <Value>4.0</Value>
+          <Value>16.0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">sqrt</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = sqrt(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="rsqrt">
+        <Parameter Name="Validation.Type">ulp</Parameter>
+        <Parameter Name="Validation.Tolerance">1</Parameter>
+        <Parameter Name="Validation.NumInput">11</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-1</Value>
+          <Value>16.0</Value>
+          <Value>256.0</Value>
+          <Value>65536.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-Inf</Value>
+          <Value>Inf</Value>
+          <Value>Inf</Value>
+          <Value>0</Value>
+          <Value>NaN</Value>
+          <Value>0.25</Value>
+          <Value>0.0625</Value>
+          <Value>0.00390625</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">rsqrt</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = rsqrt(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+       <Row Name="rsqrt">
+        <Parameter Name="Validation.Type">ulp</Parameter>
+        <Parameter Name="Validation.Tolerance">1</Parameter>
+        <Parameter Name="Validation.NumInput">11</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>-1</Value>
+          <Value>16.0</Value>
+          <Value>256.0</Value>
+          <Value>65536.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-Inf</Value>
+          <Value>Inf</Value>
+          <Value>Inf</Value>
+          <Value>0</Value>
+          <Value>NaN</Value>
+          <Value>0.25</Value>
+          <Value>0.0625</Value>
+          <Value>0.00390625</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">rsqrt</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = rsqrt(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="round_ne">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">16</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>10.4</Value>
+          <Value>10.5</Value>
+          <Value>10.6</Value>
+          <Value>11.5</Value>
+          <Value>-10.0</Value>
+          <Value>-10.4</Value>
+          <Value>-10.5</Value>
+          <Value>-10.6</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-0</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>11.0</Value>
+          <Value>12.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-11.0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">round_ne</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = round(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="round_ni">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">15</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>10.4</Value>
+          <Value>10.5</Value>
+          <Value>10.6</Value>
+          <Value>-10.0</Value>
+          <Value>-10.4</Value>
+          <Value>-10.5</Value>
+          <Value>-10.6</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-0</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-11.0</Value>
+          <Value>-11.0</Value>
+          <Value>-11.0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">round_ni</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = floor(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="round_pi">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">15</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>10.4</Value>
+          <Value>10.5</Value>
+          <Value>10.6</Value>
+          <Value>-10.0</Value>
+          <Value>-10.4</Value>
+          <Value>-10.5</Value>
+          <Value>-10.6</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-0</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>11.0</Value>
+          <Value>11.0</Value>
+          <Value>11.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">round_pi</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = ceil(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="round_z">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">15</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>10.4</Value>
+          <Value>10.5</Value>
+          <Value>10.6</Value>
+          <Value>-10.0</Value>
+          <Value>-10.4</Value>
+          <Value>-10.5</Value>
+          <Value>-10.6</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-0</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>Inf</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+          <Value>-10.0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">round_z</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = trunc(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="IsNaN">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1.0</Value>
+          <Value>-1.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>1</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">IsNaN</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                if (isnan(l.input))
+                    l.output = 1;
+                else
+                    l.output = 0;
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+      <Row Name="IsInf">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1.0</Value>
+          <Value>-1.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>0</Value>
+          <Value>1</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>1</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">IsInf</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                if (isinf(l.input))
+                    l.output = 1;
+                else
+                    l.output = 0;
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+      <Row Name="IsFinite">
+        <Parameter Name="Warp.Version">16202</Parameter>
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1.0</Value>
+          <Value>-1.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+          <Value>0</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">IsFinite</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                if (isfinite(l.input))
+                    l.output = 1;
+                else
+                    l.output = 0;
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+
+      <Row Name="FAbs">
+        <Parameter Name="Validation.Type">Epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+          <Value>NaN</Value>
+          <Value>-Inf</Value>
+          <Value>-denorm</Value>
+          <Value>-0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1.0</Value>
+          <Value>-1.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+          <Value>NaN</Value>
+          <Value>Inf</Value>
+          <Value>denorm</Value>
+          <Value>0</Value>
+          <Value>0</Value>
+          <Value>denorm</Value>
+          <Value>Inf</Value>
+          <Value>1</Value>
+          <Value>1</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">FAbs</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text"><![CDATA[
+            struct SUnaryFPOp {
+                float input;
+                float output;
+            };
+            RWStructuredBuffer<SUnaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryFPOp l = g_buf[GI];
+                l.output = abs(l.input);
+                g_buf[GI] = l;
+            }
+           ]]></Parameter>
+      </Row>
+    </Table>
+    <Table Id="BinaryFloatOpTable">
+      <ParameterTypes>
+        <ParameterType Name="Validation.Type">String</ParameterType>
+        <ParameterType Name="Validation.Tolerance">double</ParameterType>
+        <ParameterType Name="Validation.Input1" Array="true">String</ParameterType>
+        <ParameterType Name="Validation.Input2" Array="true">String</ParameterType>
+        <ParameterType Name="Validation.Expected1" Array="true">String</ParameterType>
+        <ParameterType Name="Validation.Expected2" Array="true">String</ParameterType>
+        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+        <ParameterType Name="ShaderOp.Name">String</ParameterType>
+        <ParameterType Name="ShaderOp.Target">String</ParameterType>
+        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+        <ParameterType Name="ShaderOp.Text">String</ParameterType>
+      </ParameterTypes>
+
+      <Row Name="MinMax">
+        <Parameter Name="Validation.Type">epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">17</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>-inf</Value>
+            <Value>-inf</Value>
+            <Value>-inf</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>inf</Value>
+            <Value>inf</Value>
+            <Value>inf</Value>
+            <Value>NaN</Value>
+            <Value>NaN</Value>
+            <Value>NaN</Value>
+            <Value>NaN</Value>
+            <Value>1.0</Value>
+            <Value>1.0</Value>
+            <Value>-1.0</Value>
+            <Value>-1.0</Value>
+            <Value>1.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>NaN</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>NaN</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>NaN</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>NaN</Value>
+            <Value>-1.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>-inf</Value>
+            <Value>-inf</Value>
+            <Value>-inf</Value>
+            <Value>-inf</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>inf</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>NaN</Value>
+            <Value>-inf</Value>
+            <Value>1.0</Value>
+            <Value>-1.0</Value>
+            <Value>-1.0</Value>
+            <Value>-1.0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected2">
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>inf</Value>
+            <Value>inf</Value>
+            <Value>inf</Value>
+            <Value>-inf</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>NaN</Value>
+            <Value>1.0</Value>
+            <Value>inf</Value>
+            <Value>1.0</Value>
+            <Value>-1.0</Value>
+            <Value>1.0</Value>
+        </Parameter>
+        <Parameter Name="ShaderOp.Name">MinMax</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+            <![CDATA[
+            struct SBinaryFPOp {
+                float input1;
+                float input2;
+                float output1;
+                float output2;
+            };
+            RWStructuredBuffer<SBinaryFPOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryFPOp l = g_buf[GI];
+                l.output1 = min(l.input1, l.input2);
+                l.output2 = max(l.input1, l.input2);
+                g_buf[GI] = l;
+            };
+            ]]>
+        </Parameter>
+      </Row>
+
+      </Table>
+
+      <Table Id="TertiaryFloatOpTable">
+        <ParameterTypes>
+            <ParameterType Name="Description">String</ParameterType>
+            <ParameterType Name="ShaderOp.Name">String</ParameterType>
+            <ParameterType Name="ShaderOp.Target">String</ParameterType>
+            <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.Type">String</ParameterType>
+            <ParameterType Name="Validation.Tolerance">double</ParameterType>
+            <ParameterType Name="Validation.Input1" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.Input2" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.Input3" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.Expected" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+        </ParameterTypes>
+
+        <Row Name="FMad">
+            <Parameter Name="ShaderOp.Name">FMad</Parameter>
+            <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+            <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+            <Parameter Name="ShaderOp.Text">
+            <![CDATA[
+                struct STertiaryFloatOp {
+                    float input1;
+                    float input2;
+                    float input3;
+                    float output;
+                };
+                RWStructuredBuffer<STertiaryFloatOp> g_buf : register(u0);
+                [numthreads(8,8,1)]
+                void main(uint GI : SV_GroupIndex) {
+                    STertiaryFloatOp l = g_buf[GI];
+                    l.output = mad(l.input1, l.input2, l.input3);
+                    g_buf[GI] = l;
+                };
+            ]]>
+            </Parameter>
+            <Parameter Name="Validation.Type">epsilon</Parameter>
+            <Parameter Name="Validation.Tolerance">0.0008</Parameter>
+            <Parameter Name="Validation.Input1">
+                <Value>NaN</Value>
+                <Value>-Inf</Value>
+                <Value>-denorm</Value>
+                <Value>-0</Value>
+                <Value>0</Value>
+                <Value>denorm</Value>
+                <Value>Inf</Value>
+                <Value>1.0</Value>
+                <Value>-1.0</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>1.5</Value>
+            </Parameter>
+            <Parameter Name="Validation.Input2">
+                <Value>NaN</Value>
+                <Value>-Inf</Value>
+                <Value>-denorm</Value>
+                <Value>-0</Value>
+                <Value>0</Value>
+                <Value>denorm</Value>
+                <Value>Inf</Value>
+                <Value>1.0</Value>
+                <Value>-1.0</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>10</Value>
+            </Parameter>
+            <Parameter Name="Validation.Input3">
+                <Value>NaN</Value>
+                <Value>-Inf</Value>
+                <Value>-denorm</Value>
+                <Value>-0</Value>
+                <Value>0</Value>
+                <Value>denorm</Value>
+                <Value>Inf</Value>
+                <Value>1.0</Value>
+                <Value>-1.0</Value>
+                <Value>1</Value>
+                <Value>0</Value>
+                <Value>-5.5</Value>
+            </Parameter>
+
+            <Parameter Name="Validation.Expected">
+                <Value>NaN</Value>
+                <Value>NaN</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>Inf</Value>
+                <Value>2</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>1</Value>
+                <Value>9.5</Value>
+            </Parameter>
+            <Parameter Name="Validation.NumInput">12</Parameter>
+        </Row>
+      </Table>
+
+    <Table Id="UnaryIntOpTable">
+      <ParameterTypes>
+        <ParameterType Name="ShaderOp.Name">String</ParameterType>
+        <ParameterType Name="ShaderOp.Target">String</ParameterType>
+        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+        <ParameterType Name="ShaderOp.Text">String</ParameterType>
+        <ParameterType Name="Validation.Input" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Expected" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Tolerance">int</ParameterType>
+        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+      </ParameterTypes>
+
+      <Row Name="Bfrev">
+        <Parameter Name="ShaderOp.Name">bfrev</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SUnaryIntOp {
+                int input;
+                int output;
+            };
+            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryIntOp l = g_buf[GI];
+                l.output = reversebits(l.input);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+            <Value>-2147483648</Value>
+            <Value>-65536</Value>
+            <Value>-8</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>8</Value>
+            <Value>65536</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+            <Value>1</Value>
+            <Value>65535</Value>
+            <Value>536870911</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>-2147483648</Value>
+            <Value>268435456</Value>
+            <Value>32768</Value>
+            <Value>-2</Value>
+        </Parameter>
+      </Row>
+
+      <Row Name="FirstbitSHi">
+        <Parameter Name="ShaderOp.Name">Firstbithi</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SUnaryIntOp {
+                int input;
+                int output;
+            };
+            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryIntOp l = g_buf[GI];
+                l.output = firstbithigh(l.input);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+            <Value>-2147483648</Value>
+            <Value>-65536</Value>
+            <Value>-8</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>8</Value>
+            <Value>65536</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+            <Value>30</Value>
+            <Value>15</Value>
+            <Value>2</Value>
+            <Value>-1</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>3</Value>
+            <Value>16</Value>
+            <Value>30</Value>
+        </Parameter>
+      </Row>
+
+      <Row Name="FirstBitLo">
+        <Parameter Name="ShaderOp.Name">Firstbitlo</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SUnaryIntOp {
+                int input;
+                int output;
+            };
+            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryIntOp l = g_buf[GI];
+                l.output = firstbitlow(l.input);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+            <Value>-2147483648</Value>
+            <Value>-65536</Value>
+            <Value>-8</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>8</Value>
+            <Value>65536</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+            <Value>31</Value>
+            <Value>16</Value>
+            <Value>3</Value>
+            <Value>0</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>3</Value>
+            <Value>16</Value>
+            <Value>0</Value>
+        </Parameter>
+      </Row>
+
+      <Row Name="Countbits">
+        <Parameter Name="ShaderOp.Name">Countbits</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SUnaryIntOp {
+                int input;
+                int output;
+            };
+            RWStructuredBuffer<SUnaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryIntOp l = g_buf[GI];
+                l.output = countbits(l.input);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input">
+            <Value>-2147483648</Value>
+            <Value>-65536</Value>
+            <Value>-8</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>8</Value>
+            <Value>65536</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+            <Value>1</Value>
+            <Value>16</Value>
+            <Value>29</Value>
+            <Value>32</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>1</Value>
+            <Value>1</Value>
+            <Value>31</Value>
+        </Parameter>
+      </Row>
+    </Table>
+
+    <Table Id="UnaryUintOpTable">
+      <ParameterTypes>
+        <ParameterType Name="ShaderOp.Name">String</ParameterType>
+        <ParameterType Name="ShaderOp.Target">String</ParameterType>
+        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+        <ParameterType Name="ShaderOp.Text">String</ParameterType>
+        <ParameterType Name="Validation.Input" Array="true">unsigned int</ParameterType>
+        <ParameterType Name="Validation.Expected" Array="true">unsigned int</ParameterType>
+        <ParameterType Name="Validation.Tolerance">int</ParameterType>
+        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+      </ParameterTypes>
+
+      <Row Name="FirstbitHi">
+        <Parameter Name="ShaderOp.Name">Firstbithi</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SUnaryUintOp {
+                uint input;
+                uint output;
+            };
+            RWStructuredBuffer<SUnaryUintOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SUnaryUintOp l = g_buf[GI];
+                l.output = firstbithigh(l.input);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.NumInput">6</Parameter>
+        <Parameter Name="Validation.Input">
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>8</Value>
+            <Value>65536</Value>
+            <Value>2147483647</Value>
+            <Value>4294967295</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected">
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>3</Value>
+            <Value>16</Value>
+            <Value>30</Value>
+            <Value>31</Value>
+        </Parameter>
+      </Row>
+    </Table>
+
+    <Table Id="BinaryIntOpTable">
+      <ParameterTypes>
+        <ParameterType Name="ShaderOp.Name">String</ParameterType>
+        <ParameterType Name="ShaderOp.Target">String</ParameterType>
+        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+        <ParameterType Name="ShaderOp.Text">String</ParameterType>
+        <ParameterType Name="Validation.NumExpected">int</ParameterType>
+        <ParameterType Name="Validation.Input1" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Input2" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Expected1" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Expected2" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Tolerance">int</ParameterType>
+        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+      </ParameterTypes>
+
+      <Row Name="IMax">
+        <Parameter Name="ShaderOp.Name">IMax</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryIntOp {
+                int input1;
+                int input2;
+                int output1;
+                int output2;
+            };
+            RWStructuredBuffer<SBinaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryIntOp l = g_buf[GI];
+                l.output1 = max(l.input1, l.input2);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.NumExpected">1</Parameter>
+        <Parameter Name="Validation.NumInput">6</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>-2147483648</Value>
+            <Value>-10</Value>
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>-10</Value>
+            <Value>10</Value>
+            <Value>10</Value>
+            <Value>0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>10</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+      <Row Name="IMin">
+        <Parameter Name="ShaderOp.Name">IMin</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryIntOp {
+                int input1;
+                int input2;
+                int output1;
+                int output2;
+            };
+            RWStructuredBuffer<SBinaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryIntOp l = g_buf[GI];
+                l.output1 = min(l.input1, l.input2);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.NumExpected">1</Parameter>
+        <Parameter Name="Validation.NumInput">6</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>-2147483648</Value>
+            <Value>-10</Value>
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>-10</Value>
+            <Value>10</Value>
+            <Value>10</Value>
+            <Value>0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>-2147483648</Value>
+            <Value>-10</Value>
+            <Value>-10</Value>
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+      <Row Name="IMul">
+        <Parameter Name="ShaderOp.Name">Mul</Parameter>
+        <Parameter Name="ShaderOp.Description">integer multiplication. Note that this calls llvm "mul" operation and not IMul</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryIntOp {
+                int input1;
+                int input2;
+                int output1;
+                int output2;
+            };
+            RWStructuredBuffer<SBinaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryIntOp l = g_buf[GI];
+                l.output1 = l.input1 * l.input2;
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.NumExpected">1</Parameter>
+        <Parameter Name="Validation.NumInput">9</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>-2147483648</Value>
+            <Value>-10</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>10</Value>
+            <Value>10000</Value>
+            <Value>2147483647</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>-10</Value>
+            <Value>-10</Value>
+            <Value>10</Value>
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>4</Value>
+            <Value>10001</Value>
+            <Value>0</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>0</Value>
+            <Value>100</Value>
+            <Value>-10</Value>
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>40</Value>
+            <Value>100010000</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+    </Table>
+
+    <Table Id="BinaryUintOpTable">
+      <ParameterTypes>
+        <ParameterType Name="ShaderOp.Name">String</ParameterType>
+        <ParameterType Name="ShaderOp.Target">String</ParameterType>
+        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+        <ParameterType Name="ShaderOp.Text">String</ParameterType>
+        <ParameterType Name="Validation.NumExpected">int</ParameterType>
+        <ParameterType Name="Validation.Input1" Array="true">unsigned int</ParameterType>
+        <ParameterType Name="Validation.Input2" Array="true">unsigned int</ParameterType>
+        <ParameterType Name="Validation.Expected1" Array="true">unsigned int</ParameterType>
+        <ParameterType Name="Validation.Expected2" Array="true">unsigned int</ParameterType>
+        <ParameterType Name="Validation.Tolerance">int</ParameterType>
+        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+      </ParameterTypes>
+      <Row Name="UMax">
+        <Parameter Name="ShaderOp.Name">UMax</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryUintOp {
+                uint input1;
+                uint input2;
+                uint output1;
+                uint output2;
+            };
+            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryUintOp l = g_buf[GI];
+                l.output1 = max(l.input1, l.input2);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.NumExpected">1</Parameter>
+        <Parameter Name="Validation.NumInput">6</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>10000</Value>
+            <Value>2147483647</Value>
+            <Value>4294967295</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>4</Value>
+            <Value>10001</Value>
+            <Value>0</Value>
+            <Value>4294967295</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>10</Value>
+            <Value>10001</Value>
+            <Value>2147483647</Value>
+            <Value>4294967295</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+      <Row Name="UMin">
+        <Parameter Name="ShaderOp.Name">UMin</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryUintOp {
+                uint input1;
+                uint input2;
+                uint output1;
+                uint output2;
+            };
+            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryUintOp l = g_buf[GI];
+                l.output1 = min(l.input1, l.input2);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.NumExpected">1</Parameter>
+        <Parameter Name="Validation.NumInput">6</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>10</Value>
+            <Value>10000</Value>
+            <Value>2147483647</Value>
+            <Value>4294967295</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>4</Value>
+            <Value>10001</Value>
+            <Value>0</Value>
+            <Value>4294967295</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>4</Value>
+            <Value>10000</Value>
+            <Value>0</Value>
+            <Value>4294967295</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+
+      <Row Name="UMul">
+        <Parameter Name="ShaderOp.Name">Mul</Parameter>
+        <Parameter Name="ShaderOp.Description">Unsigned integer multiplication. Note that this compiles to the LLVM "mul" instruction rather than the DXIL UMul operation</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryUintOp {
+                uint input1;
+                uint input2;
+                uint output1;
+                uint output2;
+            };
+            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryUintOp l = g_buf[GI];
+                l.output1 = l.input1 * l.input2;
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.NumExpected">1</Parameter>
+        <Parameter Name="Validation.NumInput">5</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>10</Value>
+            <Value>10000</Value>
+            <Value>2147483647</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>4</Value>
+            <Value>10001</Value>
+            <Value>0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>40</Value>
+            <Value>100010000</Value>
+            <Value>0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+
+      <Row Name="UDiv">
+        <Parameter Name="ShaderOp.Name">UDiv</Parameter>
+        <Parameter Name="ShaderOp.Description">Unsigned integer division and remainder. Note that this compiles to the LLVM "udiv" and "urem" instructions rather than the DXIL UDiv operation</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryUintOp {
+                uint input1;
+                uint input2;
+                uint output1;
+                uint output2;
+            };
+            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryUintOp l = g_buf[GI];
+                l.output1 = l.input1 / l.input2;
+                l.output2 = l.input1 % l.input2;
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Operation.Unsigned">true</Parameter>
+        <Parameter Name="Validation.NumExpected">2</Parameter>
+        <Parameter Name="Validation.NumInput">7</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>1</Value>
+            <Value>1</Value>
+            <Value>10</Value>
+            <Value>10000</Value>
+            <Value>2147483647</Value>
+            <Value>2147483647</Value>
+            <Value>0xffffffff</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>4</Value>
+            <Value>10001</Value>
+            <Value>0</Value>
+            <Value>2147483647</Value>
+            <Value>1</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>0xffffffff</Value>
+            <Value>0</Value>
+            <Value>2</Value>
+            <Value>0</Value>
+            <Value>0xffffffff</Value>
+            <Value>1</Value>
+            <Value>0xffffffff</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected2">
+            <Value>0xffffffff</Value>
+            <Value>1</Value>
+            <Value>2</Value>
+            <Value>10000</Value>
+            <Value>0xffffffff</Value>
+            <Value>0</Value>
+            <Value>0</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+
+       <Row Name="UAddc">
+        <Parameter Name="ShaderOp.Name">UAddc</Parameter>
+        <Parameter Name="ShaderOp.Description">UAddc is called through AddUint64 intrinsic</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct SBinaryUintOp {
+                uint input1;
+                uint input2;
+                uint output1;
+                uint output2;
+            };
+            RWStructuredBuffer<SBinaryUintOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                SBinaryUintOp l = g_buf[GI];
+                uint2 x = uint2(l.input1, l.input2);
+                uint2 y = AddUint64(x, x);
+                l.output1 = y.x;
+                l.output2 = y.y;
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Operation.Unsigned">true</Parameter>
+        <Parameter Name="Validation.NumExpected">2</Parameter>
+        <Parameter Name="Validation.NumInput">6</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>1</Value>
+            <Value>1</Value>
+            <Value>10000</Value>
+            <Value>0x80000000</Value>
+            <Value>0x7fffffff</Value>
+            <Value>0xffffffff</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>0</Value>
+            <Value>256</Value>
+            <Value>10001</Value>
+            <Value>1</Value>
+            <Value>0x7fffffff</Value>
+            <Value>0x7fffffff</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected1">
+            <Value>2</Value>
+            <Value>2</Value>
+            <Value>20000</Value>
+            <Value>0</Value>
+            <Value>0xfffffffe</Value>
+            <Value>0xfffffffe</Value>
+        </Parameter>
+        <Parameter Name="Validation.Expected2">
+            <Value>0</Value>
+            <Value>512</Value>
+            <Value>20002</Value>
+            <Value>3</Value>
+            <Value>0xfffffffe</Value>
+            <Value>0xffffffff</Value>
+        </Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+      </Row>
+    </Table>
+
+    <Table Id="TertiaryIntOpTable">
+      <ParameterTypes>
+        <ParameterType Name="Description">String</ParameterType>
+        <ParameterType Name="ShaderOp.Name">String</ParameterType>
+        <ParameterType Name="ShaderOp.Target">String</ParameterType>
+        <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+        <ParameterType Name="ShaderOp.Text">String</ParameterType>
+        <ParameterType Name="Validation.Type">String</ParameterType>
+        <ParameterType Name="Validation.Tolerance">int</ParameterType>
+        <ParameterType Name="Validation.Input1" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Input2" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Input3" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.Expected" Array="true">int</ParameterType>
+        <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+      </ParameterTypes>
+
+      <Row Name="IMad">
+        <Parameter Name="ShaderOp.Name">IMad</Parameter>
+        <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+        <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+        <Parameter Name="ShaderOp.Text">
+        <![CDATA[
+            struct STertiaryIntOp {
+                int input1;
+                int input2;
+                int input3;
+                int output;
+            };
+            RWStructuredBuffer<STertiaryIntOp> g_buf : register(u0);
+            [numthreads(8,8,1)]
+            void main(uint GI : SV_GroupIndex) {
+                STertiaryIntOp l = g_buf[GI];
+                l.output = mad(l.input1, l.input2, l.input3);
+                g_buf[GI] = l;
+            };
+        ]]>
+        </Parameter>
+        <Parameter Name="Validation.Type">epsilon</Parameter>
+        <Parameter Name="Validation.Tolerance">0</Parameter>
+        <Parameter Name="Validation.Input1">
+            <Value>-2147483647</Value>
+            <Value>-256</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>2</Value>
+            <Value>16</Value>
+            <Value>2147483647</Value>
+            <Value>1</Value>
+            <Value>-1</Value>
+            <Value>1</Value>
+            <Value>10</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input2">
+            <Value>1</Value>
+            <Value>-256</Value>
+            <Value>-1</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>3</Value>
+            <Value>16</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>-1</Value>
+            <Value>10</Value>
+            <Value>100</Value>
+        </Parameter>
+        <Parameter Name="Validation.Input3">
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>0</Value>
+            <Value>1</Value>
+            <Value>3</Value>
+            <Value>1</Value>
+            <Value>255</Value>
+            <Value>2147483646</Value>
+            <Value>-2147483647</Value>
+            <Value>-10</Value>
+            <Value>-2000</Value>
+        </Parameter>
+
+        <Parameter Name="Validation.Expected">
+            <Value>-2147483647</Value>
+            <Value>65536</Value>
+            <Value>1</Value>
+            <Value>0</Value>
+            <Value>2</Value>
+            <Value>9</Value>
+            <Value>257</Value>
+            <Value>255</Value>
+            <Value>2147483647</Value>
+            <Value>-2147483646</Value>
+            <Value>0</Value>
+            <Value>-1000</Value>
+        </Parameter>
+        <Parameter Name="Validation.NumInput">12</Parameter>
+      </Row>
+      </Table>
+
+      <Table Id="TertiaryUintOpTable">
+        <ParameterTypes>
+            <ParameterType Name="Description">String</ParameterType>
+            <ParameterType Name="ShaderOp.Name">String</ParameterType>
+            <ParameterType Name="ShaderOp.Target">String</ParameterType>
+            <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.Type">String</ParameterType>
+            <ParameterType Name="Validation.Tolerance">int</ParameterType>
+            <ParameterType Name="Validation.Input1" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.Input2" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.Input3" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.Expected" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+        </ParameterTypes>
+
+        <Row Name="UMad">
+            <Parameter Name="ShaderOp.Name">UMad</Parameter>
+            <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+            <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+            <Parameter Name="ShaderOp.Text">
+            <![CDATA[
+                struct STertiaryUintOp {
+                    uint input1;
+                    uint input2;
+                    uint input3;
+                    uint output;
+                };
+                RWStructuredBuffer<STertiaryUintOp> g_buf : register(u0);
+                [numthreads(8,8,1)]
+                void main(uint GI : SV_GroupIndex) {
+                    STertiaryUintOp l = g_buf[GI];
+                    l.output = mad(l.input1, l.input2, l.input3);
+                    g_buf[GI] = l;
+                };
+            ]]>
+            </Parameter>
+            <Parameter Name="Validation.Type">epsilon</Parameter>
+            <Parameter Name="Validation.Tolerance">0</Parameter>
+            <Parameter Name="Validation.Input1">
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>16</Value>
+                <Value>2147483647</Value>
+                <Value>0</Value>
+                <Value>10</Value>
+            </Parameter>
+            <Parameter Name="Validation.Input2">
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>16</Value>
+                <Value>1</Value>
+                <Value>0</Value>
+                <Value>10</Value>
+            </Parameter>
+            <Parameter Name="Validation.Input3">
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>15</Value>
+                <Value>0</Value>
+                <Value>10</Value>
+                <Value>10</Value>
+            </Parameter>
+
+            <Parameter Name="Validation.Expected">
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>5</Value>
+                <Value>271</Value>
+                <Value>2147483647</Value>
+                <Value>10</Value>
+                <Value>110</Value>
+            </Parameter>
+            <Parameter Name="Validation.NumInput">7</Parameter>
+      </Row>
+      </Table>
+
+      <Table Id="DotOpTable">
+        <ParameterTypes>
+            <ParameterType Name="Description">String</ParameterType>
+            <ParameterType Name="ShaderOp.Name">String</ParameterType>
+            <ParameterType Name="ShaderOp.Target">String</ParameterType>
+            <ParameterType Name="ShaderOp.EntryPoint">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.Type">String</ParameterType>
+            <ParameterType Name="Validation.Tolerance">double</ParameterType>
+            <ParameterType Name="Validation.Input1" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.Input2" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.dot2" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.dot3" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.dot4" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+        </ParameterTypes>
+
+        <Row Name="Dot">
+            <Parameter Name="ShaderOp.Name">Dot</Parameter>
+            <Parameter Name="ShaderOp.Target">cs_6_0</Parameter>
+            <Parameter Name="ShaderOp.EntryPoint">main</Parameter>
+            <Parameter Name="ShaderOp.Text">
+            <![CDATA[
+                struct SDotOp {
+                   float4 input1;
+                   float4 input2;
+                   float o_dot2;
+                   float o_dot3;
+                   float o_dot4;
+                };
+                RWStructuredBuffer<SDotOp> g_buf : register(u0);
+                [numthreads(8,8,1)]
+                void main(uint GI : SV_GroupIndex) {
+                    SDotOp l = g_buf[GI];
+                    l.o_dot2 = dot(l.input1.xy, l.input2.xy);
+                    l.o_dot3 = dot(l.input1.xyz, l.input2.xyz);
+                    l.o_dot4 = dot(l.input1.xyzw, l.input2.xyzw);
+                    g_buf[GI] = l;
+                };
+            ]]>
+            </Parameter>
+            <Parameter Name="Validation.Type">epsilon</Parameter>
+            <Parameter Name="Validation.Tolerance">0.008</Parameter>
+            <Parameter Name="Validation.Input1">
+                <Value>NaN,NaN,NaN,NaN</Value>
+                <Value>-Inf,-Inf,-Inf,-Inf</Value>
+                <Value>-denorm,-denorm,-denorm,-denorm</Value>
+                <Value>-0,-0,-0,-0</Value>
+                <Value>0,0,0,0</Value>
+                <Value>denorm,denorm,denorm,denorm</Value>
+                <Value>Inf,Inf,Inf,Inf</Value>
+                <Value>1,1,1,1</Value>
+                <Value>-10,0,0,10</Value>
+                <Value>Inf,Inf,Inf,-Inf</Value>
+            </Parameter>
+            <Parameter Name="Validation.Input2">
+                <Value>NaN,NaN,NaN,NaN</Value>
+                <Value>-Inf,-Inf,-Inf,-Inf</Value>
+                <Value>-denorm,-denorm,-denorm,-denorm</Value>
+                <Value>-0,-0,-0,-0</Value>
+                <Value>0,0,0,0</Value>
+                <Value>denorm,denorm,denorm,denorm</Value>
+                <Value>Inf,Inf,Inf,Inf</Value>
+                <Value>1,1,1,1</Value>
+                <Value>10,0,0,10</Value>
+                <Value>Inf,Inf,Inf,Inf</Value>
+            </Parameter>
+            <Parameter Name="Validation.dot2">
+                <Value>NaN</Value>
+                <Value>Inf</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>Inf</Value>
+                <Value>2</Value>
+                <Value>-100</Value>
+                <Value>Inf</Value>
+            </Parameter>
+            <Parameter Name="Validation.dot3">
+                <Value>NaN</Value>
+                <Value>Inf</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>Inf</Value>
+                <Value>3</Value>
+                <Value>-100</Value>
+                <Value>Inf</Value>
+            </Parameter>
+            <Parameter Name="Validation.dot4">
+                <Value>NaN</Value>
+                <Value>Inf</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>0</Value>
+                <Value>Inf</Value>
+                <Value>4</Value>
+                <Value>0</Value>
+                <Value>NaN</Value>
+            </Parameter>
+            <Parameter Name="Validation.NumInput">10</Parameter>
+        </Row>
+      </Table>
+
+      <Table Id="MSad4Table">
+        <ParameterTypes>
+            <ParameterType Name="Description">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.Tolerance">int</ParameterType>
+            <ParameterType Name="Validation.Reference" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.Source" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.Accum" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.Expected" Array="true">String</ParameterType>
+            <ParameterType Name="Validation.NumInput">unsigned int</ParameterType>
+        </ParameterTypes>
+        <Row Name="MSad4">
+            <Parameter Name="Description">Msad4 intrinsic calls both Bfi and Msad</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct SMsad4 {
+                        uint ref;
+                        uint2 source;
+                        uint4 accum;
+                        uint4 result;
+                    };
+                    RWStructuredBuffer<SMsad4> g_buf : register(u0);
+                    [numthreads(8,8,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        SMsad4 l = g_buf[GI];
+                        l.result = msad4(l.ref, l.source, l.accum);
+                        g_buf[GI] = l;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.Tolerance">0</Parameter>
+            <Parameter Name="Validation.NumInput">4</Parameter>
+            <Parameter Name="Validation.Reference">
+                <Value>0xA100B2C3</Value>
+                <Value>0x00000000</Value>
+                <Value>0xFFFF01C1</Value>
+                <Value>0xFFFFFFFF</Value>
+            </Parameter>
+            <Parameter Name="Validation.Source">
+                <Value>0xD7B0C372, 0x4F57C2A3</Value>
+                <Value>0xFFFFFFFF, 0x00000000</Value>
+                <Value>0x38A03AEF, 0x38194DA3</Value>
+                <Value>0xFFFFFFFF, 0x00000000</Value>
+            </Parameter>
+            <Parameter Name="Validation.Accum">
+                <Value>1,2,3,4</Value>
+                <Value>1,2,3,4</Value>
+                <Value>0,0,0,0</Value>
+                <Value>10,10,10,10</Value>
+            </Parameter>
+            <Parameter Name="Validation.Expected">
+                <Value>153,6,92,113</Value>
+                <Value>1,2,3,4</Value>
+                <Value>397,585,358,707</Value>
+                <Value>10,265,520,775</Value>
+            </Parameter>
+        </Row>
+       </Table>
+
+       <Table Id="WaveIntrinsicsActiveIntTable">
+        <ParameterTypes>
+            <ParameterType Name="ShaderOp.Name">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet1" Array="true">int</ParameterType>
+            <ParameterType Name="Validation.InputSet2" Array="true">int</ParameterType>
+            <ParameterType Name="Validation.InputSet3" Array="true">int</ParameterType>
+            <ParameterType Name="Validation.InputSet4" Array="true">int</ParameterType>
+        </ParameterTypes>
+
+        <Row Name="WaveActiveSum">
+            <Parameter Name="ShaderOp.Name">WaveActiveSum</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveSum(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveSum(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>8</Value>
+                <Value>-64</Value>
+            </Parameter>
+        </Row>
+
+         <Row Name="WaveActiveProduct">
+            <Parameter Name="ShaderOp.Name">WaveActiveProduct</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveProduct(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveProduct(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>-64</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveCountBits">
+            <Parameter Name="ShaderOp.Name">WaveActiveCountBits</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveCountBits(pts.input > 3);
+                        }
+                        else {
+                            pts.output = WaveActiveCountBits(pts.input > 3);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">4</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>10</Value>
+                <Value>-4</Value>
+                <Value>-64</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>-100</Value>
+                <Value>-1000</Value>
+                <Value>300</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveMax">
+            <Parameter Name="ShaderOp.Name">WaveActiveMax</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveMax(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveMax(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">4</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>10</Value>
+                <Value>-4</Value>
+                <Value>-64</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>-100</Value>
+                <Value>-1000</Value>
+                <Value>300</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveMin">
+            <Parameter Name="ShaderOp.Name">WaveActiveMin</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveMin(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveMin(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">4</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+                <Value>6</Value>
+                <Value>7</Value>
+                <Value>8</Value>
+                <Value>9</Value>
+                <Value>10</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>10</Value>
+                <Value>-4</Value>
+                <Value>-64</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>-100</Value>
+                <Value>-1000</Value>
+                <Value>300</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveAllEqual">
+            <Parameter Name="ShaderOp.Name">WaveActiveAllEqual</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveAllEqual(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveAllEqual(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>1</Value>
+                <Value>1</Value>
+                <Value>1</Value>
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>3</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>-10</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveAnyTrue">
+            <Parameter Name="ShaderOp.Name">WaveActiveAnyTrue</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        bool input;
+                        bool output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveAnyTrue(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveAnyTrue(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>0</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveAllTrue">
+            <Parameter Name="ShaderOp.Name">WaveActiveAllTrue</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        bool input;
+                        bool output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveAllTrue(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveAllTrue(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+                <Value>0</Value>
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+            </Parameter>
+        </Row>
+      </Table>
+
+      <Table Id="WaveIntrinsicsActiveUintTable">
+        <ParameterTypes>
+            <ParameterType Name="ShaderOp.Name">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet1" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet2" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet3" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet4" Array="true">unsigned int</ParameterType>
+        </ParameterTypes>
+
+       <Row Name="WaveActiveUSum">
+            <Parameter Name="ShaderOp.Name">WaveActiveUSum</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveSum(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveSum(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>8</Value>
+                <Value>64</Value>
+            </Parameter>
+        </Row>
+
+         <Row Name="WaveActiveUProduct">
+            <Parameter Name="ShaderOp.Name">WaveActiveUProduct</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveProduct(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveProduct(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>64</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveUMax">
+            <Parameter Name="ShaderOp.Name">WaveActiveUMax</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveMax(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveMax(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>10</Value>
+                <Value>4</Value>
+                <Value>64</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveUMin">
+            <Parameter Name="ShaderOp.Name">WaveActiveUMin</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveMin(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveMin(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+                <Value>6</Value>
+                <Value>7</Value>
+                <Value>8</Value>
+                <Value>9</Value>
+                <Value>10</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>10</Value>
+                <Value>4</Value>
+                <Value>64</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveBitOr">
+            <Parameter Name="ShaderOp.Name">WaveActiveBitOr</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveBitOr(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveBitOr(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">4</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>0xe0000000</Value>
+                <Value>0x0d000000</Value>
+                <Value>0x00b00000</Value>
+                <Value>0x00070000</Value>
+                <Value>0x0000e000</Value>
+                <Value>0x00000d00</Value>
+                <Value>0x000000b0</Value>
+                <Value>0x00000007</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0xedb7edb7</Value>
+                <Value>0xdb7edb7e</Value>
+                <Value>0xb7edb7ed</Value>
+                <Value>0x7edb7edb</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>0x12481248</Value>
+                <Value>0x24812481</Value>
+                <Value>0x48124812</Value>
+                <Value>0x81248124</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>0x00000000</Value>
+                <Value>0xffffffff</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveBitAnd">
+            <Parameter Name="ShaderOp.Name">WaveActiveBitAnd</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveBitAnd(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveBitAnd(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">4</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>0xefffffff</Value>
+                <Value>0xfdffffff</Value>
+                <Value>0xffbfffff</Value>
+                <Value>0xfff7ffff</Value>
+                <Value>0xffffefff</Value>
+                <Value>0xfffffdff</Value>
+                <Value>0xffffffbf</Value>
+                <Value>0xfffffff7</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0xedb7edb7</Value>
+                <Value>0xdb7edb7e</Value>
+                <Value>0xb7edb7ed</Value>
+                <Value>0x7edb7edb</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>0x12481248</Value>
+                <Value>0x24812481</Value>
+                <Value>0x48124812</Value>
+                <Value>0x81248124</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>0x00000000</Value>
+                <Value>0xffffffff</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WaveActiveBitXor">
+            <Parameter Name="ShaderOp.Name">WaveActiveBitXor</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WaveActiveBitXor(pts.input);
+                        }
+                        else {
+                            pts.output = WaveActiveBitXor(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>0xe0000000</Value>
+                <Value>0x0d000000</Value>
+                <Value>0x00b00000</Value>
+                <Value>0x00070000</Value>
+                <Value>0x0000e000</Value>
+                <Value>0x00000d00</Value>
+                <Value>0x000000b0</Value>
+                <Value>0x00000007</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0xedb7edb7</Value>
+                <Value>0xdb7edb7e</Value>
+                <Value>0xb7edb7ed</Value>
+                <Value>0x7edb7edb</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>0x12481248</Value>
+                <Value>0x24812481</Value>
+                <Value>0x48124812</Value>
+                <Value>0x81248124</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>0x00000000</Value>
+                <Value>0xffffffff</Value>
+            </Parameter>
+        </Row>
+    </Table>
+
+    <Table Id="WaveIntrinsicsPrefixIntTable">
+        <ParameterTypes>
+            <ParameterType Name="ShaderOp.Name">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet1" Array="true">int</ParameterType>
+            <ParameterType Name="Validation.InputSet2" Array="true">int</ParameterType>
+            <ParameterType Name="Validation.InputSet3" Array="true">int</ParameterType>
+            <ParameterType Name="Validation.InputSet4" Array="true">int</ParameterType>
+        </ParameterTypes>
+        <Row Name="WavePrefixCountBits">
+            <Parameter Name="ShaderOp.Name">WavePrefixCountBits</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WavePrefixCountBits(pts.input > 3);
+                        }
+                        else {
+                            pts.output = WavePrefixCountBits(pts.input > 3);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">4</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>10</Value>
+                <Value>-4</Value>
+                <Value>-64</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>-100</Value>
+                <Value>-1000</Value>
+                <Value>300</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WavePrefixSum">
+            <Parameter Name="ShaderOp.Name">WavePrefixSum</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WavePrefixSum(pts.input);
+                        }
+                        else {
+                            pts.output = WavePrefixSum(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>-64</Value>
+                <Value>128</Value>
+            </Parameter>
+        </Row>
+
+         <Row Name="WavePrefixProduct">
+            <Parameter Name="ShaderOp.Name">WavePrefixProduct</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        int input;
+                        int output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WavePrefixProduct(pts.input);
+                        }
+                        else {
+                            pts.output = WavePrefixProduct(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>-64</Value>
+                <Value>128</Value>
+            </Parameter>
+        </Row>
+    </Table>
+
+    <Table Id="WaveIntrinsicsPrefixUintTable">
+        <ParameterTypes>
+            <ParameterType Name="ShaderOp.Name">String</ParameterType>
+            <ParameterType Name="ShaderOp.Text">String</ParameterType>
+            <ParameterType Name="Validation.NumInputSet">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet1" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet2" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet3" Array="true">unsigned int</ParameterType>
+            <ParameterType Name="Validation.InputSet4" Array="true">unsigned int</ParameterType>
+        </ParameterTypes>
+        <Row Name="WavePrefixCountBits">
+            <Parameter Name="ShaderOp.Name">WavePrefixCountBits</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WavePrefixCountBits(pts.input > 3);
+                        }
+                        else {
+                            pts.output = WavePrefixCountBits(pts.input > 3);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>10</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet4">
+                <Value>100</Value>
+                <Value>300</Value>
+            </Parameter>
+        </Row>
+
+        <Row Name="WavePrefixSum">
+            <Parameter Name="ShaderOp.Name">WavePrefixSum</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WavePrefixSum(pts.input);
+                        }
+                        else {
+                            pts.output = WavePrefixSum(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>128</Value>
+            </Parameter>
+        </Row>
+
+         <Row Name="WavePrefixProduct">
+            <Parameter Name="ShaderOp.Name">WavePrefixProduct</Parameter>
+            <Parameter Name="ShaderOp.Text">
+                <![CDATA[
+                    struct PerThreadData {
+                        int firstLaneId;
+                        int mask;
+                        uint input;
+                        uint output;
+                    };
+                    RWStructuredBuffer<PerThreadData> g_sb : register(u0);
+                    [numthreads(8,12,1)]
+                    void main(uint GI : SV_GroupIndex) {
+                        PerThreadData pts = g_sb[GI];
+                        pts.firstLaneId = WaveReadLaneFirst(GI);
+                        if (pts.mask != 0) {
+                            pts.output = WavePrefixProduct(pts.input);
+                        }
+                        else {
+                            pts.output = WavePrefixProduct(pts.input);
+                        }
+                        g_sb[GI] = pts;
+                    }
+                ]]>
+            </Parameter>
+            <Parameter Name="Validation.NumInputSet">3</Parameter>
+            <Parameter Name="Validation.InputSet1">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>3</Value>
+                <Value>4</Value>
+                <Value>5</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet2">
+                <Value>0</Value>
+                <Value>1</Value>
+            </Parameter>
+            <Parameter Name="Validation.InputSet3">
+                <Value>1</Value>
+                <Value>2</Value>
+                <Value>4</Value>
+                <Value>128</Value>
+            </Parameter>
+        </Row>
+    </Table>
+</Data>
diff --git a/tools/clang/unittests/HLSL/clang-hlsl-tests.rc b/tools/clang/unittests/HLSL/clang-hlsl-tests.rc
index 9e1550781..6f4659910 100644
--- a/tools/clang/unittests/HLSL/clang-hlsl-tests.rc
+++ b/tools/clang/unittests/HLSL/clang-hlsl-tests.rc
@@ -1,3 +1,3 @@
-#include <windows.h>
-
+#include <windows.h>
+
 ShaderOpArithTable.xml DATASOURCE_XML "ShaderOpArithTable.xml"
\ No newline at end of file